1 /*-------------------------------------------------------------------------
2 *
3 * costsize.c
4 * Routines to compute (and set) relation sizes and path costs
5 *
6 * Path costs are measured in arbitrary units established by these basic
7 * parameters:
8 *
9 * seq_page_cost Cost of a sequential page fetch
10 * random_page_cost Cost of a non-sequential page fetch
11 * cpu_tuple_cost Cost of typical CPU time to process a tuple
12 * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple
13 * cpu_operator_cost Cost of CPU time to execute an operator or function
14 * parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15 * parallel_setup_cost Cost of setting up shared memory for parallelism
16 *
17 * We expect that the kernel will typically do some amount of read-ahead
18 * optimization; this in conjunction with seek costs means that seq_page_cost
19 * is normally considerably less than random_page_cost. (However, if the
20 * database is fully cached in RAM, it is reasonable to set them equal.)
21 *
22 * We also use a rough estimate "effective_cache_size" of the number of
23 * disk pages in Postgres + OS-level disk cache. (We can't simply use
24 * NBuffers for this purpose because that would ignore the effects of
25 * the kernel's disk cache.)
26 *
27 * Obviously, taking constants for these values is an oversimplification,
28 * but it's tough enough to get any useful estimates even at this level of
29 * detail. Note that all of these parameters are user-settable, in case
30 * the default values are drastically off for a particular platform.
31 *
32 * seq_page_cost and random_page_cost can also be overridden for an individual
33 * tablespace, in case some data is on a fast disk and other data is on a slow
34 * disk. Per-tablespace overrides never apply to temporary work files such as
35 * an external sort or a materialize node that overflows work_mem.
36 *
37 * We compute two separate costs for each path:
38 * total_cost: total estimated cost to fetch all tuples
39 * startup_cost: cost that is expended before first tuple is fetched
40 * In some scenarios, such as when there is a LIMIT or we are implementing
41 * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42 * path's result. A caller can estimate the cost of fetching a partial
43 * result by interpolating between startup_cost and total_cost. In detail:
44 * actual_cost = startup_cost +
45 * (total_cost - startup_cost) * tuples_to_fetch / path->rows;
46 * Note that a base relation's rows count (and, by extension, plan_rows for
47 * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48 * that this equation works properly. (Note: while path->rows is never zero
49 * for ordinary relations, it is zero for paths for provably-empty relations,
50 * so beware of division-by-zero.) The LIMIT is applied as a top-level
51 * plan node.
52 *
53 * For largely historical reasons, most of the routines in this module use
54 * the passed result Path only to store their results (rows, startup_cost and
55 * total_cost) into. All the input data they need is passed as separate
56 * parameters, even though much of it could be extracted from the Path.
57 * An exception is made for the cost_XXXjoin() routines, which expect all
58 * the other fields of the passed XXXPath to be filled in, and similarly
59 * cost_index() assumes the passed IndexPath is valid except for its output
60 * values.
61 *
62 *
63 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
64 * Portions Copyright (c) 1994, Regents of the University of California
65 *
66 * IDENTIFICATION
67 * src/backend/optimizer/path/costsize.c
68 *
69 *-------------------------------------------------------------------------
70 */
71
72 #include "postgres.h"
73
74 #include <math.h>
75
76 #include "access/amapi.h"
77 #include "access/htup_details.h"
78 #include "access/tsmapi.h"
79 #include "executor/executor.h"
80 #include "executor/nodeHash.h"
81 #include "miscadmin.h"
82 #include "nodes/makefuncs.h"
83 #include "nodes/nodeFuncs.h"
84 #include "optimizer/clauses.h"
85 #include "optimizer/cost.h"
86 #include "optimizer/optimizer.h"
87 #include "optimizer/pathnode.h"
88 #include "optimizer/paths.h"
89 #include "optimizer/placeholder.h"
90 #include "optimizer/plancat.h"
91 #include "optimizer/planmain.h"
92 #include "optimizer/restrictinfo.h"
93 #include "parser/parsetree.h"
94 #include "utils/lsyscache.h"
95 #include "utils/selfuncs.h"
96 #include "utils/spccache.h"
97 #include "utils/tuplesort.h"
98
99
/* log base 2 of x, computed as ln(x) / ln(2); 0.693147... is ln(2) */
#define LOG2(x) (log(x) / 0.693147180559945)

/*
 * Append and MergeAppend nodes are less expensive than some other operations
 * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
 * per-tuple cost as cpu_tuple_cost multiplied by this value.
 */
#define APPEND_CPU_COST_MULTIPLIER 0.5


/* Basic per-operation cost parameters; all are user-settable GUCs */
double seq_page_cost = DEFAULT_SEQ_PAGE_COST;
double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
double parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
double parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;

/* rough estimate of cache space (Postgres + OS) in disk pages; see file header */
int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;

/*
 * Added to a path's cost to discourage (but not absolutely prevent) the
 * planner from choosing it when the corresponding enable_XXX GUC is off.
 */
Cost disable_cost = 1.0e10;

int max_parallel_workers_per_gather = 2;

/* enable_XXX GUCs: when false, disable_cost is charged to the relevant paths */
bool enable_seqscan = true;
bool enable_indexscan = true;
bool enable_indexonlyscan = true;
bool enable_bitmapscan = true;
bool enable_tidscan = true;
bool enable_sort = true;
bool enable_hashagg = true;
bool enable_nestloop = true;
bool enable_material = true;
bool enable_mergejoin = true;
bool enable_hashjoin = true;
bool enable_gathermerge = true;
bool enable_partitionwise_join = false;
bool enable_partitionwise_aggregate = false;
bool enable_parallel_append = true;
bool enable_parallel_hash = true;
bool enable_partition_pruning = true;

/* working state for cost_qual_eval() and its walker */
typedef struct
{
	PlannerInfo *root;
	QualCost total;				/* accumulated qual evaluation cost */
} cost_qual_eval_context;
147
148 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
149 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
150 RestrictInfo *rinfo,
151 PathKey *pathkey);
152 static void cost_rescan(PlannerInfo *root, Path *path,
153 Cost *rescan_startup_cost, Cost *rescan_total_cost);
154 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
155 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
156 ParamPathInfo *param_info,
157 QualCost *qpqual_cost);
158 static bool has_indexed_join_quals(NestPath *joinpath);
159 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
160 List *quals);
161 static double calc_joinrel_size_estimate(PlannerInfo *root,
162 RelOptInfo *joinrel,
163 RelOptInfo *outer_rel,
164 RelOptInfo *inner_rel,
165 double outer_rows,
166 double inner_rows,
167 SpecialJoinInfo *sjinfo,
168 List *restrictlist);
169 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
170 Relids outer_relids,
171 Relids inner_relids,
172 SpecialJoinInfo *sjinfo,
173 List **restrictlist);
174 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
175 int parallel_workers);
176 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
177 static double relation_byte_size(double tuples, int width);
178 static double page_size(double tuples, int width);
179 static double get_parallel_divisor(Path *path);
180
181
182 /*
183 * clamp_row_est
184 * Force a row-count estimate to a sane value.
185 */
186 double
clamp_row_est(double nrows)187 clamp_row_est(double nrows)
188 {
189 /*
190 * Force estimate to be at least one row, to make explain output look
191 * better and to avoid possible divide-by-zero when interpolating costs.
192 * Make it an integer, too.
193 */
194 if (nrows <= 1.0)
195 nrows = 1.0;
196 else
197 nrows = rint(nrows);
198
199 return nrows;
200 }
201
202
203 /*
204 * cost_seqscan
205 * Determines and returns the cost of scanning a relation sequentially.
206 *
207 * 'baserel' is the relation to be scanned
208 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
209 */
210 void
cost_seqscan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)211 cost_seqscan(Path *path, PlannerInfo *root,
212 RelOptInfo *baserel, ParamPathInfo *param_info)
213 {
214 Cost startup_cost = 0;
215 Cost cpu_run_cost;
216 Cost disk_run_cost;
217 double spc_seq_page_cost;
218 QualCost qpqual_cost;
219 Cost cpu_per_tuple;
220
221 /* Should only be applied to base relations */
222 Assert(baserel->relid > 0);
223 Assert(baserel->rtekind == RTE_RELATION);
224
225 /* Mark the path with the correct row estimate */
226 if (param_info)
227 path->rows = param_info->ppi_rows;
228 else
229 path->rows = baserel->rows;
230
231 if (!enable_seqscan)
232 startup_cost += disable_cost;
233
234 /* fetch estimated page cost for tablespace containing table */
235 get_tablespace_page_costs(baserel->reltablespace,
236 NULL,
237 &spc_seq_page_cost);
238
239 /*
240 * disk costs
241 */
242 disk_run_cost = spc_seq_page_cost * baserel->pages;
243
244 /* CPU costs */
245 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
246
247 startup_cost += qpqual_cost.startup;
248 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
249 cpu_run_cost = cpu_per_tuple * baserel->tuples;
250 /* tlist eval costs are paid per output row, not per tuple scanned */
251 startup_cost += path->pathtarget->cost.startup;
252 cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
253
254 /* Adjust costing for parallelism, if used. */
255 if (path->parallel_workers > 0)
256 {
257 double parallel_divisor = get_parallel_divisor(path);
258
259 /* The CPU cost is divided among all the workers. */
260 cpu_run_cost /= parallel_divisor;
261
262 /*
263 * It may be possible to amortize some of the I/O cost, but probably
264 * not very much, because most operating systems already do aggressive
265 * prefetching. For now, we assume that the disk run cost can't be
266 * amortized at all.
267 */
268
269 /*
270 * In the case of a parallel plan, the row count needs to represent
271 * the number of tuples processed per worker.
272 */
273 path->rows = clamp_row_est(path->rows / parallel_divisor);
274 }
275
276 path->startup_cost = startup_cost;
277 path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
278 }
279
280 /*
281 * cost_samplescan
282 * Determines and returns the cost of scanning a relation using sampling.
283 *
284 * 'baserel' is the relation to be scanned
285 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
286 */
287 void
cost_samplescan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)288 cost_samplescan(Path *path, PlannerInfo *root,
289 RelOptInfo *baserel, ParamPathInfo *param_info)
290 {
291 Cost startup_cost = 0;
292 Cost run_cost = 0;
293 RangeTblEntry *rte;
294 TableSampleClause *tsc;
295 TsmRoutine *tsm;
296 double spc_seq_page_cost,
297 spc_random_page_cost,
298 spc_page_cost;
299 QualCost qpqual_cost;
300 Cost cpu_per_tuple;
301
302 /* Should only be applied to base relations with tablesample clauses */
303 Assert(baserel->relid > 0);
304 rte = planner_rt_fetch(baserel->relid, root);
305 Assert(rte->rtekind == RTE_RELATION);
306 tsc = rte->tablesample;
307 Assert(tsc != NULL);
308 tsm = GetTsmRoutine(tsc->tsmhandler);
309
310 /* Mark the path with the correct row estimate */
311 if (param_info)
312 path->rows = param_info->ppi_rows;
313 else
314 path->rows = baserel->rows;
315
316 /* fetch estimated page cost for tablespace containing table */
317 get_tablespace_page_costs(baserel->reltablespace,
318 &spc_random_page_cost,
319 &spc_seq_page_cost);
320
321 /* if NextSampleBlock is used, assume random access, else sequential */
322 spc_page_cost = (tsm->NextSampleBlock != NULL) ?
323 spc_random_page_cost : spc_seq_page_cost;
324
325 /*
326 * disk costs (recall that baserel->pages has already been set to the
327 * number of pages the sampling method will visit)
328 */
329 run_cost += spc_page_cost * baserel->pages;
330
331 /*
332 * CPU costs (recall that baserel->tuples has already been set to the
333 * number of tuples the sampling method will select). Note that we ignore
334 * execution cost of the TABLESAMPLE parameter expressions; they will be
335 * evaluated only once per scan, and in most usages they'll likely be
336 * simple constants anyway. We also don't charge anything for the
337 * calculations the sampling method might do internally.
338 */
339 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
340
341 startup_cost += qpqual_cost.startup;
342 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
343 run_cost += cpu_per_tuple * baserel->tuples;
344 /* tlist eval costs are paid per output row, not per tuple scanned */
345 startup_cost += path->pathtarget->cost.startup;
346 run_cost += path->pathtarget->cost.per_tuple * path->rows;
347
348 path->startup_cost = startup_cost;
349 path->total_cost = startup_cost + run_cost;
350 }
351
352 /*
353 * cost_gather
354 * Determines and returns the cost of gather path.
355 *
356 * 'rel' is the relation to be operated upon
357 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
358 * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
359 * both 'rel' and 'param_info'. This is useful when the path doesn't exactly
360 * correspond to any particular RelOptInfo.
361 */
362 void
cost_gather(GatherPath * path,PlannerInfo * root,RelOptInfo * rel,ParamPathInfo * param_info,double * rows)363 cost_gather(GatherPath *path, PlannerInfo *root,
364 RelOptInfo *rel, ParamPathInfo *param_info,
365 double *rows)
366 {
367 Cost startup_cost = 0;
368 Cost run_cost = 0;
369
370 /* Mark the path with the correct row estimate */
371 if (rows)
372 path->path.rows = *rows;
373 else if (param_info)
374 path->path.rows = param_info->ppi_rows;
375 else
376 path->path.rows = rel->rows;
377
378 startup_cost = path->subpath->startup_cost;
379
380 run_cost = path->subpath->total_cost - path->subpath->startup_cost;
381
382 /* Parallel setup and communication cost. */
383 startup_cost += parallel_setup_cost;
384 run_cost += parallel_tuple_cost * path->path.rows;
385
386 path->path.startup_cost = startup_cost;
387 path->path.total_cost = (startup_cost + run_cost);
388 }
389
390 /*
391 * cost_gather_merge
392 * Determines and returns the cost of gather merge path.
393 *
394 * GatherMerge merges several pre-sorted input streams, using a heap that at
395 * any given instant holds the next tuple from each stream. If there are N
396 * streams, we need about N*log2(N) tuple comparisons to construct the heap at
397 * startup, and then for each output tuple, about log2(N) comparisons to
398 * replace the top heap entry with the next tuple from the same stream.
399 */
400 void
cost_gather_merge(GatherMergePath * path,PlannerInfo * root,RelOptInfo * rel,ParamPathInfo * param_info,Cost input_startup_cost,Cost input_total_cost,double * rows)401 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
402 RelOptInfo *rel, ParamPathInfo *param_info,
403 Cost input_startup_cost, Cost input_total_cost,
404 double *rows)
405 {
406 Cost startup_cost = 0;
407 Cost run_cost = 0;
408 Cost comparison_cost;
409 double N;
410 double logN;
411
412 /* Mark the path with the correct row estimate */
413 if (rows)
414 path->path.rows = *rows;
415 else if (param_info)
416 path->path.rows = param_info->ppi_rows;
417 else
418 path->path.rows = rel->rows;
419
420 if (!enable_gathermerge)
421 startup_cost += disable_cost;
422
423 /*
424 * Add one to the number of workers to account for the leader. This might
425 * be overgenerous since the leader will do less work than other workers
426 * in typical cases, but we'll go with it for now.
427 */
428 Assert(path->num_workers > 0);
429 N = (double) path->num_workers + 1;
430 logN = LOG2(N);
431
432 /* Assumed cost per tuple comparison */
433 comparison_cost = 2.0 * cpu_operator_cost;
434
435 /* Heap creation cost */
436 startup_cost += comparison_cost * N * logN;
437
438 /* Per-tuple heap maintenance cost */
439 run_cost += path->path.rows * comparison_cost * logN;
440
441 /* small cost for heap management, like cost_merge_append */
442 run_cost += cpu_operator_cost * path->path.rows;
443
444 /*
445 * Parallel setup and communication cost. Since Gather Merge, unlike
446 * Gather, requires us to block until a tuple is available from every
447 * worker, we bump the IPC cost up a little bit as compared with Gather.
448 * For lack of a better idea, charge an extra 5%.
449 */
450 startup_cost += parallel_setup_cost;
451 run_cost += parallel_tuple_cost * path->path.rows * 1.05;
452
453 path->path.startup_cost = startup_cost + input_startup_cost;
454 path->path.total_cost = (startup_cost + run_cost + input_total_cost);
455 }
456
/*
 * cost_index
 *	  Determines and returns the cost of scanning a relation using an index.
 *
 * 'path' describes the indexscan under consideration, and is complete
 *		except for the fields to be set by this routine
 * 'loop_count' is the number of repetitions of the indexscan to factor into
 *		estimates of caching behavior
 *
 * In addition to rows, startup_cost and total_cost, cost_index() sets the
 * path's indextotalcost and indexselectivity fields.  These values will be
 * needed if the IndexPath is used in a BitmapIndexScan.
 *
 * NOTE: path->indexquals must contain only clauses usable as index
 * restrictions.  Any additional quals evaluated as qpquals may reduce the
 * number of returned tuples, but they won't reduce the number of tuples
 * we have to fetch from the table, so they don't reduce the scan cost.
 */
void
cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
		   bool partial_path)
{
	IndexOptInfo *index = path->indexinfo;
	RelOptInfo *baserel = index->rel;
	bool		indexonly = (path->path.pathtype == T_IndexOnlyScan);
	amcostestimate_function amcostestimate;
	List	   *qpquals;		/* quals to be checked at the heap tuple */
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		cpu_run_cost = 0;
	Cost		indexStartupCost;
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	double		indexCorrelation,
				csquared;
	double		spc_seq_page_cost,
				spc_random_page_cost;
	Cost		min_IO_cost,	/* heap I/O if perfectly correlated */
				max_IO_cost;	/* heap I/O if perfectly uncorrelated */
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	double		tuples_fetched;
	double		pages_fetched;
	double		rand_heap_pages;	/* heap pages fetched at random, for
									 * compute_parallel_worker */
	double		index_pages;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo) &&
		   IsA(index, IndexOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/*
	 * Mark the path with the correct row estimate, and identify which quals
	 * will need to be enforced as qpquals.  We need not check any quals that
	 * are implied by the index's predicate, so we can use indrestrictinfo not
	 * baserestrictinfo as the list of relevant restriction clauses for the
	 * rel.
	 */
	if (path->path.param_info)
	{
		path->path.rows = path->path.param_info->ppi_rows;
		/* qpquals come from the rel's restriction clauses and ppi_clauses */
		qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
														  path->indexclauses),
							  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
														  path->indexclauses));
	}
	else
	{
		path->path.rows = baserel->rows;
		/* qpquals come from just the rel's restriction clauses */
		qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
											  path->indexclauses);
	}

	if (!enable_indexscan)
		startup_cost += disable_cost;
	/* we don't need to check enable_indexonlyscan; indxpath.c does that */

	/*
	 * Call index-access-method-specific code to estimate the processing cost
	 * for scanning the index, as well as the selectivity of the index (ie,
	 * the fraction of main-table tuples we will have to retrieve) and its
	 * correlation to the main-table tuple order.  We need a cast here because
	 * pathnodes.h uses a weak function type to avoid including amapi.h.
	 */
	amcostestimate = (amcostestimate_function) index->amcostestimate;
	amcostestimate(root, path, loop_count,
				   &indexStartupCost, &indexTotalCost,
				   &indexSelectivity, &indexCorrelation,
				   &index_pages);

	/*
	 * Save amcostestimate's results for possible use in bitmap scan planning.
	 * We don't bother to save indexStartupCost or indexCorrelation, because a
	 * bitmap scan doesn't care about either.
	 */
	path->indextotalcost = indexTotalCost;
	path->indexselectivity = indexSelectivity;

	/* all costs for touching index itself included here */
	startup_cost += indexStartupCost;
	run_cost += indexTotalCost - indexStartupCost;

	/* estimate number of main-table tuples fetched */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	/* fetch estimated page costs for tablespace containing table */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  &spc_seq_page_cost);

	/*----------
	 * Estimate number of main-table pages fetched, and compute I/O cost.
	 *
	 * When the index ordering is uncorrelated with the table ordering,
	 * we use an approximation proposed by Mackert and Lohman (see
	 * index_pages_fetched() for details) to compute the number of pages
	 * fetched, and then charge spc_random_page_cost per page fetched.
	 *
	 * When the index ordering is exactly correlated with the table ordering
	 * (just after a CLUSTER, for example), the number of pages fetched should
	 * be exactly selectivity * table_size.  What's more, all but the first
	 * will be sequential fetches, not the random fetches that occur in the
	 * uncorrelated case.  So if the number of pages is more than 1, we
	 * ought to charge
	 *		spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
	 * For partially-correlated indexes, we ought to charge somewhere between
	 * these two estimates.  We currently interpolate linearly between the
	 * estimates based on the correlation squared (XXX is that appropriate?).
	 *
	 * If it's an index-only scan, then we will not need to fetch any heap
	 * pages for which the visibility map shows all tuples are visible.
	 * Hence, reduce the estimated number of heap fetches accordingly.
	 * We use the measured fraction of the entire heap that is all-visible,
	 * which might not be particularly relevant to the subset of the heap
	 * that this query will fetch; but it's not clear how to do better.
	 *----------
	 */
	if (loop_count > 1)
	{
		/*
		 * For repeated indexscans, the appropriate estimate for the
		 * uncorrelated case is to scale up the number of tuples fetched in
		 * the Mackert and Lohman formula by the number of scans, so that we
		 * estimate the number of pages fetched by all the scans; then
		 * pro-rate the costs for one scan.  In this case we assume all the
		 * fetches are random accesses.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		rand_heap_pages = pages_fetched;

		max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;

		/*
		 * In the perfectly correlated case, the number of pages touched by
		 * each scan is selectivity * table_size, and we can use the Mackert
		 * and Lohman formula at the page level to estimate how much work is
		 * saved by caching across scans.  We still assume all the fetches are
		 * random, though, which is an overestimate that's hard to correct for
		 * without double-counting the cache effects.  (But in most cases
		 * where such a plan is actually interesting, only one page would get
		 * fetched per scan anyway, so it shouldn't matter much.)
		 */
		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);

		pages_fetched = index_pages_fetched(pages_fetched * loop_count,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
	}
	else
	{
		/*
		 * Normal case: apply the Mackert and Lohman formula, and then
		 * interpolate between that and the correlation-derived result.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched,
											baserel->pages,
											(double) index->pages,
											root);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		rand_heap_pages = pages_fetched;

		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
		max_IO_cost = pages_fetched * spc_random_page_cost;

		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);

		if (indexonly)
			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));

		if (pages_fetched > 0)
		{
			/* first page is a random fetch; the rest are sequential */
			min_IO_cost = spc_random_page_cost;
			if (pages_fetched > 1)
				min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
		}
		else
			min_IO_cost = 0;
	}

	if (partial_path)
	{
		/*
		 * For index only scans compute workers based on number of index pages
		 * fetched; the number of heap pages we fetch might be so small as to
		 * effectively rule out parallelism, which we don't want to do.
		 */
		if (indexonly)
			rand_heap_pages = -1;

		/*
		 * Estimate the number of parallel workers required to scan index. Use
		 * the number of heap pages computed considering heap fetches won't be
		 * sequential as for parallel scans the pages are accessed in random
		 * order.
		 */
		path->path.parallel_workers = compute_parallel_worker(baserel,
															  rand_heap_pages,
															  index_pages,
															  max_parallel_workers_per_gather);

		/*
		 * Fall out if workers can't be assigned for parallel scan, because in
		 * such a case this path will be rejected.  So there is no benefit in
		 * doing extra computation.
		 */
		if (path->path.parallel_workers <= 0)
			return;

		path->path.parallel_aware = true;
	}

	/*
	 * Now interpolate based on estimated index order correlation to get total
	 * disk I/O cost for main table accesses.
	 */
	csquared = indexCorrelation * indexCorrelation;

	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * What we want here is cpu_tuple_cost plus the evaluation costs of any
	 * qual clauses that we have to evaluate as qpquals.
	 */
	cost_qual_eval(&qpqual_cost, qpquals, root);

	startup_cost += qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;

	cpu_run_cost += cpu_per_tuple * tuples_fetched;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->path.pathtarget->cost.startup;
	cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;

	/* Adjust costing for parallelism, if used. */
	if (path->path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->path);

		/* In a parallel plan, report the per-worker row count */
		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);

		/* The CPU cost is divided among all the workers. */
		cpu_run_cost /= parallel_divisor;
	}

	run_cost += cpu_run_cost;

	path->path.startup_cost = startup_cost;
	path->path.total_cost = startup_cost + run_cost;
}
749
750 /*
751 * extract_nonindex_conditions
752 *
753 * Given a list of quals to be enforced in an indexscan, extract the ones that
754 * will have to be applied as qpquals (ie, the index machinery won't handle
755 * them). Here we detect only whether a qual clause is directly redundant
756 * with some indexclause. If the index path is chosen for use, createplan.c
757 * will try a bit harder to get rid of redundant qual conditions; specifically
758 * it will see if quals can be proven to be implied by the indexquals. But
759 * it does not seem worth the cycles to try to factor that in at this stage,
760 * since we're only trying to estimate qual eval costs. Otherwise this must
761 * match the logic in create_indexscan_plan().
762 *
763 * qual_clauses, and the result, are lists of RestrictInfos.
764 * indexclauses is a list of IndexClauses.
765 */
766 static List *
extract_nonindex_conditions(List * qual_clauses,List * indexclauses)767 extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
768 {
769 List *result = NIL;
770 ListCell *lc;
771
772 foreach(lc, qual_clauses)
773 {
774 RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
775
776 if (rinfo->pseudoconstant)
777 continue; /* we may drop pseudoconstants here */
778 if (is_redundant_with_indexclauses(rinfo, indexclauses))
779 continue; /* dup or derived from same EquivalenceClass */
780 /* ... skip the predicate proof attempt createplan.c will try ... */
781 result = lappend(result, rinfo);
782 }
783 return result;
784 }
785
786 /*
787 * index_pages_fetched
788 * Estimate the number of pages actually fetched after accounting for
789 * cache effects.
790 *
791 * We use an approximation proposed by Mackert and Lohman, "Index Scans
792 * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
793 * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
794 * The Mackert and Lohman approximation is that the number of pages
795 * fetched is
796 * PF =
797 * min(2TNs/(2T+Ns), T) when T <= b
798 * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b)
799 * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b)
800 * where
801 * T = # pages in table
802 * N = # tuples in table
803 * s = selectivity = fraction of table to be scanned
804 * b = # buffer pages available (we include kernel space here)
805 *
806 * We assume that effective_cache_size is the total number of buffer pages
807 * available for the whole query, and pro-rate that space across all the
808 * tables in the query and the index currently under consideration. (This
809 * ignores space needed for other indexes used by the query, but since we
810 * don't know which indexes will get used, we can't estimate that very well;
811 * and in any case counting all the tables may well be an overestimate, since
812 * depending on the join plan not all the tables may be scanned concurrently.)
813 *
814 * The product Ns is the number of tuples fetched; we pass in that
815 * product rather than calculating it here. "pages" is the number of pages
816 * in the object under consideration (either an index or a table).
817 * "index_pages" is the amount to add to the total table space, which was
818 * computed for us by make_one_rel.
819 *
820 * Caller is expected to have ensured that tuples_fetched is greater than zero
821 * and rounded to integer (see clamp_row_est). The result will likewise be
822 * greater than zero and integral.
823 */
double
index_pages_fetched(double tuples_fetched, BlockNumber pages,
					double index_pages, PlannerInfo *root)
{
	double		pages_fetched;
	double		total_pages;
	double		T,				/* # pages in the object, clamped to >= 1 */
				b;				/* pro-rated cache pages, clamped to >= 1 */

	/* T is # pages in table, but don't allow it to be zero */
	T = (pages > 1) ? (double) pages : 1.0;

	/* Compute number of pages assumed to be competing for cache space */
	total_pages = root->total_table_pages + index_pages;
	total_pages = Max(total_pages, 1.0);
	Assert(T <= total_pages);

	/* b is pro-rated share of effective_cache_size */
	b = (double) effective_cache_size * T / total_pages;

	/* force it positive and integral */
	if (b <= 1.0)
		b = 1.0;
	else
		b = ceil(b);

	/* This part is the Mackert and Lohman formula */
	if (T <= b)
	{
		/*
		 * Whole object fits in (pro-rated) cache: use the first formula, but
		 * never report more pages than the object has.
		 */
		pages_fetched =
			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		if (pages_fetched >= T)
			pages_fetched = T;
		else
			pages_fetched = ceil(pages_fetched);
	}
	else
	{
		double		lim;

		/*
		 * lim = 2Tb/(2T-b) is the tuple count at which the cache is
		 * exhausted; past that point every additional fetch tends to miss.
		 */
		lim = (2.0 * T * b) / (2.0 * T - b);
		if (tuples_fetched <= lim)
		{
			pages_fetched =
				(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
		}
		else
		{
			pages_fetched =
				b + (tuples_fetched - lim) * (T - b) / T;
		}
		pages_fetched = ceil(pages_fetched);
	}
	return pages_fetched;
}
879
880 /*
881 * get_indexpath_pages
882 * Determine the total size of the indexes used in a bitmap index path.
883 *
884 * Note: if the same index is used more than once in a bitmap tree, we will
885 * count it multiple times, which perhaps is the wrong thing ... but it's
886 * not completely clear, and detecting duplicates is difficult, so ignore it
887 * for now.
888 */
889 static double
get_indexpath_pages(Path * bitmapqual)890 get_indexpath_pages(Path *bitmapqual)
891 {
892 double result = 0;
893 ListCell *l;
894
895 if (IsA(bitmapqual, BitmapAndPath))
896 {
897 BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
898
899 foreach(l, apath->bitmapquals)
900 {
901 result += get_indexpath_pages((Path *) lfirst(l));
902 }
903 }
904 else if (IsA(bitmapqual, BitmapOrPath))
905 {
906 BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
907
908 foreach(l, opath->bitmapquals)
909 {
910 result += get_indexpath_pages((Path *) lfirst(l));
911 }
912 }
913 else if (IsA(bitmapqual, IndexPath))
914 {
915 IndexPath *ipath = (IndexPath *) bitmapqual;
916
917 result = (double) ipath->indexinfo->pages;
918 }
919 else
920 elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
921
922 return result;
923 }
924
/*
 * cost_bitmap_heap_scan
 *	  Determines and returns the cost of scanning a relation using a bitmap
 *	  index-then-heap plan.
 *
 * 'baserel' is the relation to be scanned
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
 * 'loop_count' is the number of repetitions of the indexscan to factor into
 *		estimates of caching behavior
 *
 * Note: the component IndexPaths in bitmapqual should have been costed
 * using the same loop_count.
 */
void
cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
					  ParamPathInfo *param_info,
					  Path *bitmapqual, double loop_count)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	Cost		indexTotalCost;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	Cost		cost_per_page;
	Cost		cpu_run_cost;
	double		tuples_fetched;
	double		pages_fetched;
	double		spc_seq_page_cost,
				spc_random_page_cost;
	double		T;

	/* Should only be applied to base relations */
	Assert(IsA(baserel, RelOptInfo));
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	if (!enable_bitmapscan)
		startup_cost += disable_cost;

	/* Estimate heap pages/tuples fetched, and total cost of the index side */
	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
										 loop_count, &indexTotalCost,
										 &tuples_fetched);

	/* All the index work must happen before the first heap tuple comes out */
	startup_cost += indexTotalCost;
	/* T is # pages in table, but don't allow it to be zero */
	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;

	/* Fetch estimated page costs for tablespace containing table. */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  &spc_seq_page_cost);

	/*
	 * For small numbers of pages we should charge spc_random_page_cost
	 * apiece, while if nearly all the table's pages are being read, it's more
	 * appropriate to charge spc_seq_page_cost apiece.  The effect is
	 * nonlinear, too.  For lack of a better idea, interpolate like this to
	 * determine the cost per page.
	 */
	if (pages_fetched >= 2.0)
		cost_per_page = spc_random_page_cost -
			(spc_random_page_cost - spc_seq_page_cost)
			* sqrt(pages_fetched / T);
	else
		cost_per_page = spc_random_page_cost;

	run_cost += pages_fetched * cost_per_page;

	/*
	 * Estimate CPU costs per tuple.
	 *
	 * Often the indexquals don't need to be rechecked at each tuple ... but
	 * not always, especially not if there are enough tuples involved that the
	 * bitmaps become lossy.  For the moment, just assume they will be
	 * rechecked always.  This means we charge the full freight for all the
	 * scan clauses.
	 */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	startup_cost += qpqual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
	cpu_run_cost = cpu_per_tuple * tuples_fetched;

	/* Adjust costing for parallelism, if used. */
	if (path->parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(path);

		/* The CPU cost is divided among all the workers. */
		cpu_run_cost /= parallel_divisor;

		/* Each worker emits only its share of the rows */
		path->rows = clamp_row_est(path->rows / parallel_divisor);
	}


	run_cost += cpu_run_cost;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	run_cost += path->pathtarget->cost.per_tuple * path->rows;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1035
1036 /*
1037 * cost_bitmap_tree_node
1038 * Extract cost and selectivity from a bitmap tree node (index/and/or)
1039 */
1040 void
cost_bitmap_tree_node(Path * path,Cost * cost,Selectivity * selec)1041 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1042 {
1043 if (IsA(path, IndexPath))
1044 {
1045 *cost = ((IndexPath *) path)->indextotalcost;
1046 *selec = ((IndexPath *) path)->indexselectivity;
1047
1048 /*
1049 * Charge a small amount per retrieved tuple to reflect the costs of
1050 * manipulating the bitmap. This is mostly to make sure that a bitmap
1051 * scan doesn't look to be the same cost as an indexscan to retrieve a
1052 * single tuple.
1053 */
1054 *cost += 0.1 * cpu_operator_cost * path->rows;
1055 }
1056 else if (IsA(path, BitmapAndPath))
1057 {
1058 *cost = path->total_cost;
1059 *selec = ((BitmapAndPath *) path)->bitmapselectivity;
1060 }
1061 else if (IsA(path, BitmapOrPath))
1062 {
1063 *cost = path->total_cost;
1064 *selec = ((BitmapOrPath *) path)->bitmapselectivity;
1065 }
1066 else
1067 {
1068 elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1069 *cost = *selec = 0; /* keep compiler quiet */
1070 }
1071 }
1072
1073 /*
1074 * cost_bitmap_and_node
1075 * Estimate the cost of a BitmapAnd node
1076 *
1077 * Note that this considers only the costs of index scanning and bitmap
1078 * creation, not the eventual heap access. In that sense the object isn't
1079 * truly a Path, but it has enough path-like properties (costs in particular)
1080 * to warrant treating it as one. We don't bother to set the path rows field,
1081 * however.
1082 */
1083 void
cost_bitmap_and_node(BitmapAndPath * path,PlannerInfo * root)1084 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1085 {
1086 Cost totalCost;
1087 Selectivity selec;
1088 ListCell *l;
1089
1090 /*
1091 * We estimate AND selectivity on the assumption that the inputs are
1092 * independent. This is probably often wrong, but we don't have the info
1093 * to do better.
1094 *
1095 * The runtime cost of the BitmapAnd itself is estimated at 100x
1096 * cpu_operator_cost for each tbm_intersect needed. Probably too small,
1097 * definitely too simplistic?
1098 */
1099 totalCost = 0.0;
1100 selec = 1.0;
1101 foreach(l, path->bitmapquals)
1102 {
1103 Path *subpath = (Path *) lfirst(l);
1104 Cost subCost;
1105 Selectivity subselec;
1106
1107 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1108
1109 selec *= subselec;
1110
1111 totalCost += subCost;
1112 if (l != list_head(path->bitmapquals))
1113 totalCost += 100.0 * cpu_operator_cost;
1114 }
1115 path->bitmapselectivity = selec;
1116 path->path.rows = 0; /* per above, not used */
1117 path->path.startup_cost = totalCost;
1118 path->path.total_cost = totalCost;
1119 }
1120
1121 /*
1122 * cost_bitmap_or_node
1123 * Estimate the cost of a BitmapOr node
1124 *
1125 * See comments for cost_bitmap_and_node.
1126 */
1127 void
cost_bitmap_or_node(BitmapOrPath * path,PlannerInfo * root)1128 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1129 {
1130 Cost totalCost;
1131 Selectivity selec;
1132 ListCell *l;
1133
1134 /*
1135 * We estimate OR selectivity on the assumption that the inputs are
1136 * non-overlapping, since that's often the case in "x IN (list)" type
1137 * situations. Of course, we clamp to 1.0 at the end.
1138 *
1139 * The runtime cost of the BitmapOr itself is estimated at 100x
1140 * cpu_operator_cost for each tbm_union needed. Probably too small,
1141 * definitely too simplistic? We are aware that the tbm_unions are
1142 * optimized out when the inputs are BitmapIndexScans.
1143 */
1144 totalCost = 0.0;
1145 selec = 0.0;
1146 foreach(l, path->bitmapquals)
1147 {
1148 Path *subpath = (Path *) lfirst(l);
1149 Cost subCost;
1150 Selectivity subselec;
1151
1152 cost_bitmap_tree_node(subpath, &subCost, &subselec);
1153
1154 selec += subselec;
1155
1156 totalCost += subCost;
1157 if (l != list_head(path->bitmapquals) &&
1158 !IsA(subpath, IndexPath))
1159 totalCost += 100.0 * cpu_operator_cost;
1160 }
1161 path->bitmapselectivity = Min(selec, 1.0);
1162 path->path.rows = 0; /* per above, not used */
1163 path->path.startup_cost = totalCost;
1164 path->path.total_cost = totalCost;
1165 }
1166
/*
 * cost_tidscan
 *		Determines and returns the cost of scanning a relation using TIDs.
 *
 * 'baserel' is the relation to be scanned
 * 'tidquals' is the list of TID-checkable quals
 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
 */
void
cost_tidscan(Path *path, PlannerInfo *root,
			 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	bool		isCurrentOf = false;
	QualCost	qpqual_cost;
	Cost		cpu_per_tuple;
	QualCost	tid_qual_cost;
	int			ntuples;		/* # tuples the TID quals can match */
	ListCell   *l;
	double		spc_random_page_cost;

	/* Should only be applied to base relations */
	Assert(baserel->relid > 0);
	Assert(baserel->rtekind == RTE_RELATION);

	/* Mark the path with the correct row estimate */
	if (param_info)
		path->rows = param_info->ppi_rows;
	else
		path->rows = baserel->rows;

	/* Count how many tuples we expect to retrieve */
	ntuples = 0;
	foreach(l, tidquals)
	{
		RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
		Expr	   *qual = rinfo->clause;

		if (IsA(qual, ScalarArrayOpExpr))
		{
			/* Each element of the array yields 1 tuple */
			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
			Node	   *arraynode = (Node *) lsecond(saop->args);

			ntuples += estimate_array_length(arraynode);
		}
		else if (IsA(qual, CurrentOfExpr))
		{
			/* CURRENT OF yields 1 tuple */
			isCurrentOf = true;
			ntuples++;
		}
		else
		{
			/* It's just CTID = something, count 1 tuple */
			ntuples++;
		}
	}

	/*
	 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
	 * understands how to do it correctly.  Therefore, honor enable_tidscan
	 * only when CURRENT OF isn't present.  Also note that cost_qual_eval
	 * counts a CurrentOfExpr as having startup cost disable_cost, which we
	 * subtract off here; that's to prevent other plan types such as seqscan
	 * from winning.
	 */
	if (isCurrentOf)
	{
		Assert(baserel->baserestrictcost.startup >= disable_cost);
		startup_cost -= disable_cost;
	}
	else if (!enable_tidscan)
		startup_cost += disable_cost;

	/*
	 * The TID qual expressions will be computed once, any other baserestrict
	 * quals once per retrieved tuple.
	 */
	cost_qual_eval(&tid_qual_cost, tidquals, root);

	/* fetch estimated page cost for tablespace containing table */
	get_tablespace_page_costs(baserel->reltablespace,
							  &spc_random_page_cost,
							  NULL);

	/* disk costs --- assume each tuple on a different page */
	run_cost += spc_random_page_cost * ntuples;

	/* Add scanning CPU costs */
	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);

	/*
	 * XXX currently we assume TID quals are a subset of qpquals, so the TID
	 * quals' per-tuple cost is subtracted back out of cpu_per_tuple below and
	 * charged once at startup instead.
	 */
	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
		tid_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * ntuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->pathtarget->cost.startup;
	run_cost += path->pathtarget->cost.per_tuple * path->rows;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1273
1274 /*
1275 * cost_subqueryscan
1276 * Determines and returns the cost of scanning a subquery RTE.
1277 *
1278 * 'baserel' is the relation to be scanned
1279 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1280 */
1281 void
cost_subqueryscan(SubqueryScanPath * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1282 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1283 RelOptInfo *baserel, ParamPathInfo *param_info)
1284 {
1285 Cost startup_cost;
1286 Cost run_cost;
1287 QualCost qpqual_cost;
1288 Cost cpu_per_tuple;
1289
1290 /* Should only be applied to base relations that are subqueries */
1291 Assert(baserel->relid > 0);
1292 Assert(baserel->rtekind == RTE_SUBQUERY);
1293
1294 /* Mark the path with the correct row estimate */
1295 if (param_info)
1296 path->path.rows = param_info->ppi_rows;
1297 else
1298 path->path.rows = baserel->rows;
1299
1300 /*
1301 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1302 * any restriction clauses and tlist that will be attached to the
1303 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1304 * projection overhead.
1305 */
1306 path->path.startup_cost = path->subpath->startup_cost;
1307 path->path.total_cost = path->subpath->total_cost;
1308
1309 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1310
1311 startup_cost = qpqual_cost.startup;
1312 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1313 run_cost = cpu_per_tuple * baserel->tuples;
1314
1315 /* tlist eval costs are paid per output row, not per tuple scanned */
1316 startup_cost += path->path.pathtarget->cost.startup;
1317 run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1318
1319 path->path.startup_cost += startup_cost;
1320 path->path.total_cost += startup_cost + run_cost;
1321 }
1322
1323 /*
1324 * cost_functionscan
1325 * Determines and returns the cost of scanning a function RTE.
1326 *
1327 * 'baserel' is the relation to be scanned
1328 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1329 */
1330 void
cost_functionscan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1331 cost_functionscan(Path *path, PlannerInfo *root,
1332 RelOptInfo *baserel, ParamPathInfo *param_info)
1333 {
1334 Cost startup_cost = 0;
1335 Cost run_cost = 0;
1336 QualCost qpqual_cost;
1337 Cost cpu_per_tuple;
1338 RangeTblEntry *rte;
1339 QualCost exprcost;
1340
1341 /* Should only be applied to base relations that are functions */
1342 Assert(baserel->relid > 0);
1343 rte = planner_rt_fetch(baserel->relid, root);
1344 Assert(rte->rtekind == RTE_FUNCTION);
1345
1346 /* Mark the path with the correct row estimate */
1347 if (param_info)
1348 path->rows = param_info->ppi_rows;
1349 else
1350 path->rows = baserel->rows;
1351
1352 /*
1353 * Estimate costs of executing the function expression(s).
1354 *
1355 * Currently, nodeFunctionscan.c always executes the functions to
1356 * completion before returning any rows, and caches the results in a
1357 * tuplestore. So the function eval cost is all startup cost, and per-row
1358 * costs are minimal.
1359 *
1360 * XXX in principle we ought to charge tuplestore spill costs if the
1361 * number of rows is large. However, given how phony our rowcount
1362 * estimates for functions tend to be, there's not a lot of point in that
1363 * refinement right now.
1364 */
1365 cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1366
1367 startup_cost += exprcost.startup + exprcost.per_tuple;
1368
1369 /* Add scanning CPU costs */
1370 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1371
1372 startup_cost += qpqual_cost.startup;
1373 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1374 run_cost += cpu_per_tuple * baserel->tuples;
1375
1376 /* tlist eval costs are paid per output row, not per tuple scanned */
1377 startup_cost += path->pathtarget->cost.startup;
1378 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1379
1380 path->startup_cost = startup_cost;
1381 path->total_cost = startup_cost + run_cost;
1382 }
1383
1384 /*
1385 * cost_tablefuncscan
1386 * Determines and returns the cost of scanning a table function.
1387 *
1388 * 'baserel' is the relation to be scanned
1389 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1390 */
1391 void
cost_tablefuncscan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1392 cost_tablefuncscan(Path *path, PlannerInfo *root,
1393 RelOptInfo *baserel, ParamPathInfo *param_info)
1394 {
1395 Cost startup_cost = 0;
1396 Cost run_cost = 0;
1397 QualCost qpqual_cost;
1398 Cost cpu_per_tuple;
1399 RangeTblEntry *rte;
1400 QualCost exprcost;
1401
1402 /* Should only be applied to base relations that are functions */
1403 Assert(baserel->relid > 0);
1404 rte = planner_rt_fetch(baserel->relid, root);
1405 Assert(rte->rtekind == RTE_TABLEFUNC);
1406
1407 /* Mark the path with the correct row estimate */
1408 if (param_info)
1409 path->rows = param_info->ppi_rows;
1410 else
1411 path->rows = baserel->rows;
1412
1413 /*
1414 * Estimate costs of executing the table func expression(s).
1415 *
1416 * XXX in principle we ought to charge tuplestore spill costs if the
1417 * number of rows is large. However, given how phony our rowcount
1418 * estimates for tablefuncs tend to be, there's not a lot of point in that
1419 * refinement right now.
1420 */
1421 cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1422
1423 startup_cost += exprcost.startup + exprcost.per_tuple;
1424
1425 /* Add scanning CPU costs */
1426 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1427
1428 startup_cost += qpqual_cost.startup;
1429 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1430 run_cost += cpu_per_tuple * baserel->tuples;
1431
1432 /* tlist eval costs are paid per output row, not per tuple scanned */
1433 startup_cost += path->pathtarget->cost.startup;
1434 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1435
1436 path->startup_cost = startup_cost;
1437 path->total_cost = startup_cost + run_cost;
1438 }
1439
1440 /*
1441 * cost_valuesscan
1442 * Determines and returns the cost of scanning a VALUES RTE.
1443 *
1444 * 'baserel' is the relation to be scanned
1445 * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1446 */
1447 void
cost_valuesscan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1448 cost_valuesscan(Path *path, PlannerInfo *root,
1449 RelOptInfo *baserel, ParamPathInfo *param_info)
1450 {
1451 Cost startup_cost = 0;
1452 Cost run_cost = 0;
1453 QualCost qpqual_cost;
1454 Cost cpu_per_tuple;
1455
1456 /* Should only be applied to base relations that are values lists */
1457 Assert(baserel->relid > 0);
1458 Assert(baserel->rtekind == RTE_VALUES);
1459
1460 /* Mark the path with the correct row estimate */
1461 if (param_info)
1462 path->rows = param_info->ppi_rows;
1463 else
1464 path->rows = baserel->rows;
1465
1466 /*
1467 * For now, estimate list evaluation cost at one operator eval per list
1468 * (probably pretty bogus, but is it worth being smarter?)
1469 */
1470 cpu_per_tuple = cpu_operator_cost;
1471
1472 /* Add scanning CPU costs */
1473 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1474
1475 startup_cost += qpqual_cost.startup;
1476 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1477 run_cost += cpu_per_tuple * baserel->tuples;
1478
1479 /* tlist eval costs are paid per output row, not per tuple scanned */
1480 startup_cost += path->pathtarget->cost.startup;
1481 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1482
1483 path->startup_cost = startup_cost;
1484 path->total_cost = startup_cost + run_cost;
1485 }
1486
1487 /*
1488 * cost_ctescan
1489 * Determines and returns the cost of scanning a CTE RTE.
1490 *
1491 * Note: this is used for both self-reference and regular CTEs; the
1492 * possible cost differences are below the threshold of what we could
1493 * estimate accurately anyway. Note that the costs of evaluating the
1494 * referenced CTE query are added into the final plan as initplan costs,
1495 * and should NOT be counted here.
1496 */
1497 void
cost_ctescan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1498 cost_ctescan(Path *path, PlannerInfo *root,
1499 RelOptInfo *baserel, ParamPathInfo *param_info)
1500 {
1501 Cost startup_cost = 0;
1502 Cost run_cost = 0;
1503 QualCost qpqual_cost;
1504 Cost cpu_per_tuple;
1505
1506 /* Should only be applied to base relations that are CTEs */
1507 Assert(baserel->relid > 0);
1508 Assert(baserel->rtekind == RTE_CTE);
1509
1510 /* Mark the path with the correct row estimate */
1511 if (param_info)
1512 path->rows = param_info->ppi_rows;
1513 else
1514 path->rows = baserel->rows;
1515
1516 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1517 cpu_per_tuple = cpu_tuple_cost;
1518
1519 /* Add scanning CPU costs */
1520 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1521
1522 startup_cost += qpqual_cost.startup;
1523 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1524 run_cost += cpu_per_tuple * baserel->tuples;
1525
1526 /* tlist eval costs are paid per output row, not per tuple scanned */
1527 startup_cost += path->pathtarget->cost.startup;
1528 run_cost += path->pathtarget->cost.per_tuple * path->rows;
1529
1530 path->startup_cost = startup_cost;
1531 path->total_cost = startup_cost + run_cost;
1532 }
1533
1534 /*
1535 * cost_namedtuplestorescan
1536 * Determines and returns the cost of scanning a named tuplestore.
1537 */
1538 void
cost_namedtuplestorescan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1539 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1540 RelOptInfo *baserel, ParamPathInfo *param_info)
1541 {
1542 Cost startup_cost = 0;
1543 Cost run_cost = 0;
1544 QualCost qpqual_cost;
1545 Cost cpu_per_tuple;
1546
1547 /* Should only be applied to base relations that are Tuplestores */
1548 Assert(baserel->relid > 0);
1549 Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1550
1551 /* Mark the path with the correct row estimate */
1552 if (param_info)
1553 path->rows = param_info->ppi_rows;
1554 else
1555 path->rows = baserel->rows;
1556
1557 /* Charge one CPU tuple cost per row for tuplestore manipulation */
1558 cpu_per_tuple = cpu_tuple_cost;
1559
1560 /* Add scanning CPU costs */
1561 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1562
1563 startup_cost += qpqual_cost.startup;
1564 cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1565 run_cost += cpu_per_tuple * baserel->tuples;
1566
1567 path->startup_cost = startup_cost;
1568 path->total_cost = startup_cost + run_cost;
1569 }
1570
1571 /*
1572 * cost_resultscan
1573 * Determines and returns the cost of scanning an RTE_RESULT relation.
1574 */
1575 void
cost_resultscan(Path * path,PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info)1576 cost_resultscan(Path *path, PlannerInfo *root,
1577 RelOptInfo *baserel, ParamPathInfo *param_info)
1578 {
1579 Cost startup_cost = 0;
1580 Cost run_cost = 0;
1581 QualCost qpqual_cost;
1582 Cost cpu_per_tuple;
1583
1584 /* Should only be applied to RTE_RESULT base relations */
1585 Assert(baserel->relid > 0);
1586 Assert(baserel->rtekind == RTE_RESULT);
1587
1588 /* Mark the path with the correct row estimate */
1589 if (param_info)
1590 path->rows = param_info->ppi_rows;
1591 else
1592 path->rows = baserel->rows;
1593
1594 /* We charge qual cost plus cpu_tuple_cost */
1595 get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1596
1597 startup_cost += qpqual_cost.startup;
1598 cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1599 run_cost += cpu_per_tuple * baserel->tuples;
1600
1601 path->startup_cost = startup_cost;
1602 path->total_cost = startup_cost + run_cost;
1603 }
1604
1605 /*
1606 * cost_recursive_union
1607 * Determines and returns the cost of performing a recursive union,
1608 * and also the estimated output size.
1609 *
1610 * We are given Paths for the nonrecursive and recursive terms.
1611 */
1612 void
cost_recursive_union(Path * runion,Path * nrterm,Path * rterm)1613 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1614 {
1615 Cost startup_cost;
1616 Cost total_cost;
1617 double total_rows;
1618
1619 /* We probably have decent estimates for the non-recursive term */
1620 startup_cost = nrterm->startup_cost;
1621 total_cost = nrterm->total_cost;
1622 total_rows = nrterm->rows;
1623
1624 /*
1625 * We arbitrarily assume that about 10 recursive iterations will be
1626 * needed, and that we've managed to get a good fix on the cost and output
1627 * size of each one of them. These are mighty shaky assumptions but it's
1628 * hard to see how to do better.
1629 */
1630 total_cost += 10 * rterm->total_cost;
1631 total_rows += 10 * rterm->rows;
1632
1633 /*
1634 * Also charge cpu_tuple_cost per row to account for the costs of
1635 * manipulating the tuplestores. (We don't worry about possible
1636 * spill-to-disk costs.)
1637 */
1638 total_cost += cpu_tuple_cost * total_rows;
1639
1640 runion->startup_cost = startup_cost;
1641 runion->total_cost = total_cost;
1642 runion->rows = total_rows;
1643 runion->pathtarget->width = Max(nrterm->pathtarget->width,
1644 rterm->pathtarget->width);
1645 }
1646
/*
 * cost_sort
 *	  Determines and returns the cost of sorting a relation, including
 *	  the cost of reading the input data.
 *
 * If the total volume of data to sort is less than sort_mem, we will do
 * an in-memory sort, which requires no I/O and about t*log2(t) tuple
 * comparisons for t tuples.
 *
 * If the total volume exceeds sort_mem, we switch to a tape-style merge
 * algorithm.  There will still be about t*log2(t) tuple comparisons in
 * total, but we will also need to write and read each tuple once per
 * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
 * number of initial runs formed and M is the merge order used by tuplesort.c.
 * Since the average initial run should be about sort_mem, we have
 *		disk traffic = 2 * relsize * ceil(logM(p / sort_mem))
 *		cpu = comparison_cost * t * log2(t)
 *
 * If the sort is bounded (i.e., only the first k result tuples are needed)
 * and k tuples can fit into sort_mem, we use a heap method that keeps only
 * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
 *
 * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
 * accesses (XXX can't we refine that guess?)
 *
 * By default, we charge two operator evals per tuple comparison, which should
 * be in the right ballpark in most cases.  The caller can tweak this by
 * specifying nonzero comparison_cost; typically that's used for any extra
 * work that has to be done to prepare the inputs to the comparison operators.
 *
 * 'pathkeys' is a list of sort keys
 * 'input_cost' is the total cost for reading the input data
 * 'tuples' is the number of tuples in the relation
 * 'width' is the average tuple width in bytes
 * 'comparison_cost' is the extra cost per comparison, if any
 * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
 * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
 *
 * NOTE: some callers currently pass NIL for pathkeys because they
 * can't conveniently supply the sort keys.  Since this routine doesn't
 * currently do anything with pathkeys anyway, that doesn't matter...
 * but if it ever does, it should react gracefully to lack of key data.
 * (Actually, the thing we'd most likely be interested in is just the number
 * of sort keys, which all callers *could* supply.)
 */
void
cost_sort(Path *path, PlannerInfo *root,
		  List *pathkeys, Cost input_cost, double tuples, int width,
		  Cost comparison_cost, int sort_mem,
		  double limit_tuples)
{
	Cost		startup_cost = input_cost;
	Cost		run_cost = 0;
	double		input_bytes = relation_byte_size(tuples, width);
	double		output_bytes;
	double		output_tuples;
	long		sort_mem_bytes = sort_mem * 1024L;

	if (!enable_sort)
		startup_cost += disable_cost;

	path->rows = tuples;

	/*
	 * We want to be sure the cost of a sort is never estimated as zero, even
	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
	 */
	if (tuples < 2.0)
		tuples = 2.0;

	/* Include the default cost-per-comparison */
	comparison_cost += 2.0 * cpu_operator_cost;

	/* Do we have a useful LIMIT? */
	if (limit_tuples > 0 && limit_tuples < tuples)
	{
		output_tuples = limit_tuples;
		output_bytes = relation_byte_size(output_tuples, width);
	}
	else
	{
		output_tuples = tuples;
		output_bytes = input_bytes;
	}

	if (output_bytes > sort_mem_bytes)
	{
		/*
		 * We'll have to use a disk-based sort of all the tuples
		 */
		double		npages = ceil(input_bytes / BLCKSZ);
		double		nruns = input_bytes / sort_mem_bytes;
		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
		double		log_runs;
		double		npageaccesses;

		/*
		 * CPU costs
		 *
		 * Assume about N log2 N comparisons
		 */
		startup_cost += comparison_cost * tuples * LOG2(tuples);

		/* Disk costs */

		/* Compute logM(r) as log(r) / log(M) */
		if (nruns > mergeorder)
			log_runs = ceil(log(nruns) / log(mergeorder));
		else
			log_runs = 1.0;		/* everything merges in a single pass */
		npageaccesses = 2.0 * npages * log_runs;
		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
		startup_cost += npageaccesses *
			(seq_page_cost * 0.75 + random_page_cost * 0.25);
	}
	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
	{
		/*
		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
		 * a total number of tuple comparisons of N log2 K; but the constant
		 * factor is a bit higher than for quicksort.  Tweak it so that the
		 * cost curve is continuous at the crossover point.
		 */
		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
	}
	else
	{
		/* We'll use plain quicksort on all the input tuples */
		startup_cost += comparison_cost * tuples * LOG2(tuples);
	}

	/*
	 * Also charge a small amount (arbitrarily set equal to operator cost) per
	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
	 * doesn't do qual-checking or projection, so it has less overhead than
	 * most plan nodes.  Note it's correct to use tuples not output_tuples
	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
	 * counting the LIMIT otherwise.
	 */
	run_cost += cpu_operator_cost * tuples;

	path->startup_cost = startup_cost;
	path->total_cost = startup_cost + run_cost;
}
1791
1792 /*
1793 * append_nonpartial_cost
1794 * Estimate the cost of the non-partial paths in a Parallel Append.
1795 * The non-partial paths are assumed to be the first "numpaths" paths
1796 * from the subpaths list, and to be in order of decreasing cost.
1797 */
1798 static Cost
append_nonpartial_cost(List * subpaths,int numpaths,int parallel_workers)1799 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1800 {
1801 Cost *costarr;
1802 int arrlen;
1803 ListCell *l;
1804 ListCell *cell;
1805 int i;
1806 int path_index;
1807 int min_index;
1808 int max_index;
1809
1810 if (numpaths == 0)
1811 return 0;
1812
1813 /*
1814 * Array length is number of workers or number of relevants paths,
1815 * whichever is less.
1816 */
1817 arrlen = Min(parallel_workers, numpaths);
1818 costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1819
1820 /* The first few paths will each be claimed by a different worker. */
1821 path_index = 0;
1822 foreach(cell, subpaths)
1823 {
1824 Path *subpath = (Path *) lfirst(cell);
1825
1826 if (path_index == arrlen)
1827 break;
1828 costarr[path_index++] = subpath->total_cost;
1829 }
1830
1831 /*
1832 * Since subpaths are sorted by decreasing cost, the last one will have
1833 * the minimum cost.
1834 */
1835 min_index = arrlen - 1;
1836
1837 /*
1838 * For each of the remaining subpaths, add its cost to the array element
1839 * with minimum cost.
1840 */
1841 for_each_cell(l, cell)
1842 {
1843 Path *subpath = (Path *) lfirst(l);
1844 int i;
1845
1846 /* Consider only the non-partial paths */
1847 if (path_index++ == numpaths)
1848 break;
1849
1850 costarr[min_index] += subpath->total_cost;
1851
1852 /* Update the new min cost array index */
1853 for (min_index = i = 0; i < arrlen; i++)
1854 {
1855 if (costarr[i] < costarr[min_index])
1856 min_index = i;
1857 }
1858 }
1859
1860 /* Return the highest cost from the array */
1861 for (max_index = i = 0; i < arrlen; i++)
1862 {
1863 if (costarr[i] > costarr[max_index])
1864 max_index = i;
1865 }
1866
1867 return costarr[max_index];
1868 }
1869
1870 /*
1871 * cost_append
1872 * Determines and returns the cost of an Append node.
1873 */
void
cost_append(AppendPath *apath)
{
	ListCell   *l;

	apath->path.startup_cost = 0;
	apath->path.total_cost = 0;
	apath->path.rows = 0;

	/* An Append with no children produces no tuples and costs nothing. */
	if (apath->subpaths == NIL)
		return;

	if (!apath->path.parallel_aware)
	{
		List	   *pathkeys = apath->path.pathkeys;

		if (pathkeys == NIL)
		{
			Path	   *subpath = (Path *) linitial(apath->subpaths);

			/*
			 * For an unordered, non-parallel-aware Append we take the startup
			 * cost as the startup cost of the first subpath.
			 */
			apath->path.startup_cost = subpath->startup_cost;

			/* Compute rows and costs as sums of subplan rows and costs. */
			foreach(l, apath->subpaths)
			{
				Path	   *subpath = (Path *) lfirst(l);

				apath->path.rows += subpath->rows;
				apath->path.total_cost += subpath->total_cost;
			}
		}
		else
		{
			/*
			 * For an ordered, non-parallel-aware Append we take the startup
			 * cost as the sum of the subpath startup costs.  This ensures
			 * that we don't underestimate the startup cost when a query's
			 * LIMIT is such that several of the children have to be run to
			 * satisfy it.  This might be overkill --- another plausible hack
			 * would be to take the Append's startup cost as the maximum of
			 * the child startup costs.  But we don't want to risk believing
			 * that an ORDER BY LIMIT query can be satisfied at small cost
			 * when the first child has small startup cost but later ones
			 * don't.  (If we had the ability to deal with nonlinear cost
			 * interpolation for partial retrievals, we would not need to be
			 * so conservative about this.)
			 *
			 * This case is also different from the above in that we have to
			 * account for possibly injecting sorts into subpaths that aren't
			 * natively ordered.
			 */
			foreach(l, apath->subpaths)
			{
				Path	   *subpath = (Path *) lfirst(l);
				Path		sort_path;	/* dummy for result of cost_sort */

				if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
				{
					/*
					 * We'll need to insert a Sort node, so include costs for
					 * that.  We can use the parent's LIMIT if any, since we
					 * certainly won't pull more than that many tuples from
					 * any child.
					 */
					cost_sort(&sort_path,
							  NULL, /* doesn't currently need root */
							  pathkeys,
							  subpath->total_cost,
							  subpath->rows,
							  subpath->pathtarget->width,
							  0.0,
							  work_mem,
							  apath->limit_tuples);
					subpath = &sort_path;
				}

				apath->path.rows += subpath->rows;
				apath->path.startup_cost += subpath->startup_cost;
				apath->path.total_cost += subpath->total_cost;
			}
		}
	}
	else						/* parallel-aware */
	{
		int			i = 0;
		double		parallel_divisor = get_parallel_divisor(&apath->path);

		/* Parallel-aware Append never produces ordered output. */
		Assert(apath->path.pathkeys == NIL);

		/* Calculate startup cost. */
		foreach(l, apath->subpaths)
		{
			Path	   *subpath = (Path *) lfirst(l);

			/*
			 * Append will start returning tuples when the child node having
			 * lowest startup cost is done setting up.  We consider only the
			 * first few subplans that immediately get a worker assigned.
			 */
			if (i == 0)
				apath->path.startup_cost = subpath->startup_cost;
			else if (i < apath->path.parallel_workers)
				apath->path.startup_cost = Min(apath->path.startup_cost,
											   subpath->startup_cost);

			/*
			 * Apply parallel divisor to subpaths.  Scale the number of rows
			 * for each partial subpath based on the ratio of the parallel
			 * divisor originally used for the subpath to the one we adopted.
			 * Also add the cost of partial paths to the total cost, but
			 * ignore non-partial paths for now.
			 */
			if (i < apath->first_partial_path)
				apath->path.rows += subpath->rows / parallel_divisor;
			else
			{
				double		subpath_parallel_divisor;

				subpath_parallel_divisor = get_parallel_divisor(subpath);
				apath->path.rows += subpath->rows * (subpath_parallel_divisor /
													 parallel_divisor);
				apath->path.total_cost += subpath->total_cost;
			}

			/* NB: the row estimate is re-clamped on every iteration */
			apath->path.rows = clamp_row_est(apath->path.rows);

			i++;
		}

		/* Add cost for non-partial subpaths. */
		apath->path.total_cost +=
			append_nonpartial_cost(apath->subpaths,
								   apath->first_partial_path,
								   apath->path.parallel_workers);
	}

	/*
	 * Although Append does not do any selection or projection, it's not free;
	 * add a small per-tuple overhead.
	 */
	apath->path.total_cost +=
		cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
}
2022
2023 /*
2024 * cost_merge_append
2025 * Determines and returns the cost of a MergeAppend node.
2026 *
2027 * MergeAppend merges several pre-sorted input streams, using a heap that
2028 * at any given instant holds the next tuple from each stream. If there
2029 * are N streams, we need about N*log2(N) tuple comparisons to construct
2030 * the heap at startup, and then for each output tuple, about log2(N)
2031 * comparisons to replace the top entry.
2032 *
2033 * (The effective value of N will drop once some of the input streams are
2034 * exhausted, but it seems unlikely to be worth trying to account for that.)
2035 *
2036 * The heap is never spilled to disk, since we assume N is not very large.
2037 * So this is much simpler than cost_sort.
2038 *
2039 * As in cost_sort, we charge two operator evals per tuple comparison.
2040 *
2041 * 'pathkeys' is a list of sort keys
2042 * 'n_streams' is the number of input streams
2043 * 'input_startup_cost' is the sum of the input streams' startup costs
2044 * 'input_total_cost' is the sum of the input streams' total costs
2045 * 'tuples' is the number of tuples in all the streams
2046 */
2047 void
cost_merge_append(Path * path,PlannerInfo * root,List * pathkeys,int n_streams,Cost input_startup_cost,Cost input_total_cost,double tuples)2048 cost_merge_append(Path *path, PlannerInfo *root,
2049 List *pathkeys, int n_streams,
2050 Cost input_startup_cost, Cost input_total_cost,
2051 double tuples)
2052 {
2053 Cost startup_cost = 0;
2054 Cost run_cost = 0;
2055 Cost comparison_cost;
2056 double N;
2057 double logN;
2058
2059 /*
2060 * Avoid log(0)...
2061 */
2062 N = (n_streams < 2) ? 2.0 : (double) n_streams;
2063 logN = LOG2(N);
2064
2065 /* Assumed cost per tuple comparison */
2066 comparison_cost = 2.0 * cpu_operator_cost;
2067
2068 /* Heap creation cost */
2069 startup_cost += comparison_cost * N * logN;
2070
2071 /* Per-tuple heap maintenance cost */
2072 run_cost += tuples * comparison_cost * logN;
2073
2074 /*
2075 * Although MergeAppend does not do any selection or projection, it's not
2076 * free; add a small per-tuple overhead.
2077 */
2078 run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2079
2080 path->startup_cost = startup_cost + input_startup_cost;
2081 path->total_cost = startup_cost + run_cost + input_total_cost;
2082 }
2083
2084 /*
2085 * cost_material
2086 * Determines and returns the cost of materializing a relation, including
2087 * the cost of reading the input data.
2088 *
2089 * If the total volume of data to materialize exceeds work_mem, we will need
2090 * to write it to disk, so the cost is much higher in that case.
2091 *
2092 * Note that here we are estimating the costs for the first scan of the
2093 * relation, so the materialization is all overhead --- any savings will
2094 * occur only on rescan, which is estimated in cost_rescan.
2095 */
2096 void
cost_material(Path * path,Cost input_startup_cost,Cost input_total_cost,double tuples,int width)2097 cost_material(Path *path,
2098 Cost input_startup_cost, Cost input_total_cost,
2099 double tuples, int width)
2100 {
2101 Cost startup_cost = input_startup_cost;
2102 Cost run_cost = input_total_cost - input_startup_cost;
2103 double nbytes = relation_byte_size(tuples, width);
2104 long work_mem_bytes = work_mem * 1024L;
2105
2106 path->rows = tuples;
2107
2108 /*
2109 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2110 * reflect bookkeeping overhead. (This rate must be more than what
2111 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2112 * if it is exactly the same then there will be a cost tie between
2113 * nestloop with A outer, materialized B inner and nestloop with B outer,
2114 * materialized A inner. The extra cost ensures we'll prefer
2115 * materializing the smaller rel.) Note that this is normally a good deal
2116 * less than cpu_tuple_cost; which is OK because a Material plan node
2117 * doesn't do qual-checking or projection, so it's got less overhead than
2118 * most plan nodes.
2119 */
2120 run_cost += 2 * cpu_operator_cost * tuples;
2121
2122 /*
2123 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2124 * This cost is assumed to be evenly spread through the plan run phase,
2125 * which isn't exactly accurate but our cost model doesn't allow for
2126 * nonuniform costs within the run phase.
2127 */
2128 if (nbytes > work_mem_bytes)
2129 {
2130 double npages = ceil(nbytes / BLCKSZ);
2131
2132 run_cost += seq_page_cost * npages;
2133 }
2134
2135 path->startup_cost = startup_cost;
2136 path->total_cost = startup_cost + run_cost;
2137 }
2138
2139 /*
2140 * cost_agg
2141 * Determines and returns the cost of performing an Agg plan node,
2142 * including the cost of its input.
2143 *
2144 * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2145 * we are using a hashed Agg node just to do grouping).
2146 *
2147 * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2148 * are for appropriately-sorted input.
2149 */
void
cost_agg(Path *path, PlannerInfo *root,
		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
		 int numGroupCols, double numGroups,
		 List *quals,
		 Cost input_startup_cost, Cost input_total_cost,
		 double input_tuples)
{
	double		output_tuples;
	Cost		startup_cost;
	Cost		total_cost;
	AggClauseCosts dummy_aggcosts;	/* stack-local zeroed costs, see below */

	/* Use all-zero per-aggregate costs if NULL is passed */
	if (aggcosts == NULL)
	{
		Assert(aggstrategy == AGG_HASHED);
		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
		aggcosts = &dummy_aggcosts;
	}

	/*
	 * The transCost.per_tuple component of aggcosts should be charged once
	 * per input tuple, corresponding to the costs of evaluating the aggregate
	 * transfns and their input expressions. The finalCost.per_tuple component
	 * is charged once per output tuple, corresponding to the costs of
	 * evaluating the finalfns. Startup costs are of course charged but once.
	 *
	 * If we are grouping, we charge an additional cpu_operator_cost per
	 * grouping column per input tuple for grouping comparisons.
	 *
	 * We will produce a single output tuple if not grouping, and a tuple per
	 * group otherwise. We charge cpu_tuple_cost for each output tuple.
	 *
	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
	 * same total CPU cost, but AGG_SORTED has lower startup cost. If the
	 * input path is already sorted appropriately, AGG_SORTED should be
	 * preferred (since it has no risk of memory overflow). This will happen
	 * as long as the computed total costs are indeed exactly equal --- but if
	 * there's roundoff error we might do the wrong thing. So be sure that
	 * the computations below form the same intermediate values in the same
	 * order.
	 */
	if (aggstrategy == AGG_PLAIN)
	{
		/* No grouping: the whole input reduces to one output tuple. */
		startup_cost = input_total_cost;
		startup_cost += aggcosts->transCost.startup;
		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
		startup_cost += aggcosts->finalCost.startup;
		startup_cost += aggcosts->finalCost.per_tuple;
		/* we aren't grouping */
		total_cost = startup_cost + cpu_tuple_cost;
		output_tuples = 1;
	}
	else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
	{
		/* Here we are able to deliver output on-the-fly */
		startup_cost = input_startup_cost;
		total_cost = input_total_cost;
		if (aggstrategy == AGG_MIXED && !enable_hashagg)
		{
			startup_cost += disable_cost;
			total_cost += disable_cost;
		}
		/* calcs phrased this way to match HASHED case, see note above */
		total_cost += aggcosts->transCost.startup;
		total_cost += aggcosts->transCost.per_tuple * input_tuples;
		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
		total_cost += aggcosts->finalCost.startup;
		total_cost += aggcosts->finalCost.per_tuple * numGroups;
		total_cost += cpu_tuple_cost * numGroups;
		output_tuples = numGroups;
	}
	else
	{
		/* must be AGG_HASHED; no output until all input is consumed */
		startup_cost = input_total_cost;
		if (!enable_hashagg)
			startup_cost += disable_cost;
		startup_cost += aggcosts->transCost.startup;
		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
		startup_cost += aggcosts->finalCost.startup;
		total_cost = startup_cost;
		total_cost += aggcosts->finalCost.per_tuple * numGroups;
		total_cost += cpu_tuple_cost * numGroups;
		output_tuples = numGroups;
	}

	/*
	 * If there are quals (HAVING quals), account for their cost and
	 * selectivity.
	 */
	if (quals)
	{
		QualCost	qual_cost;

		cost_qual_eval(&qual_cost, quals, root);
		startup_cost += qual_cost.startup;
		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;

		/* scale output rows by the HAVING clauses' selectivity */
		output_tuples = clamp_row_est(output_tuples *
									  clauselist_selectivity(root,
															 quals,
															 0,
															 JOIN_INNER,
															 NULL));
	}

	path->rows = output_tuples;
	path->startup_cost = startup_cost;
	path->total_cost = total_cost;
}
2263
2264 /*
2265 * cost_windowagg
2266 * Determines and returns the cost of performing a WindowAgg plan node,
2267 * including the cost of its input.
2268 *
2269 * Input is assumed already properly sorted.
2270 */
2271 void
cost_windowagg(Path * path,PlannerInfo * root,List * windowFuncs,int numPartCols,int numOrderCols,Cost input_startup_cost,Cost input_total_cost,double input_tuples)2272 cost_windowagg(Path *path, PlannerInfo *root,
2273 List *windowFuncs, int numPartCols, int numOrderCols,
2274 Cost input_startup_cost, Cost input_total_cost,
2275 double input_tuples)
2276 {
2277 Cost startup_cost;
2278 Cost total_cost;
2279 ListCell *lc;
2280
2281 startup_cost = input_startup_cost;
2282 total_cost = input_total_cost;
2283
2284 /*
2285 * Window functions are assumed to cost their stated execution cost, plus
2286 * the cost of evaluating their input expressions, per tuple. Since they
2287 * may in fact evaluate their inputs at multiple rows during each cycle,
2288 * this could be a drastic underestimate; but without a way to know how
2289 * many rows the window function will fetch, it's hard to do better. In
2290 * any case, it's a good estimate for all the built-in window functions,
2291 * so we'll just do this for now.
2292 */
2293 foreach(lc, windowFuncs)
2294 {
2295 WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2296 Cost wfunccost;
2297 QualCost argcosts;
2298
2299 argcosts.startup = argcosts.per_tuple = 0;
2300 add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2301 &argcosts);
2302 startup_cost += argcosts.startup;
2303 wfunccost = argcosts.per_tuple;
2304
2305 /* also add the input expressions' cost to per-input-row costs */
2306 cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2307 startup_cost += argcosts.startup;
2308 wfunccost += argcosts.per_tuple;
2309
2310 /*
2311 * Add the filter's cost to per-input-row costs. XXX We should reduce
2312 * input expression costs according to filter selectivity.
2313 */
2314 cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2315 startup_cost += argcosts.startup;
2316 wfunccost += argcosts.per_tuple;
2317
2318 total_cost += wfunccost * input_tuples;
2319 }
2320
2321 /*
2322 * We also charge cpu_operator_cost per grouping column per tuple for
2323 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2324 * overhead.
2325 *
2326 * XXX this neglects costs of spooling the data to disk when it overflows
2327 * work_mem. Sooner or later that should get accounted for.
2328 */
2329 total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2330 total_cost += cpu_tuple_cost * input_tuples;
2331
2332 path->rows = input_tuples;
2333 path->startup_cost = startup_cost;
2334 path->total_cost = total_cost;
2335 }
2336
2337 /*
2338 * cost_group
2339 * Determines and returns the cost of performing a Group plan node,
2340 * including the cost of its input.
2341 *
2342 * Note: caller must ensure that input costs are for appropriately-sorted
2343 * input.
2344 */
2345 void
cost_group(Path * path,PlannerInfo * root,int numGroupCols,double numGroups,List * quals,Cost input_startup_cost,Cost input_total_cost,double input_tuples)2346 cost_group(Path *path, PlannerInfo *root,
2347 int numGroupCols, double numGroups,
2348 List *quals,
2349 Cost input_startup_cost, Cost input_total_cost,
2350 double input_tuples)
2351 {
2352 double output_tuples;
2353 Cost startup_cost;
2354 Cost total_cost;
2355
2356 output_tuples = numGroups;
2357 startup_cost = input_startup_cost;
2358 total_cost = input_total_cost;
2359
2360 /*
2361 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2362 * all columns get compared at most of the tuples.
2363 */
2364 total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2365
2366 /*
2367 * If there are quals (HAVING quals), account for their cost and
2368 * selectivity.
2369 */
2370 if (quals)
2371 {
2372 QualCost qual_cost;
2373
2374 cost_qual_eval(&qual_cost, quals, root);
2375 startup_cost += qual_cost.startup;
2376 total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2377
2378 output_tuples = clamp_row_est(output_tuples *
2379 clauselist_selectivity(root,
2380 quals,
2381 0,
2382 JOIN_INNER,
2383 NULL));
2384 }
2385
2386 path->rows = output_tuples;
2387 path->startup_cost = startup_cost;
2388 path->total_cost = total_cost;
2389 }
2390
2391 /*
2392 * initial_cost_nestloop
2393 * Preliminary estimate of the cost of a nestloop join path.
2394 *
2395 * This must quickly produce lower-bound estimates of the path's startup and
2396 * total costs. If we are unable to eliminate the proposed path from
2397 * consideration using the lower bounds, final_cost_nestloop will be called
2398 * to obtain the final estimates.
2399 *
2400 * The exact division of labor between this function and final_cost_nestloop
2401 * is private to them, and represents a tradeoff between speed of the initial
2402 * estimate and getting a tight lower bound. We choose to not examine the
2403 * join quals here, since that's by far the most expensive part of the
2404 * calculations. The end result is that CPU-cost considerations must be
2405 * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2406 * incorporation of the inner path's run cost.
2407 *
2408 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2409 * other data to be used by final_cost_nestloop
2410 * 'jointype' is the type of join to be performed
2411 * 'outer_path' is the outer input to the join
2412 * 'inner_path' is the inner input to the join
2413 * 'extra' contains miscellaneous information about the join
2414 */
2415 void
initial_cost_nestloop(PlannerInfo * root,JoinCostWorkspace * workspace,JoinType jointype,Path * outer_path,Path * inner_path,JoinPathExtraData * extra)2416 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2417 JoinType jointype,
2418 Path *outer_path, Path *inner_path,
2419 JoinPathExtraData *extra)
2420 {
2421 Cost startup_cost = 0;
2422 Cost run_cost = 0;
2423 double outer_path_rows = outer_path->rows;
2424 Cost inner_rescan_start_cost;
2425 Cost inner_rescan_total_cost;
2426 Cost inner_run_cost;
2427 Cost inner_rescan_run_cost;
2428
2429 /* estimate costs to rescan the inner relation */
2430 cost_rescan(root, inner_path,
2431 &inner_rescan_start_cost,
2432 &inner_rescan_total_cost);
2433
2434 /* cost of source data */
2435
2436 /*
2437 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2438 * before we can start returning tuples, so the join's startup cost is
2439 * their sum. We'll also pay the inner path's rescan startup cost
2440 * multiple times.
2441 */
2442 startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2443 run_cost += outer_path->total_cost - outer_path->startup_cost;
2444 if (outer_path_rows > 1)
2445 run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2446
2447 inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2448 inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2449
2450 if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2451 extra->inner_unique)
2452 {
2453 /*
2454 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2455 * executor will stop after the first match.
2456 *
2457 * Getting decent estimates requires inspection of the join quals,
2458 * which we choose to postpone to final_cost_nestloop.
2459 */
2460
2461 /* Save private data for final_cost_nestloop */
2462 workspace->inner_run_cost = inner_run_cost;
2463 workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2464 }
2465 else
2466 {
2467 /* Normal case; we'll scan whole input rel for each outer row */
2468 run_cost += inner_run_cost;
2469 if (outer_path_rows > 1)
2470 run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2471 }
2472
2473 /* CPU costs left for later */
2474
2475 /* Public result fields */
2476 workspace->startup_cost = startup_cost;
2477 workspace->total_cost = startup_cost + run_cost;
2478 /* Save private data for final_cost_nestloop */
2479 workspace->run_cost = run_cost;
2480 }
2481
2482 /*
2483 * final_cost_nestloop
2484 * Final estimate of the cost and result size of a nestloop join path.
2485 *
2486 * 'path' is already filled in except for the rows and cost fields
2487 * 'workspace' is the result from initial_cost_nestloop
2488 * 'extra' contains miscellaneous information about the join
2489 */
void
final_cost_nestloop(PlannerInfo *root, NestPath *path,
					JoinCostWorkspace *workspace,
					JoinPathExtraData *extra)
{
	Path	   *outer_path = path->outerjoinpath;
	Path	   *inner_path = path->innerjoinpath;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	Cost		startup_cost = workspace->startup_cost;
	Cost		run_cost = workspace->run_cost;
	Cost		cpu_per_tuple;
	QualCost	restrict_qual_cost;
	double		ntuples;		/* number of tuples processed, not emitted */

	/* Protect some assumptions below that rowcounts aren't zero or NaN */
	if (outer_path_rows <= 0 || isnan(outer_path_rows))
		outer_path_rows = 1;
	if (inner_path_rows <= 0 || isnan(inner_path_rows))
		inner_path_rows = 1;

	/* Mark the path with the correct row estimate */
	if (path->path.param_info)
		path->path.rows = path->path.param_info->ppi_rows;
	else
		path->path.rows = path->path.parent->rows;

	/* For partial paths, scale row estimate. */
	if (path->path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->path);

		path->path.rows =
			clamp_row_est(path->path.rows / parallel_divisor);
	}

	/*
	 * We could include disable_cost in the preliminary estimate, but that
	 * would amount to optimizing for the case where the join method is
	 * disabled, which doesn't seem like the way to bet.
	 */
	if (!enable_nestloop)
		startup_cost += disable_cost;

	/* cost of inner-relation source data (we already dealt with outer rel) */

	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
		extra->inner_unique)
	{
		/*
		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
		 * executor will stop after the first match.
		 */
		Cost		inner_run_cost = workspace->inner_run_cost;
		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
		double		outer_matched_rows;
		double		outer_unmatched_rows;
		Selectivity inner_scan_frac;

		/*
		 * For an outer-rel row that has at least one match, we can expect the
		 * inner scan to stop after a fraction 1/(match_count+1) of the inner
		 * rows, if the matches are evenly distributed. Since they probably
		 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
		 * that fraction. (If we used a larger fuzz factor, we'd have to
		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
		 * least 1, no such clamp is needed now.)
		 */
		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
		outer_unmatched_rows = outer_path_rows - outer_matched_rows;
		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

		/*
		 * Compute number of tuples processed (not number emitted!). First,
		 * account for successfully-matched outer rows.
		 */
		ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;

		/*
		 * Now we need to estimate the actual costs of scanning the inner
		 * relation, which may be quite a bit less than N times inner_run_cost
		 * due to early scan stops. We consider two cases. If the inner path
		 * is an indexscan using all the joinquals as indexquals, then an
		 * unmatched outer row results in an indexscan returning no rows,
		 * which is probably quite cheap. Otherwise, the executor will have
		 * to scan the whole inner rel for an unmatched row; not so cheap.
		 */
		if (has_indexed_join_quals(path))
		{
			/*
			 * Successfully-matched outer rows will only require scanning
			 * inner_scan_frac of the inner relation. In this case, we don't
			 * need to charge the full inner_run_cost even when that's more
			 * than inner_rescan_run_cost, because we can assume that none of
			 * the inner scans ever scan the whole inner relation. So it's
			 * okay to assume that all the inner scan executions can be
			 * fractions of the full cost, even if materialization is reducing
			 * the rescan cost. At this writing, it's impossible to get here
			 * for a materialized inner scan, so inner_run_cost and
			 * inner_rescan_run_cost will be the same anyway; but just in
			 * case, use inner_run_cost for the first matched tuple and
			 * inner_rescan_run_cost for additional ones.
			 */
			run_cost += inner_run_cost * inner_scan_frac;
			if (outer_matched_rows > 1)
				run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;

			/*
			 * Add the cost of inner-scan executions for unmatched outer rows.
			 * We estimate this as the same cost as returning the first tuple
			 * of a nonempty scan.  We consider that these are all rescans,
			 * since we used inner_run_cost once already.
			 */
			run_cost += outer_unmatched_rows *
				inner_rescan_run_cost / inner_path_rows;

			/*
			 * We won't be evaluating any quals at all for unmatched rows, so
			 * don't add them to ntuples.
			 */
		}
		else
		{
			/*
			 * Here, a complicating factor is that rescans may be cheaper than
			 * first scans. If we never scan all the way to the end of the
			 * inner rel, it might be (depending on the plan type) that we'd
			 * never pay the whole inner first-scan run cost. However it is
			 * difficult to estimate whether that will happen (and it could
			 * not happen if there are any unmatched outer rows!), so be
			 * conservative and always charge the whole first-scan cost once.
			 * We consider this charge to correspond to the first unmatched
			 * outer row, unless there isn't one in our estimate, in which
			 * case blame it on the first matched row.
			 */

			/*
			 * First, count all unmatched join tuples as being processed
			 * (added on top of the matched-rows count computed above).
			 */
			ntuples += outer_unmatched_rows * inner_path_rows;

			/* Now add the forced full scan, and decrement appropriate count */
			run_cost += inner_run_cost;
			if (outer_unmatched_rows >= 1)
				outer_unmatched_rows -= 1;
			else
				outer_matched_rows -= 1;

			/* Add inner run cost for additional outer tuples having matches */
			if (outer_matched_rows > 0)
				run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;

			/* Add inner run cost for additional unmatched outer tuples */
			if (outer_unmatched_rows > 0)
				run_cost += outer_unmatched_rows * inner_rescan_run_cost;
		}
	}
	else
	{
		/* Normal-case source costs were included in preliminary estimate */

		/* Compute number of tuples processed (not number emitted!) */
		ntuples = outer_path_rows * inner_path_rows;
	}

	/* CPU costs */
	cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
	startup_cost += restrict_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * ntuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->path.pathtarget->cost.startup;
	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;

	path->path.startup_cost = startup_cost;
	path->path.total_cost = startup_cost + run_cost;
}
2666
/*
 * initial_cost_mergejoin
 *	  Preliminary estimate of the cost of a mergejoin path.
 *
 * This must quickly produce lower-bound estimates of the path's startup and
 * total costs.  If we are unable to eliminate the proposed path from
 * consideration using the lower bounds, final_cost_mergejoin will be called
 * to obtain the final estimates.
 *
 * The exact division of labor between this function and final_cost_mergejoin
 * is private to them, and represents a tradeoff between speed of the initial
 * estimate and getting a tight lower bound.  We choose to not examine the
 * join quals here, except for obtaining the scan selectivity estimate which
 * is really essential (but fortunately, use of caching keeps the cost of
 * getting that down to something reasonable).
 * We also assume that cost_sort is cheap enough to use here.
 *
 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
 *		other data to be used by final_cost_mergejoin
 * 'jointype' is the type of join to be performed
 * 'mergeclauses' is the list of joinclauses to be used as merge clauses
 * 'outer_path' is the outer input to the join
 * 'inner_path' is the inner input to the join
 * 'outersortkeys' is the list of sort keys for the outer path
 * 'innersortkeys' is the list of sort keys for the inner path
 * 'extra' contains miscellaneous information about the join
 *
 * Note: outersortkeys and innersortkeys should be NIL if no explicit
 * sort is needed because the respective source path is already ordered.
 */
void
initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
					   JoinType jointype,
					   List *mergeclauses,
					   Path *outer_path, Path *inner_path,
					   List *outersortkeys, List *innersortkeys,
					   JoinPathExtraData *extra)
{
	Cost		startup_cost = 0;
	Cost		run_cost = 0;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	Cost		inner_run_cost;
	double		outer_rows,
				inner_rows,
				outer_skip_rows,
				inner_skip_rows;
	Selectivity outerstartsel,
				outerendsel,
				innerstartsel,
				innerendsel;
	Path		sort_path;		/* dummy for result of cost_sort */

	/* Protect some assumptions below that rowcounts aren't zero or NaN */
	if (outer_path_rows <= 0 || isnan(outer_path_rows))
		outer_path_rows = 1;
	if (inner_path_rows <= 0 || isnan(inner_path_rows))
		inner_path_rows = 1;

	/*
	 * A merge join will stop as soon as it exhausts either input stream
	 * (unless it's an outer join, in which case the outer side has to be
	 * scanned all the way anyway).  Estimate fraction of the left and right
	 * inputs that will actually need to be scanned.  Likewise, we can
	 * estimate the number of rows that will be skipped before the first join
	 * pair is found, which should be factored into startup cost. We use only
	 * the first (most significant) merge clause for this purpose. Since
	 * mergejoinscansel() is a fairly expensive computation, we cache the
	 * results in the merge clause RestrictInfo.
	 */
	if (mergeclauses && jointype != JOIN_FULL)
	{
		RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
		List	   *opathkeys;
		List	   *ipathkeys;
		PathKey    *opathkey;
		PathKey    *ipathkey;
		MergeScanSelCache *cache;

		/* Get the input pathkeys to determine the sort-order details */
		opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
		ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
		Assert(opathkeys);
		Assert(ipathkeys);
		opathkey = (PathKey *) linitial(opathkeys);
		ipathkey = (PathKey *) linitial(ipathkeys);
		/* debugging check: both sides must agree on the merge ordering */
		if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
			opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
			opathkey->pk_strategy != ipathkey->pk_strategy ||
			opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
			elog(ERROR, "left and right pathkeys do not match in mergejoin");

		/* Get the selectivity with caching */
		cache = cached_scansel(root, firstclause, opathkey);

		/*
		 * The cache stores selectivities for the clause's left and right
		 * sides; map them onto outer/inner depending on which side of the
		 * clause refers to the outer relation.
		 */
		if (bms_is_subset(firstclause->left_relids,
						  outer_path->parent->relids))
		{
			/* left side of clause is outer */
			outerstartsel = cache->leftstartsel;
			outerendsel = cache->leftendsel;
			innerstartsel = cache->rightstartsel;
			innerendsel = cache->rightendsel;
		}
		else
		{
			/* left side of clause is inner */
			outerstartsel = cache->rightstartsel;
			outerendsel = cache->rightendsel;
			innerstartsel = cache->leftstartsel;
			innerendsel = cache->leftendsel;
		}

		/*
		 * An outer join forces its non-nullable input to be scanned in full,
		 * so override the selectivities for that side.
		 */
		if (jointype == JOIN_LEFT ||
			jointype == JOIN_ANTI)
		{
			outerstartsel = 0.0;
			outerendsel = 1.0;
		}
		else if (jointype == JOIN_RIGHT)
		{
			innerstartsel = 0.0;
			innerendsel = 1.0;
		}
	}
	else
	{
		/* cope with clauseless or full mergejoin: scan both inputs fully */
		outerstartsel = innerstartsel = 0.0;
		outerendsel = innerendsel = 1.0;
	}

	/*
	 * Convert selectivities to row counts.  We force outer_rows and
	 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
	 */
	outer_skip_rows = rint(outer_path_rows * outerstartsel);
	inner_skip_rows = rint(inner_path_rows * innerstartsel);
	outer_rows = clamp_row_est(outer_path_rows * outerendsel);
	inner_rows = clamp_row_est(inner_path_rows * innerendsel);

	/* skip rows can become NaN when path rows has become infinite */
	Assert(outer_skip_rows <= outer_rows || isnan(outer_skip_rows));
	Assert(inner_skip_rows <= inner_rows || isnan(inner_skip_rows));

	/*
	 * Readjust scan selectivities to account for above rounding.  This is
	 * normally an insignificant effect, but when there are only a few rows in
	 * the inputs, failing to do this makes for a large percentage error.
	 */
	outerstartsel = outer_skip_rows / outer_path_rows;
	innerstartsel = inner_skip_rows / inner_path_rows;
	outerendsel = outer_rows / outer_path_rows;
	innerendsel = inner_rows / inner_path_rows;

	/* start sel can become NaN when path rows has become infinite */
	Assert(outerstartsel <= outerendsel || isnan(outerstartsel));
	Assert(innerstartsel <= innerendsel || isnan(innerstartsel));

	/*
	 * cost of source data
	 *
	 * In each branch below, the fraction of the input up through startsel
	 * must be paid before the join can emit anything, so it's startup cost;
	 * only the startsel-to-endsel fraction is charged as run cost.
	 */

	if (outersortkeys)			/* do we need to sort outer? */
	{
		cost_sort(&sort_path,
				  root,
				  outersortkeys,
				  outer_path->total_cost,
				  outer_path_rows,
				  outer_path->pathtarget->width,
				  0.0,
				  work_mem,
				  -1.0);
		startup_cost += sort_path.startup_cost;
		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
			* outerstartsel;
		run_cost += (sort_path.total_cost - sort_path.startup_cost)
			* (outerendsel - outerstartsel);
	}
	else
	{
		startup_cost += outer_path->startup_cost;
		startup_cost += (outer_path->total_cost - outer_path->startup_cost)
			* outerstartsel;
		run_cost += (outer_path->total_cost - outer_path->startup_cost)
			* (outerendsel - outerstartsel);
	}

	if (innersortkeys)			/* do we need to sort inner? */
	{
		cost_sort(&sort_path,
				  root,
				  innersortkeys,
				  inner_path->total_cost,
				  inner_path_rows,
				  inner_path->pathtarget->width,
				  0.0,
				  work_mem,
				  -1.0);
		startup_cost += sort_path.startup_cost;
		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
			* innerstartsel;
		inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
			* (innerendsel - innerstartsel);
	}
	else
	{
		startup_cost += inner_path->startup_cost;
		startup_cost += (inner_path->total_cost - inner_path->startup_cost)
			* innerstartsel;
		inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
			* (innerendsel - innerstartsel);
	}

	/*
	 * We can't yet determine whether rescanning occurs, or whether
	 * materialization of the inner input should be done.  The minimum
	 * possible inner input cost, regardless of rescan and materialization
	 * considerations, is inner_run_cost.  We include that in
	 * workspace->total_cost, but not yet in run_cost.
	 */

	/* CPU costs left for later */

	/* Public result fields */
	workspace->startup_cost = startup_cost;
	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
	/* Save private data for final_cost_mergejoin */
	workspace->run_cost = run_cost;
	workspace->inner_run_cost = inner_run_cost;
	workspace->outer_rows = outer_rows;
	workspace->inner_rows = inner_rows;
	workspace->outer_skip_rows = outer_skip_rows;
	workspace->inner_skip_rows = inner_skip_rows;
}
2901
/*
 * final_cost_mergejoin
 *	  Final estimate of the cost and result size of a mergejoin path.
 *
 * Unlike other costsize functions, this routine makes two actual decisions:
 * whether the executor will need to do mark/restore, and whether we should
 * materialize the inner path.  It would be logically cleaner to build
 * separate paths testing these alternatives, but that would require repeating
 * most of the cost calculations, which are not all that cheap.  Since the
 * choice will not affect output pathkeys or startup cost, only total cost,
 * there is no possibility of wanting to keep more than one path.  So it seems
 * best to make the decisions here and record them in the path's
 * skip_mark_restore and materialize_inner fields.
 *
 * Mark/restore overhead is usually required, but can be skipped if we know
 * that the executor need find only one match per outer tuple, and that the
 * mergeclauses are sufficient to identify a match.
 *
 * We materialize the inner path if we need mark/restore and either the inner
 * path can't support mark/restore, or it's cheaper to use an interposed
 * Material node to handle mark/restore.
 *
 * 'path' is already filled in except for the rows and cost fields and
 *		skip_mark_restore and materialize_inner
 * 'workspace' is the result from initial_cost_mergejoin
 * 'extra' contains miscellaneous information about the join
 */
void
final_cost_mergejoin(PlannerInfo *root, MergePath *path,
					 JoinCostWorkspace *workspace,
					 JoinPathExtraData *extra)
{
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	double		inner_path_rows = inner_path->rows;
	List	   *mergeclauses = path->path_mergeclauses;
	List	   *innersortkeys = path->innersortkeys;
	Cost		startup_cost = workspace->startup_cost;
	Cost		run_cost = workspace->run_cost;
	Cost		inner_run_cost = workspace->inner_run_cost;
	double		outer_rows = workspace->outer_rows;
	double		inner_rows = workspace->inner_rows;
	double		outer_skip_rows = workspace->outer_skip_rows;
	double		inner_skip_rows = workspace->inner_skip_rows;
	Cost		cpu_per_tuple,
				bare_inner_cost,
				mat_inner_cost;
	QualCost	merge_qual_cost;
	QualCost	qp_qual_cost;
	double		mergejointuples,
				rescannedtuples;
	double		rescanratio;

	/* Protect some assumptions below that rowcounts aren't zero or NaN */
	if (inner_path_rows <= 0 || isnan(inner_path_rows))
		inner_path_rows = 1;

	/* Mark the path with the correct row estimate */
	if (path->jpath.path.param_info)
		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
	else
		path->jpath.path.rows = path->jpath.path.parent->rows;

	/* For partial paths, scale row estimate. */
	if (path->jpath.path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);

		path->jpath.path.rows =
			clamp_row_est(path->jpath.path.rows / parallel_divisor);
	}

	/*
	 * We could include disable_cost in the preliminary estimate, but that
	 * would amount to optimizing for the case where the join method is
	 * disabled, which doesn't seem like the way to bet.
	 */
	if (!enable_mergejoin)
		startup_cost += disable_cost;

	/*
	 * Compute cost of the mergequals and qpquals (other restriction clauses)
	 * separately.  Subtracting the mergequal cost from the total
	 * joinrestrictinfo cost leaves just the qpqual cost, so the merge quals
	 * aren't charged twice below.
	 */
	cost_qual_eval(&merge_qual_cost, mergeclauses, root);
	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
	qp_qual_cost.startup -= merge_qual_cost.startup;
	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;

	/*
	 * With a SEMI or ANTI join, or if the innerrel is known unique, the
	 * executor will stop scanning for matches after the first match.  When
	 * all the joinclauses are merge clauses, this means we don't ever need to
	 * back up the merge, and so we can skip mark/restore overhead.
	 */
	if ((path->jpath.jointype == JOIN_SEMI ||
		 path->jpath.jointype == JOIN_ANTI ||
		 extra->inner_unique) &&
		(list_length(path->jpath.joinrestrictinfo) ==
		 list_length(path->path_mergeclauses)))
		path->skip_mark_restore = true;
	else
		path->skip_mark_restore = false;

	/*
	 * Get approx # tuples passing the mergequals.  We use approx_tuple_count
	 * here because we need an estimate done with JOIN_INNER semantics.
	 */
	mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);

	/*
	 * When there are equal merge keys in the outer relation, the mergejoin
	 * must rescan any matching tuples in the inner relation.  This means
	 * re-fetching inner tuples; we have to estimate how often that happens.
	 *
	 * For regular inner and outer joins, the number of re-fetches can be
	 * estimated approximately as size of merge join output minus size of
	 * inner relation.  Assume that the distinct key values are 1, 2, ..., and
	 * denote the number of values of each key in the outer relation as m1,
	 * m2, ...; in the inner relation, n1, n2, ...  Then we have
	 *
	 * size of join = m1 * n1 + m2 * n2 + ...
	 *
	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
	 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
	 * relation
	 *
	 * This equation works correctly for outer tuples having no inner match
	 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
	 * are effectively subtracting those from the number of rescanned tuples,
	 * when we should not.  Can we do better without expensive selectivity
	 * computations?
	 *
	 * The whole issue is moot if we are working from a unique-ified outer
	 * input, or if we know we don't need to mark/restore at all.
	 */
	if (IsA(outer_path, UniquePath) ||path->skip_mark_restore)
		rescannedtuples = 0;
	else
	{
		rescannedtuples = mergejointuples - inner_path_rows;
		/* Must clamp because of possible underestimate */
		if (rescannedtuples < 0)
			rescannedtuples = 0;
	}

	/*
	 * We'll inflate various costs this much to account for rescanning.  Note
	 * that this is to be multiplied by something involving inner_rows, or
	 * another number related to the portion of the inner rel we'll scan.
	 */
	rescanratio = 1.0 + (rescannedtuples / inner_rows);

	/*
	 * Decide whether we want to materialize the inner input to shield it from
	 * mark/restore and performing re-fetches.  Our cost model for regular
	 * re-fetches is that a re-fetch costs the same as an original fetch,
	 * which is probably an overestimate; but on the other hand we ignore the
	 * bookkeeping costs of mark/restore.  Not clear if it's worth developing
	 * a more refined model.  So we just need to inflate the inner run cost by
	 * rescanratio.
	 */
	bare_inner_cost = inner_run_cost * rescanratio;

	/*
	 * When we interpose a Material node the re-fetch cost is assumed to be
	 * just cpu_operator_cost per tuple, independently of the underlying
	 * plan's cost; and we charge an extra cpu_operator_cost per original
	 * fetch as well.  Note that we're assuming the materialize node will
	 * never spill to disk, since it only has to remember tuples back to the
	 * last mark.  (If there are a huge number of duplicates, our other cost
	 * factors will make the path so expensive that it probably won't get
	 * chosen anyway.)  So we don't use cost_rescan here.
	 *
	 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
	 * of the generated Material node.
	 */
	mat_inner_cost = inner_run_cost +
		cpu_operator_cost * inner_rows * rescanratio;

	/*
	 * If we don't need mark/restore at all, we don't need materialization.
	 */
	if (path->skip_mark_restore)
		path->materialize_inner = false;

	/*
	 * Prefer materializing if it looks cheaper, unless the user has asked to
	 * suppress materialization.
	 */
	else if (enable_material && mat_inner_cost < bare_inner_cost)
		path->materialize_inner = true;

	/*
	 * Even if materializing doesn't look cheaper, we *must* do it if the
	 * inner path is to be used directly (without sorting) and it doesn't
	 * support mark/restore.
	 *
	 * Since the inner side must be ordered, and only Sorts and IndexScans can
	 * create order to begin with, and they both support mark/restore, you
	 * might think there's no problem --- but you'd be wrong.  Nestloop and
	 * merge joins can *preserve* the order of their inputs, so they can be
	 * selected as the input of a mergejoin, and they don't support
	 * mark/restore at present.
	 *
	 * We don't test the value of enable_material here, because
	 * materialization is required for correctness in this case, and turning
	 * it off does not entitle us to deliver an invalid plan.
	 */
	else if (innersortkeys == NIL &&
			 !ExecSupportsMarkRestore(inner_path))
		path->materialize_inner = true;

	/*
	 * Also, force materializing if the inner path is to be sorted and the
	 * sort is expected to spill to disk.  This is because the final merge
	 * pass can be done on-the-fly if it doesn't have to support mark/restore.
	 * We don't try to adjust the cost estimates for this consideration,
	 * though.
	 *
	 * Since materialization is a performance optimization in this case,
	 * rather than necessary for correctness, we skip it if enable_material is
	 * off.
	 */
	else if (enable_material && innersortkeys != NIL &&
			 relation_byte_size(inner_path_rows,
								inner_path->pathtarget->width) >
			 (work_mem * 1024L))	/* work_mem is in kB; compare in bytes */
		path->materialize_inner = true;
	else
		path->materialize_inner = false;

	/* Charge the right incremental cost for the chosen case */
	if (path->materialize_inner)
		run_cost += mat_inner_cost;
	else
		run_cost += bare_inner_cost;

	/* CPU costs */

	/*
	 * The number of tuple comparisons needed is approximately number of outer
	 * rows plus number of inner rows plus number of rescanned tuples (can we
	 * refine this?).  At each one, we need to evaluate the mergejoin quals.
	 */
	startup_cost += merge_qual_cost.startup;
	startup_cost += merge_qual_cost.per_tuple *
		(outer_skip_rows + inner_skip_rows * rescanratio);
	run_cost += merge_qual_cost.per_tuple *
		((outer_rows - outer_skip_rows) +
		 (inner_rows - inner_skip_rows) * rescanratio);

	/*
	 * For each tuple that gets through the mergejoin proper, we charge
	 * cpu_tuple_cost plus the cost of evaluating additional restriction
	 * clauses that are to be applied at the join.  (This is pessimistic since
	 * not all of the quals may get evaluated at each tuple.)
	 *
	 * Note: we could adjust for SEMI/ANTI joins skipping some qual
	 * evaluations here, but it's probably not worth the trouble.
	 */
	startup_cost += qp_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * mergejointuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->jpath.path.pathtarget->cost.startup;
	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;

	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
}
3174
3175 /*
3176 * run mergejoinscansel() with caching
3177 */
3178 static MergeScanSelCache *
cached_scansel(PlannerInfo * root,RestrictInfo * rinfo,PathKey * pathkey)3179 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3180 {
3181 MergeScanSelCache *cache;
3182 ListCell *lc;
3183 Selectivity leftstartsel,
3184 leftendsel,
3185 rightstartsel,
3186 rightendsel;
3187 MemoryContext oldcontext;
3188
3189 /* Do we have this result already? */
3190 foreach(lc, rinfo->scansel_cache)
3191 {
3192 cache = (MergeScanSelCache *) lfirst(lc);
3193 if (cache->opfamily == pathkey->pk_opfamily &&
3194 cache->collation == pathkey->pk_eclass->ec_collation &&
3195 cache->strategy == pathkey->pk_strategy &&
3196 cache->nulls_first == pathkey->pk_nulls_first)
3197 return cache;
3198 }
3199
3200 /* Nope, do the computation */
3201 mergejoinscansel(root,
3202 (Node *) rinfo->clause,
3203 pathkey->pk_opfamily,
3204 pathkey->pk_strategy,
3205 pathkey->pk_nulls_first,
3206 &leftstartsel,
3207 &leftendsel,
3208 &rightstartsel,
3209 &rightendsel);
3210
3211 /* Cache the result in suitably long-lived workspace */
3212 oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3213
3214 cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3215 cache->opfamily = pathkey->pk_opfamily;
3216 cache->collation = pathkey->pk_eclass->ec_collation;
3217 cache->strategy = pathkey->pk_strategy;
3218 cache->nulls_first = pathkey->pk_nulls_first;
3219 cache->leftstartsel = leftstartsel;
3220 cache->leftendsel = leftendsel;
3221 cache->rightstartsel = rightstartsel;
3222 cache->rightendsel = rightendsel;
3223
3224 rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3225
3226 MemoryContextSwitchTo(oldcontext);
3227
3228 return cache;
3229 }
3230
3231 /*
3232 * initial_cost_hashjoin
3233 * Preliminary estimate of the cost of a hashjoin path.
3234 *
3235 * This must quickly produce lower-bound estimates of the path's startup and
3236 * total costs. If we are unable to eliminate the proposed path from
3237 * consideration using the lower bounds, final_cost_hashjoin will be called
3238 * to obtain the final estimates.
3239 *
3240 * The exact division of labor between this function and final_cost_hashjoin
3241 * is private to them, and represents a tradeoff between speed of the initial
3242 * estimate and getting a tight lower bound. We choose to not examine the
3243 * join quals here (other than by counting the number of hash clauses),
3244 * so we can't do much with CPU costs. We do assume that
3245 * ExecChooseHashTableSize is cheap enough to use here.
3246 *
3247 * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3248 * other data to be used by final_cost_hashjoin
3249 * 'jointype' is the type of join to be performed
3250 * 'hashclauses' is the list of joinclauses to be used as hash clauses
3251 * 'outer_path' is the outer input to the join
3252 * 'inner_path' is the inner input to the join
3253 * 'extra' contains miscellaneous information about the join
3254 * 'parallel_hash' indicates that inner_path is partial and that a shared
3255 * hash table will be built in parallel
3256 */
3257 void
initial_cost_hashjoin(PlannerInfo * root,JoinCostWorkspace * workspace,JoinType jointype,List * hashclauses,Path * outer_path,Path * inner_path,JoinPathExtraData * extra,bool parallel_hash)3258 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3259 JoinType jointype,
3260 List *hashclauses,
3261 Path *outer_path, Path *inner_path,
3262 JoinPathExtraData *extra,
3263 bool parallel_hash)
3264 {
3265 Cost startup_cost = 0;
3266 Cost run_cost = 0;
3267 double outer_path_rows = outer_path->rows;
3268 double inner_path_rows = inner_path->rows;
3269 double inner_path_rows_total = inner_path_rows;
3270 int num_hashclauses = list_length(hashclauses);
3271 int numbuckets;
3272 int numbatches;
3273 int num_skew_mcvs;
3274 size_t space_allowed; /* unused */
3275
3276 /* cost of source data */
3277 startup_cost += outer_path->startup_cost;
3278 run_cost += outer_path->total_cost - outer_path->startup_cost;
3279 startup_cost += inner_path->total_cost;
3280
3281 /*
3282 * Cost of computing hash function: must do it once per input tuple. We
3283 * charge one cpu_operator_cost for each column's hash function. Also,
3284 * tack on one cpu_tuple_cost per inner row, to model the costs of
3285 * inserting the row into the hashtable.
3286 *
3287 * XXX when a hashclause is more complex than a single operator, we really
3288 * should charge the extra eval costs of the left or right side, as
3289 * appropriate, here. This seems more work than it's worth at the moment.
3290 */
3291 startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3292 * inner_path_rows;
3293 run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
3294
3295 /*
3296 * If this is a parallel hash build, then the value we have for
3297 * inner_rows_total currently refers only to the rows returned by each
3298 * participant. For shared hash table size estimation, we need the total
3299 * number, so we need to undo the division.
3300 */
3301 if (parallel_hash)
3302 inner_path_rows_total *= get_parallel_divisor(inner_path);
3303
3304 /*
3305 * Get hash table size that executor would use for inner relation.
3306 *
3307 * XXX for the moment, always assume that skew optimization will be
3308 * performed. As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
3309 * trying to determine that for sure.
3310 *
3311 * XXX at some point it might be interesting to try to account for skew
3312 * optimization in the cost estimate, but for now, we don't.
3313 */
3314 ExecChooseHashTableSize(inner_path_rows_total,
3315 inner_path->pathtarget->width,
3316 true, /* useskew */
3317 parallel_hash, /* try_combined_work_mem */
3318 outer_path->parallel_workers,
3319 &space_allowed,
3320 &numbuckets,
3321 &numbatches,
3322 &num_skew_mcvs);
3323
3324 /*
3325 * If inner relation is too big then we will need to "batch" the join,
3326 * which implies writing and reading most of the tuples to disk an extra
3327 * time. Charge seq_page_cost per page, since the I/O should be nice and
3328 * sequential. Writing the inner rel counts as startup cost, all the rest
3329 * as run cost.
3330 */
3331 if (numbatches > 1)
3332 {
3333 double outerpages = page_size(outer_path_rows,
3334 outer_path->pathtarget->width);
3335 double innerpages = page_size(inner_path_rows,
3336 inner_path->pathtarget->width);
3337
3338 startup_cost += seq_page_cost * innerpages;
3339 run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3340 }
3341
3342 /* CPU costs left for later */
3343
3344 /* Public result fields */
3345 workspace->startup_cost = startup_cost;
3346 workspace->total_cost = startup_cost + run_cost;
3347 /* Save private data for final_cost_hashjoin */
3348 workspace->run_cost = run_cost;
3349 workspace->numbuckets = numbuckets;
3350 workspace->numbatches = numbatches;
3351 workspace->inner_rows_total = inner_path_rows_total;
3352 }
3353
3354 /*
3355 * final_cost_hashjoin
3356 * Final estimate of the cost and result size of a hashjoin path.
3357 *
3358 * Note: the numbatches estimate is also saved into 'path' for use later
3359 *
3360 * 'path' is already filled in except for the rows and cost fields and
3361 * num_batches
3362 * 'workspace' is the result from initial_cost_hashjoin
3363 * 'extra' contains miscellaneous information about the join
3364 */
3365 void
final_cost_hashjoin(PlannerInfo *root, HashPath *path,
					JoinCostWorkspace *workspace,
					JoinPathExtraData *extra)
{
	Path	   *outer_path = path->jpath.outerjoinpath;
	Path	   *inner_path = path->jpath.innerjoinpath;
	double		outer_path_rows = outer_path->rows;
	double		inner_path_rows = inner_path->rows;
	double		inner_path_rows_total = workspace->inner_rows_total;
	List	   *hashclauses = path->path_hashclauses;
	Cost		startup_cost = workspace->startup_cost;
	Cost		run_cost = workspace->run_cost;
	int			numbuckets = workspace->numbuckets;
	int			numbatches = workspace->numbatches;
	Cost		cpu_per_tuple;
	QualCost	hash_qual_cost;
	QualCost	qp_qual_cost;
	double		hashjointuples;
	double		virtualbuckets;
	Selectivity innerbucketsize;
	Selectivity innermcvfreq;
	ListCell   *hcl;

	/* Mark the path with the correct row estimate */
	if (path->jpath.path.param_info)
		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
	else
		path->jpath.path.rows = path->jpath.path.parent->rows;

	/* For partial paths, scale row estimate. */
	if (path->jpath.path.parallel_workers > 0)
	{
		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);

		path->jpath.path.rows =
			clamp_row_est(path->jpath.path.rows / parallel_divisor);
	}

	/*
	 * We could include disable_cost in the preliminary estimate, but that
	 * would amount to optimizing for the case where the join method is
	 * disabled, which doesn't seem like the way to bet.
	 */
	if (!enable_hashjoin)
		startup_cost += disable_cost;

	/* mark the path with estimated # of batches */
	path->num_batches = numbatches;

	/* store the total number of tuples (sum of partial row estimates) */
	path->inner_rows_total = inner_path_rows_total;

	/* and compute the number of "virtual" buckets in the whole join */
	virtualbuckets = (double) numbuckets * (double) numbatches;

	/*
	 * Determine bucketsize fraction and MCV frequency for the inner relation.
	 * We use the smallest bucketsize or MCV frequency estimated for any
	 * individual hashclause; this is undoubtedly conservative.
	 *
	 * BUT: if inner relation has been unique-ified, we can assume it's good
	 * for hashing.  This is important both because it's the right answer, and
	 * because we avoid contaminating the cache with a value that's wrong for
	 * non-unique-ified paths.
	 */
	if (IsA(inner_path, UniquePath))
	{
		/* unique-ified: assume perfectly even distribution over buckets */
		innerbucketsize = 1.0 / virtualbuckets;
		innermcvfreq = 0.0;
	}
	else
	{
		/* start from worst case and take the minimum over all hashclauses */
		innerbucketsize = 1.0;
		innermcvfreq = 1.0;
		foreach(hcl, hashclauses)
		{
			RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
			Selectivity thisbucketsize;
			Selectivity thismcvfreq;

			/*
			 * First we have to figure out which side of the hashjoin clause
			 * is the inner side.
			 *
			 * Since we tend to visit the same clauses over and over when
			 * planning a large query, we cache the bucket stats estimates in
			 * the RestrictInfo node to avoid repeated lookups of statistics.
			 */
			if (bms_is_subset(restrictinfo->right_relids,
							  inner_path->parent->relids))
			{
				/* righthand side is inner */
				thisbucketsize = restrictinfo->right_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet; negative startup value marks "unset" */
					estimate_hash_bucket_stats(root,
											   get_rightop(restrictinfo->clause),
											   virtualbuckets,
											   &restrictinfo->right_mcvfreq,
											   &restrictinfo->right_bucketsize);
					thisbucketsize = restrictinfo->right_bucketsize;
				}
				thismcvfreq = restrictinfo->right_mcvfreq;
			}
			else
			{
				Assert(bms_is_subset(restrictinfo->left_relids,
									 inner_path->parent->relids));
				/* lefthand side is inner */
				thisbucketsize = restrictinfo->left_bucketsize;
				if (thisbucketsize < 0)
				{
					/* not cached yet */
					estimate_hash_bucket_stats(root,
											   get_leftop(restrictinfo->clause),
											   virtualbuckets,
											   &restrictinfo->left_mcvfreq,
											   &restrictinfo->left_bucketsize);
					thisbucketsize = restrictinfo->left_bucketsize;
				}
				thismcvfreq = restrictinfo->left_mcvfreq;
			}

			/* track the minimum estimates across all clauses */
			if (innerbucketsize > thisbucketsize)
				innerbucketsize = thisbucketsize;
			if (innermcvfreq > thismcvfreq)
				innermcvfreq = thismcvfreq;
		}
	}

	/*
	 * If the bucket holding the inner MCV would exceed work_mem, we don't
	 * want to hash unless there is really no other alternative, so apply
	 * disable_cost.  (The executor normally copes with excessive memory usage
	 * by splitting batches, but obviously it cannot separate equal values
	 * that way, so it will be unable to drive the batch size below work_mem
	 * when this is true.)
	 */
	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
						   inner_path->pathtarget->width) >
		(work_mem * 1024L))
		startup_cost += disable_cost;

	/*
	 * Compute cost of the hashquals and qpquals (other restriction clauses)
	 * separately.
	 */
	cost_qual_eval(&hash_qual_cost, hashclauses, root);
	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
	/* subtract the hashquals' share so it isn't double-charged below */
	qp_qual_cost.startup -= hash_qual_cost.startup;
	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;

	/* CPU costs */

	if (path->jpath.jointype == JOIN_SEMI ||
		path->jpath.jointype == JOIN_ANTI ||
		extra->inner_unique)
	{
		double		outer_matched_rows;
		Selectivity inner_scan_frac;

		/*
		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
		 * executor will stop after the first match.
		 *
		 * For an outer-rel row that has at least one match, we can expect the
		 * bucket scan to stop after a fraction 1/(match_count+1) of the
		 * bucket's rows, if the matches are evenly distributed.  Since they
		 * probably aren't quite evenly distributed, we apply a fuzz factor of
		 * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
		 * at least 1, no such clamp is needed now.)
		 */
		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);

		/* charge hashquals for matched outer rows; 0.5 discounts
		 * comparisons skipped because hash values don't match exactly */
		startup_cost += hash_qual_cost.startup;
		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
			clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;

		/*
		 * For unmatched outer-rel rows, the picture is quite a lot different.
		 * In the first place, there is no reason to assume that these rows
		 * preferentially hit heavily-populated buckets; instead assume they
		 * are uncorrelated with the inner distribution and so they see an
		 * average bucket size of inner_path_rows / virtualbuckets.  In the
		 * second place, it seems likely that they will have few if any exact
		 * hash-code matches and so very few of the tuples in the bucket will
		 * actually require eval of the hash quals.  We don't have any good
		 * way to estimate how many will, but for the moment assume that the
		 * effective cost per bucket entry is one-tenth what it is for
		 * matchable tuples.
		 */
		run_cost += hash_qual_cost.per_tuple *
			(outer_path_rows - outer_matched_rows) *
			clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;

		/* Get # of tuples that will pass the basic join */
		if (path->jpath.jointype == JOIN_ANTI)
			hashjointuples = outer_path_rows - outer_matched_rows;
		else
			hashjointuples = outer_matched_rows;
	}
	else
	{
		/*
		 * The number of tuple comparisons needed is the number of outer
		 * tuples times the typical number of tuples in a hash bucket, which
		 * is the inner relation size times its bucketsize fraction.  At each
		 * one, we need to evaluate the hashjoin quals.  But actually,
		 * charging the full qual eval cost at each tuple is pessimistic,
		 * since we don't evaluate the quals unless the hash values match
		 * exactly.  For lack of a better idea, halve the cost estimate to
		 * allow for that.
		 */
		startup_cost += hash_qual_cost.startup;
		run_cost += hash_qual_cost.per_tuple * outer_path_rows *
			clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;

		/*
		 * Get approx # tuples passing the hashquals.  We use
		 * approx_tuple_count here because we need an estimate done with
		 * JOIN_INNER semantics.
		 */
		hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
	}

	/*
	 * For each tuple that gets through the hashjoin proper, we charge
	 * cpu_tuple_cost plus the cost of evaluating additional restriction
	 * clauses that are to be applied at the join.  (This is pessimistic since
	 * not all of the quals may get evaluated at each tuple.)
	 */
	startup_cost += qp_qual_cost.startup;
	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
	run_cost += cpu_per_tuple * hashjointuples;

	/* tlist eval costs are paid per output row, not per tuple scanned */
	startup_cost += path->jpath.path.pathtarget->cost.startup;
	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;

	/* Store the final cost estimates back into the path */
	path->jpath.path.startup_cost = startup_cost;
	path->jpath.path.total_cost = startup_cost + run_cost;
}
3611
3612
3613 /*
3614 * cost_subplan
3615 * Figure the costs for a SubPlan (or initplan).
3616 *
3617 * Note: we could dig the subplan's Plan out of the root list, but in practice
3618 * all callers have it handy already, so we make them pass it.
3619 */
3620 void
cost_subplan(PlannerInfo * root,SubPlan * subplan,Plan * plan)3621 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3622 {
3623 QualCost sp_cost;
3624
3625 /* Figure any cost for evaluating the testexpr */
3626 cost_qual_eval(&sp_cost,
3627 make_ands_implicit((Expr *) subplan->testexpr),
3628 root);
3629
3630 if (subplan->useHashTable)
3631 {
3632 /*
3633 * If we are using a hash table for the subquery outputs, then the
3634 * cost of evaluating the query is a one-time cost. We charge one
3635 * cpu_operator_cost per tuple for the work of loading the hashtable,
3636 * too.
3637 */
3638 sp_cost.startup += plan->total_cost +
3639 cpu_operator_cost * plan->plan_rows;
3640
3641 /*
3642 * The per-tuple costs include the cost of evaluating the lefthand
3643 * expressions, plus the cost of probing the hashtable. We already
3644 * accounted for the lefthand expressions as part of the testexpr, and
3645 * will also have counted one cpu_operator_cost for each comparison
3646 * operator. That is probably too low for the probing cost, but it's
3647 * hard to make a better estimate, so live with it for now.
3648 */
3649 }
3650 else
3651 {
3652 /*
3653 * Otherwise we will be rescanning the subplan output on each
3654 * evaluation. We need to estimate how much of the output we will
3655 * actually need to scan. NOTE: this logic should agree with the
3656 * tuple_fraction estimates used by make_subplan() in
3657 * plan/subselect.c.
3658 */
3659 Cost plan_run_cost = plan->total_cost - plan->startup_cost;
3660
3661 if (subplan->subLinkType == EXISTS_SUBLINK)
3662 {
3663 /* we only need to fetch 1 tuple; clamp to avoid zero divide */
3664 sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3665 }
3666 else if (subplan->subLinkType == ALL_SUBLINK ||
3667 subplan->subLinkType == ANY_SUBLINK)
3668 {
3669 /* assume we need 50% of the tuples */
3670 sp_cost.per_tuple += 0.50 * plan_run_cost;
3671 /* also charge a cpu_operator_cost per row examined */
3672 sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3673 }
3674 else
3675 {
3676 /* assume we need all tuples */
3677 sp_cost.per_tuple += plan_run_cost;
3678 }
3679
3680 /*
3681 * Also account for subplan's startup cost. If the subplan is
3682 * uncorrelated or undirect correlated, AND its topmost node is one
3683 * that materializes its output, assume that we'll only need to pay
3684 * its startup cost once; otherwise assume we pay the startup cost
3685 * every time.
3686 */
3687 if (subplan->parParam == NIL &&
3688 ExecMaterializesOutput(nodeTag(plan)))
3689 sp_cost.startup += plan->startup_cost;
3690 else
3691 sp_cost.per_tuple += plan->startup_cost;
3692 }
3693
3694 subplan->startup_cost = sp_cost.startup;
3695 subplan->per_call_cost = sp_cost.per_tuple;
3696 }
3697
3698
3699 /*
3700 * cost_rescan
3701 * Given a finished Path, estimate the costs of rescanning it after
3702 * having done so the first time. For some Path types a rescan is
3703 * cheaper than an original scan (if no parameters change), and this
3704 * function embodies knowledge about that. The default is to return
3705 * the same costs stored in the Path. (Note that the cost estimates
3706 * actually stored in Paths are always for first scans.)
3707 *
3708 * This function is not currently intended to model effects such as rescans
3709 * being cheaper due to disk block caching; what we are concerned with is
3710 * plan types wherein the executor caches results explicitly, or doesn't
3711 * redo startup calculations, etc.
3712 */
3713 static void
cost_rescan(PlannerInfo * root,Path * path,Cost * rescan_startup_cost,Cost * rescan_total_cost)3714 cost_rescan(PlannerInfo *root, Path *path,
3715 Cost *rescan_startup_cost, /* output parameters */
3716 Cost *rescan_total_cost)
3717 {
3718 switch (path->pathtype)
3719 {
3720 case T_FunctionScan:
3721
3722 /*
3723 * Currently, nodeFunctionscan.c always executes the function to
3724 * completion before returning any rows, and caches the results in
3725 * a tuplestore. So the function eval cost is all startup cost
3726 * and isn't paid over again on rescans. However, all run costs
3727 * will be paid over again.
3728 */
3729 *rescan_startup_cost = 0;
3730 *rescan_total_cost = path->total_cost - path->startup_cost;
3731 break;
3732 case T_HashJoin:
3733
3734 /*
3735 * If it's a single-batch join, we don't need to rebuild the hash
3736 * table during a rescan.
3737 */
3738 if (((HashPath *) path)->num_batches == 1)
3739 {
3740 /* Startup cost is exactly the cost of hash table building */
3741 *rescan_startup_cost = 0;
3742 *rescan_total_cost = path->total_cost - path->startup_cost;
3743 }
3744 else
3745 {
3746 /* Otherwise, no special treatment */
3747 *rescan_startup_cost = path->startup_cost;
3748 *rescan_total_cost = path->total_cost;
3749 }
3750 break;
3751 case T_CteScan:
3752 case T_WorkTableScan:
3753 {
3754 /*
3755 * These plan types materialize their final result in a
3756 * tuplestore or tuplesort object. So the rescan cost is only
3757 * cpu_tuple_cost per tuple, unless the result is large enough
3758 * to spill to disk.
3759 */
3760 Cost run_cost = cpu_tuple_cost * path->rows;
3761 double nbytes = relation_byte_size(path->rows,
3762 path->pathtarget->width);
3763 long work_mem_bytes = work_mem * 1024L;
3764
3765 if (nbytes > work_mem_bytes)
3766 {
3767 /* It will spill, so account for re-read cost */
3768 double npages = ceil(nbytes / BLCKSZ);
3769
3770 run_cost += seq_page_cost * npages;
3771 }
3772 *rescan_startup_cost = 0;
3773 *rescan_total_cost = run_cost;
3774 }
3775 break;
3776 case T_Material:
3777 case T_Sort:
3778 {
3779 /*
3780 * These plan types not only materialize their results, but do
3781 * not implement qual filtering or projection. So they are
3782 * even cheaper to rescan than the ones above. We charge only
3783 * cpu_operator_cost per tuple. (Note: keep that in sync with
3784 * the run_cost charge in cost_sort, and also see comments in
3785 * cost_material before you change it.)
3786 */
3787 Cost run_cost = cpu_operator_cost * path->rows;
3788 double nbytes = relation_byte_size(path->rows,
3789 path->pathtarget->width);
3790 long work_mem_bytes = work_mem * 1024L;
3791
3792 if (nbytes > work_mem_bytes)
3793 {
3794 /* It will spill, so account for re-read cost */
3795 double npages = ceil(nbytes / BLCKSZ);
3796
3797 run_cost += seq_page_cost * npages;
3798 }
3799 *rescan_startup_cost = 0;
3800 *rescan_total_cost = run_cost;
3801 }
3802 break;
3803 default:
3804 *rescan_startup_cost = path->startup_cost;
3805 *rescan_total_cost = path->total_cost;
3806 break;
3807 }
3808 }
3809
3810
3811 /*
3812 * cost_qual_eval
3813 * Estimate the CPU costs of evaluating a WHERE clause.
3814 * The input can be either an implicitly-ANDed list of boolean
3815 * expressions, or a list of RestrictInfo nodes. (The latter is
3816 * preferred since it allows caching of the results.)
3817 * The result includes both a one-time (startup) component,
3818 * and a per-evaluation component.
3819 */
3820 void
cost_qual_eval(QualCost * cost,List * quals,PlannerInfo * root)3821 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
3822 {
3823 cost_qual_eval_context context;
3824 ListCell *l;
3825
3826 context.root = root;
3827 context.total.startup = 0;
3828 context.total.per_tuple = 0;
3829
3830 /* We don't charge any cost for the implicit ANDing at top level ... */
3831
3832 foreach(l, quals)
3833 {
3834 Node *qual = (Node *) lfirst(l);
3835
3836 cost_qual_eval_walker(qual, &context);
3837 }
3838
3839 *cost = context.total;
3840 }
3841
3842 /*
3843 * cost_qual_eval_node
3844 * As above, for a single RestrictInfo or expression.
3845 */
3846 void
cost_qual_eval_node(QualCost * cost,Node * qual,PlannerInfo * root)3847 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
3848 {
3849 cost_qual_eval_context context;
3850
3851 context.root = root;
3852 context.total.startup = 0;
3853 context.total.per_tuple = 0;
3854
3855 cost_qual_eval_walker(qual, &context);
3856
3857 *cost = context.total;
3858 }
3859
/*
 * cost_qual_eval_walker
 *		Expression-tree walker that accumulates evaluation costs for each
 *		node it visits into context->total.  Returning false from a branch
 *		stops recursion into that node's children; otherwise we fall
 *		through to expression_tree_walker to visit them.
 */
static bool
cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
{
	if (node == NULL)
		return false;

	/*
	 * RestrictInfo nodes contain an eval_cost field reserved for this
	 * routine's use, so that it's not necessary to evaluate the qual clause's
	 * cost more than once.  If the clause's cost hasn't been computed yet,
	 * the field's startup value will contain -1.
	 */
	if (IsA(node, RestrictInfo))
	{
		RestrictInfo *rinfo = (RestrictInfo *) node;

		if (rinfo->eval_cost.startup < 0)
		{
			cost_qual_eval_context locContext;

			locContext.root = context->root;
			locContext.total.startup = 0;
			locContext.total.per_tuple = 0;

			/*
			 * For an OR clause, recurse into the marked-up tree so that we
			 * set the eval_cost for contained RestrictInfos too.
			 */
			if (rinfo->orclause)
				cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
			else
				cost_qual_eval_walker((Node *) rinfo->clause, &locContext);

			/*
			 * If the RestrictInfo is marked pseudoconstant, it will be tested
			 * only once, so treat its cost as all startup cost.
			 */
			if (rinfo->pseudoconstant)
			{
				/* count one execution during startup */
				locContext.total.startup += locContext.total.per_tuple;
				locContext.total.per_tuple = 0;
			}
			rinfo->eval_cost = locContext.total;
		}
		/* add the (now cached) cost into the caller's running totals */
		context->total.startup += rinfo->eval_cost.startup;
		context->total.per_tuple += rinfo->eval_cost.per_tuple;
		/* do NOT recurse into children */
		return false;
	}

	/*
	 * For each operator or function node in the given tree, we charge the
	 * estimated execution cost given by pg_proc.procost (remember to multiply
	 * this by cpu_operator_cost).
	 *
	 * Vars and Consts are charged zero, and so are boolean operators (AND,
	 * OR, NOT).  Simplistic, but a lot better than no model at all.
	 *
	 * Should we try to account for the possibility of short-circuit
	 * evaluation of AND/OR?  Probably *not*, because that would make the
	 * results depend on the clause ordering, and we are not in any position
	 * to expect that the current ordering of the clauses is the one that's
	 * going to end up being used.  The above per-RestrictInfo caching would
	 * not mix well with trying to re-order clauses anyway.
	 *
	 * Another issue that is entirely ignored here is that if a set-returning
	 * function is below top level in the tree, the functions/operators above
	 * it will need to be evaluated multiple times.  In practical use, such
	 * cases arise so seldom as to not be worth the added complexity needed;
	 * moreover, since our rowcount estimates for functions tend to be pretty
	 * phony, the results would also be pretty phony.
	 */
	if (IsA(node, FuncExpr))
	{
		add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
						  &context->total);
	}
	else if (IsA(node, OpExpr) ||
			 IsA(node, DistinctExpr) ||
			 IsA(node, NullIfExpr))
	{
		/* rely on struct equivalence to treat these all alike */
		set_opfuncid((OpExpr *) node);
		add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
						  &context->total);
	}
	else if (IsA(node, ScalarArrayOpExpr))
	{
		/*
		 * Estimate that the operator will be applied to about half of the
		 * array elements before the answer is determined.
		 */
		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
		Node	   *arraynode = (Node *) lsecond(saop->args);
		QualCost	sacosts;

		set_sa_opfuncid(saop);
		sacosts.startup = sacosts.per_tuple = 0;
		add_function_cost(context->root, saop->opfuncid, NULL,
						  &sacosts);
		context->total.startup += sacosts.startup;
		context->total.per_tuple += sacosts.per_tuple *
			estimate_array_length(arraynode) * 0.5;
	}
	else if (IsA(node, Aggref) ||
			 IsA(node, WindowFunc))
	{
		/*
		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
		 * ie, zero execution cost in the current model, because they behave
		 * essentially like Vars at execution.  We disregard the costs of
		 * their input expressions for the same reason.  The actual execution
		 * costs of the aggregate/window functions and their arguments have to
		 * be factored into plan-node-specific costing of the Agg or WindowAgg
		 * plan node.
		 */
		return false;			/* don't recurse into children */
	}
	else if (IsA(node, CoerceViaIO))
	{
		/* I/O coercion costs one output-function plus one input-function */
		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
		Oid			iofunc;
		Oid			typioparam;
		bool		typisvarlena;

		/* check the result type's input function */
		getTypeInputInfo(iocoerce->resulttype,
						 &iofunc, &typioparam);
		add_function_cost(context->root, iofunc, NULL,
						  &context->total);
		/* check the input type's output function */
		getTypeOutputInfo(exprType((Node *) iocoerce->arg),
						  &iofunc, &typisvarlena);
		add_function_cost(context->root, iofunc, NULL,
						  &context->total);
	}
	else if (IsA(node, ArrayCoerceExpr))
	{
		/* charge the per-element expression once per estimated element */
		ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
		QualCost	perelemcost;

		cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
							context->root);
		context->total.startup += perelemcost.startup;
		if (perelemcost.per_tuple > 0)
			context->total.per_tuple += perelemcost.per_tuple *
				estimate_array_length((Node *) acoerce->arg);
	}
	else if (IsA(node, RowCompareExpr))
	{
		/* Conservatively assume we will check all the columns */
		RowCompareExpr *rcexpr = (RowCompareExpr *) node;
		ListCell   *lc;

		foreach(lc, rcexpr->opnos)
		{
			Oid			opid = lfirst_oid(lc);

			add_function_cost(context->root, get_opcode(opid), NULL,
							  &context->total);
		}
	}
	else if (IsA(node, MinMaxExpr) ||
			 IsA(node, SQLValueFunction) ||
			 IsA(node, XmlExpr) ||
			 IsA(node, CoerceToDomain) ||
			 IsA(node, NextValueExpr))
	{
		/* Treat all these as having cost 1 */
		context->total.per_tuple += cpu_operator_cost;
	}
	else if (IsA(node, CurrentOfExpr))
	{
		/* Report high cost to prevent selection of anything but TID scan */
		context->total.startup += disable_cost;
	}
	else if (IsA(node, SubLink))
	{
		/* This routine should not be applied to un-planned expressions */
		elog(ERROR, "cannot handle unplanned sub-select");
	}
	else if (IsA(node, SubPlan))
	{
		/*
		 * A subplan node in an expression typically indicates that the
		 * subplan will be executed on each evaluation, so charge accordingly.
		 * (Sub-selects that can be executed as InitPlans have already been
		 * removed from the expression.)
		 */
		SubPlan    *subplan = (SubPlan *) node;

		context->total.startup += subplan->startup_cost;
		context->total.per_tuple += subplan->per_call_cost;

		/*
		 * We don't want to recurse into the testexpr, because it was already
		 * counted in the SubPlan node's costs.  So we're done.
		 */
		return false;
	}
	else if (IsA(node, AlternativeSubPlan))
	{
		/*
		 * Arbitrarily use the first alternative plan for costing.  (We should
		 * certainly only include one alternative, and we don't yet have
		 * enough information to know which one the executor is most likely to
		 * use.)
		 */
		AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;

		return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
									 context);
	}
	else if (IsA(node, PlaceHolderVar))
	{
		/*
		 * A PlaceHolderVar should be given cost zero when considering general
		 * expression evaluation costs.  The expense of doing the contained
		 * expression is charged as part of the tlist eval costs of the scan
		 * or join where the PHV is first computed (see set_rel_width and
		 * add_placeholders_to_joinrel).  If we charged it again here, we'd be
		 * double-counting the cost for each level of plan that the PHV
		 * bubbles up through.  Hence, return without recursing into the
		 * phexpr.
		 */
		return false;
	}

	/* recurse into children */
	return expression_tree_walker(node, cost_qual_eval_walker,
								  (void *) context);
}
4093
4094 /*
4095 * get_restriction_qual_cost
4096 * Compute evaluation costs of a baserel's restriction quals, plus any
4097 * movable join quals that have been pushed down to the scan.
4098 * Results are returned into *qpqual_cost.
4099 *
4100 * This is a convenience subroutine that works for seqscans and other cases
4101 * where all the given quals will be evaluated the hard way. It's not useful
4102 * for cost_index(), for example, where the index machinery takes care of
4103 * some of the quals. We assume baserestrictcost was previously set by
4104 * set_baserel_size_estimates().
4105 */
4106 static void
get_restriction_qual_cost(PlannerInfo * root,RelOptInfo * baserel,ParamPathInfo * param_info,QualCost * qpqual_cost)4107 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4108 ParamPathInfo *param_info,
4109 QualCost *qpqual_cost)
4110 {
4111 if (param_info)
4112 {
4113 /* Include costs of pushed-down clauses */
4114 cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4115
4116 qpqual_cost->startup += baserel->baserestrictcost.startup;
4117 qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4118 }
4119 else
4120 *qpqual_cost = baserel->baserestrictcost;
4121 }
4122
4123
4124 /*
4125 * compute_semi_anti_join_factors
4126 * Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4127 * can be expected to scan.
4128 *
4129 * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4130 * inner rows as soon as it finds a match to the current outer row.
4131 * The same happens if we have detected the inner rel is unique.
4132 * We should therefore adjust some of the cost components for this effect.
4133 * This function computes some estimates needed for these adjustments.
4134 * These estimates will be the same regardless of the particular paths used
4135 * for the outer and inner relation, so we compute these once and then pass
4136 * them to all the join cost estimation functions.
4137 *
4138 * Input parameters:
4139 * joinrel: join relation under consideration
4140 * outerrel: outer relation under consideration
4141 * innerrel: inner relation under consideration
4142 * jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4143 * sjinfo: SpecialJoinInfo relevant to this join
4144 * restrictlist: join quals
4145 * Output parameters:
4146 * *semifactors is filled in (see pathnodes.h for field definitions)
4147 */
4148 void
compute_semi_anti_join_factors(PlannerInfo * root,RelOptInfo * joinrel,RelOptInfo * outerrel,RelOptInfo * innerrel,JoinType jointype,SpecialJoinInfo * sjinfo,List * restrictlist,SemiAntiJoinFactors * semifactors)4149 compute_semi_anti_join_factors(PlannerInfo *root,
4150 RelOptInfo *joinrel,
4151 RelOptInfo *outerrel,
4152 RelOptInfo *innerrel,
4153 JoinType jointype,
4154 SpecialJoinInfo *sjinfo,
4155 List *restrictlist,
4156 SemiAntiJoinFactors *semifactors)
4157 {
4158 Selectivity jselec;
4159 Selectivity nselec;
4160 Selectivity avgmatch;
4161 SpecialJoinInfo norm_sjinfo;
4162 List *joinquals;
4163 ListCell *l;
4164
4165 /*
4166 * In an ANTI join, we must ignore clauses that are "pushed down", since
4167 * those won't affect the match logic. In a SEMI join, we do not
4168 * distinguish joinquals from "pushed down" quals, so just use the whole
4169 * restrictinfo list. For other outer join types, we should consider only
4170 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4171 */
4172 if (IS_OUTER_JOIN(jointype))
4173 {
4174 joinquals = NIL;
4175 foreach(l, restrictlist)
4176 {
4177 RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4178
4179 if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4180 joinquals = lappend(joinquals, rinfo);
4181 }
4182 }
4183 else
4184 joinquals = restrictlist;
4185
4186 /*
4187 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4188 */
4189 jselec = clauselist_selectivity(root,
4190 joinquals,
4191 0,
4192 (jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4193 sjinfo);
4194
4195 /*
4196 * Also get the normal inner-join selectivity of the join clauses.
4197 */
4198 norm_sjinfo.type = T_SpecialJoinInfo;
4199 norm_sjinfo.min_lefthand = outerrel->relids;
4200 norm_sjinfo.min_righthand = innerrel->relids;
4201 norm_sjinfo.syn_lefthand = outerrel->relids;
4202 norm_sjinfo.syn_righthand = innerrel->relids;
4203 norm_sjinfo.jointype = JOIN_INNER;
4204 /* we don't bother trying to make the remaining fields valid */
4205 norm_sjinfo.lhs_strict = false;
4206 norm_sjinfo.delay_upper_joins = false;
4207 norm_sjinfo.semi_can_btree = false;
4208 norm_sjinfo.semi_can_hash = false;
4209 norm_sjinfo.semi_operators = NIL;
4210 norm_sjinfo.semi_rhs_exprs = NIL;
4211
4212 nselec = clauselist_selectivity(root,
4213 joinquals,
4214 0,
4215 JOIN_INNER,
4216 &norm_sjinfo);
4217
4218 /* Avoid leaking a lot of ListCells */
4219 if (IS_OUTER_JOIN(jointype))
4220 list_free(joinquals);
4221
4222 /*
4223 * jselec can be interpreted as the fraction of outer-rel rows that have
4224 * any matches (this is true for both SEMI and ANTI cases). And nselec is
4225 * the fraction of the Cartesian product that matches. So, the average
4226 * number of matches for each outer-rel row that has at least one match is
4227 * nselec * inner_rows / jselec.
4228 *
4229 * Note: it is correct to use the inner rel's "rows" count here, even
4230 * though we might later be considering a parameterized inner path with
4231 * fewer rows. This is because we have included all the join clauses in
4232 * the selectivity estimate.
4233 */
4234 if (jselec > 0) /* protect against zero divide */
4235 {
4236 avgmatch = nselec * innerrel->rows / jselec;
4237 /* Clamp to sane range */
4238 avgmatch = Max(1.0, avgmatch);
4239 }
4240 else
4241 avgmatch = 1.0;
4242
4243 semifactors->outer_match_frac = jselec;
4244 semifactors->match_count = avgmatch;
4245 }
4246
4247 /*
4248 * has_indexed_join_quals
4249 * Check whether all the joinquals of a nestloop join are used as
4250 * inner index quals.
4251 *
4252 * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4253 * indexscan) that uses all the joinquals as indexquals, we can assume that an
4254 * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4255 * expensive.
4256 */
4257 static bool
has_indexed_join_quals(NestPath * joinpath)4258 has_indexed_join_quals(NestPath *joinpath)
4259 {
4260 Relids joinrelids = joinpath->path.parent->relids;
4261 Path *innerpath = joinpath->innerjoinpath;
4262 List *indexclauses;
4263 bool found_one;
4264 ListCell *lc;
4265
4266 /* If join still has quals to evaluate, it's not fast */
4267 if (joinpath->joinrestrictinfo != NIL)
4268 return false;
4269 /* Nor if the inner path isn't parameterized at all */
4270 if (innerpath->param_info == NULL)
4271 return false;
4272
4273 /* Find the indexclauses list for the inner scan */
4274 switch (innerpath->pathtype)
4275 {
4276 case T_IndexScan:
4277 case T_IndexOnlyScan:
4278 indexclauses = ((IndexPath *) innerpath)->indexclauses;
4279 break;
4280 case T_BitmapHeapScan:
4281 {
4282 /* Accept only a simple bitmap scan, not AND/OR cases */
4283 Path *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4284
4285 if (IsA(bmqual, IndexPath))
4286 indexclauses = ((IndexPath *) bmqual)->indexclauses;
4287 else
4288 return false;
4289 break;
4290 }
4291 default:
4292
4293 /*
4294 * If it's not a simple indexscan, it probably doesn't run quickly
4295 * for zero rows out, even if it's a parameterized path using all
4296 * the joinquals.
4297 */
4298 return false;
4299 }
4300
4301 /*
4302 * Examine the inner path's param clauses. Any that are from the outer
4303 * path must be found in the indexclauses list, either exactly or in an
4304 * equivalent form generated by equivclass.c. Also, we must find at least
4305 * one such clause, else it's a clauseless join which isn't fast.
4306 */
4307 found_one = false;
4308 foreach(lc, innerpath->param_info->ppi_clauses)
4309 {
4310 RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4311
4312 if (join_clause_is_movable_into(rinfo,
4313 innerpath->parent->relids,
4314 joinrelids))
4315 {
4316 if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4317 return false;
4318 found_one = true;
4319 }
4320 }
4321 return found_one;
4322 }
4323
4324
4325 /*
4326 * approx_tuple_count
4327 * Quick-and-dirty estimation of the number of join rows passing
4328 * a set of qual conditions.
4329 *
4330 * The quals can be either an implicitly-ANDed list of boolean expressions,
4331 * or a list of RestrictInfo nodes (typically the latter).
4332 *
4333 * We intentionally compute the selectivity under JOIN_INNER rules, even
4334 * if it's some type of outer join. This is appropriate because we are
4335 * trying to figure out how many tuples pass the initial merge or hash
4336 * join step.
4337 *
4338 * This is quick-and-dirty because we bypass clauselist_selectivity, and
4339 * simply multiply the independent clause selectivities together. Now
4340 * clauselist_selectivity often can't do any better than that anyhow, but
4341 * for some situations (such as range constraints) it is smarter. However,
4342 * we can't effectively cache the results of clauselist_selectivity, whereas
4343 * the individual clause selectivities can be and are cached.
4344 *
4345 * Since we are only using the results to estimate how many potential
4346 * output tuples are generated and passed through qpqual checking, it
4347 * seems OK to live with the approximation.
4348 */
4349 static double
approx_tuple_count(PlannerInfo * root,JoinPath * path,List * quals)4350 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4351 {
4352 double tuples;
4353 double outer_tuples = path->outerjoinpath->rows;
4354 double inner_tuples = path->innerjoinpath->rows;
4355 SpecialJoinInfo sjinfo;
4356 Selectivity selec = 1.0;
4357 ListCell *l;
4358
4359 /*
4360 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4361 */
4362 sjinfo.type = T_SpecialJoinInfo;
4363 sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4364 sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4365 sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4366 sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4367 sjinfo.jointype = JOIN_INNER;
4368 /* we don't bother trying to make the remaining fields valid */
4369 sjinfo.lhs_strict = false;
4370 sjinfo.delay_upper_joins = false;
4371 sjinfo.semi_can_btree = false;
4372 sjinfo.semi_can_hash = false;
4373 sjinfo.semi_operators = NIL;
4374 sjinfo.semi_rhs_exprs = NIL;
4375
4376 /* Get the approximate selectivity */
4377 foreach(l, quals)
4378 {
4379 Node *qual = (Node *) lfirst(l);
4380
4381 /* Note that clause_selectivity will be able to cache its result */
4382 selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4383 }
4384
4385 /* Apply it to the input relation sizes */
4386 tuples = selec * outer_tuples * inner_tuples;
4387
4388 return clamp_row_est(tuples);
4389 }
4390
4391
4392 /*
4393 * set_baserel_size_estimates
4394 * Set the size estimates for the given base relation.
4395 *
4396 * The rel's targetlist and restrictinfo list must have been constructed
4397 * already, and rel->tuples must be set.
4398 *
4399 * We set the following fields of the rel node:
4400 * rows: the estimated number of output tuples (after applying
4401 * restriction clauses).
4402 * width: the estimated average output tuple width in bytes.
4403 * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4404 */
4405 void
set_baserel_size_estimates(PlannerInfo * root,RelOptInfo * rel)4406 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4407 {
4408 double nrows;
4409
4410 /* Should only be applied to base relations */
4411 Assert(rel->relid > 0);
4412
4413 nrows = rel->tuples *
4414 clauselist_selectivity(root,
4415 rel->baserestrictinfo,
4416 0,
4417 JOIN_INNER,
4418 NULL);
4419
4420 rel->rows = clamp_row_est(nrows);
4421
4422 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4423
4424 set_rel_width(root, rel);
4425 }
4426
4427 /*
4428 * get_parameterized_baserel_size
4429 * Make a size estimate for a parameterized scan of a base relation.
4430 *
4431 * 'param_clauses' lists the additional join clauses to be used.
4432 *
4433 * set_baserel_size_estimates must have been applied already.
4434 */
4435 double
get_parameterized_baserel_size(PlannerInfo * root,RelOptInfo * rel,List * param_clauses)4436 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4437 List *param_clauses)
4438 {
4439 List *allclauses;
4440 double nrows;
4441
4442 /*
4443 * Estimate the number of rows returned by the parameterized scan, knowing
4444 * that it will apply all the extra join clauses as well as the rel's own
4445 * restriction clauses. Note that we force the clauses to be treated as
4446 * non-join clauses during selectivity estimation.
4447 */
4448 allclauses = list_concat(list_copy(param_clauses),
4449 rel->baserestrictinfo);
4450 nrows = rel->tuples *
4451 clauselist_selectivity(root,
4452 allclauses,
4453 rel->relid, /* do not use 0! */
4454 JOIN_INNER,
4455 NULL);
4456 nrows = clamp_row_est(nrows);
4457 /* For safety, make sure result is not more than the base estimate */
4458 if (nrows > rel->rows)
4459 nrows = rel->rows;
4460 return nrows;
4461 }
4462
4463 /*
4464 * set_joinrel_size_estimates
4465 * Set the size estimates for the given join relation.
4466 *
4467 * The rel's targetlist must have been constructed already, and a
4468 * restriction clause list that matches the given component rels must
4469 * be provided.
4470 *
4471 * Since there is more than one way to make a joinrel for more than two
4472 * base relations, the results we get here could depend on which component
4473 * rel pair is provided. In theory we should get the same answers no matter
4474 * which pair is provided; in practice, since the selectivity estimation
4475 * routines don't handle all cases equally well, we might not. But there's
4476 * not much to be done about it. (Would it make sense to repeat the
4477 * calculations for each pair of input rels that's encountered, and somehow
4478 * average the results? Probably way more trouble than it's worth, and
4479 * anyway we must keep the rowcount estimate the same for all paths for the
4480 * joinrel.)
4481 *
4482 * We set only the rows field here. The reltarget field was already set by
4483 * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4484 */
4485 void
set_joinrel_size_estimates(PlannerInfo * root,RelOptInfo * rel,RelOptInfo * outer_rel,RelOptInfo * inner_rel,SpecialJoinInfo * sjinfo,List * restrictlist)4486 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4487 RelOptInfo *outer_rel,
4488 RelOptInfo *inner_rel,
4489 SpecialJoinInfo *sjinfo,
4490 List *restrictlist)
4491 {
4492 rel->rows = calc_joinrel_size_estimate(root,
4493 rel,
4494 outer_rel,
4495 inner_rel,
4496 outer_rel->rows,
4497 inner_rel->rows,
4498 sjinfo,
4499 restrictlist);
4500 }
4501
4502 /*
4503 * get_parameterized_joinrel_size
4504 * Make a size estimate for a parameterized scan of a join relation.
4505 *
4506 * 'rel' is the joinrel under consideration.
4507 * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4508 * produce the relations being joined.
4509 * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4510 * 'restrict_clauses' lists the join clauses that need to be applied at the
4511 * join node (including any movable clauses that were moved down to this join,
4512 * and not including any movable clauses that were pushed down into the
4513 * child paths).
4514 *
4515 * set_joinrel_size_estimates must have been applied already.
4516 */
4517 double
get_parameterized_joinrel_size(PlannerInfo * root,RelOptInfo * rel,Path * outer_path,Path * inner_path,SpecialJoinInfo * sjinfo,List * restrict_clauses)4518 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4519 Path *outer_path,
4520 Path *inner_path,
4521 SpecialJoinInfo *sjinfo,
4522 List *restrict_clauses)
4523 {
4524 double nrows;
4525
4526 /*
4527 * Estimate the number of rows returned by the parameterized join as the
4528 * sizes of the input paths times the selectivity of the clauses that have
4529 * ended up at this join node.
4530 *
4531 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4532 * on the pair of input paths provided, though ideally we'd get the same
4533 * estimate for any pair with the same parameterization.
4534 */
4535 nrows = calc_joinrel_size_estimate(root,
4536 rel,
4537 outer_path->parent,
4538 inner_path->parent,
4539 outer_path->rows,
4540 inner_path->rows,
4541 sjinfo,
4542 restrict_clauses);
4543 /* For safety, make sure result is not more than the base estimate */
4544 if (nrows > rel->rows)
4545 nrows = rel->rows;
4546 return nrows;
4547 }
4548
/*
 * calc_joinrel_size_estimate
 *		Workhorse for set_joinrel_size_estimates and
 *		get_parameterized_joinrel_size.
 *
 * outer_rel/inner_rel are the relations being joined, but they should be
 * assumed to have sizes outer_rows/inner_rows; those numbers might be less
 * than what rel->rows says, when we are considering parameterized paths.
 *
 * 'restrictlist_in' is the list of join clauses to apply at this join.
 * The caller's list is never modified: get_foreign_key_join_selectivity
 * makes a shallow copy before removing any FK-matched clauses.
 */
static double
calc_joinrel_size_estimate(PlannerInfo *root,
						   RelOptInfo *joinrel,
						   RelOptInfo *outer_rel,
						   RelOptInfo *inner_rel,
						   double outer_rows,
						   double inner_rows,
						   SpecialJoinInfo *sjinfo,
						   List *restrictlist_in)
{
	/* This apparently-useless variable dodges a compiler bug in VS2013: */
	List	   *restrictlist = restrictlist_in;
	JoinType	jointype = sjinfo->jointype;
	Selectivity fkselec;
	Selectivity jselec;
	Selectivity pselec;
	double		nrows;

	/*
	 * Compute joinclause selectivity.  Note that we are only considering
	 * clauses that become restriction clauses at this join level; we are not
	 * double-counting them because they were not considered in estimating the
	 * sizes of the component rels.
	 *
	 * First, see whether any of the joinclauses can be matched to known FK
	 * constraints.  If so, drop those clauses from the restrictlist, and
	 * instead estimate their selectivity using FK semantics.  (We do this
	 * without regard to whether said clauses are local or "pushed down".
	 * Probably, an FK-matching clause could never be seen as pushed down at
	 * an outer join, since it would be strict and hence would be grounds for
	 * join strength reduction.)  fkselec gets the net selectivity for
	 * FK-matching clauses, or 1.0 if there are none.
	 */
	fkselec = get_foreign_key_join_selectivity(root,
											   outer_rel->relids,
											   inner_rel->relids,
											   sjinfo,
											   &restrictlist);

	/*
	 * For an outer join, we have to distinguish the selectivity of the join's
	 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
	 * down".  For inner joins we just count them all as joinclauses.
	 */
	if (IS_OUTER_JOIN(jointype))
	{
		List	   *joinquals = NIL;
		List	   *pushedquals = NIL;
		ListCell   *l;

		/* Grovel through the clauses to separate into two lists */
		foreach(l, restrictlist)
		{
			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);

			if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
				pushedquals = lappend(pushedquals, rinfo);
			else
				joinquals = lappend(joinquals, rinfo);
		}

		/* Get the separate selectivities */
		jselec = clauselist_selectivity(root,
										joinquals,
										0,
										jointype,
										sjinfo);
		pselec = clauselist_selectivity(root,
										pushedquals,
										0,
										jointype,
										sjinfo);

		/* Avoid leaking a lot of ListCells */
		list_free(joinquals);
		list_free(pushedquals);
	}
	else
	{
		jselec = clauselist_selectivity(root,
										restrictlist,
										0,
										jointype,
										sjinfo);
		pselec = 0.0;			/* not used, keep compiler quiet */
	}

	/*
	 * Basically, we multiply size of Cartesian product by selectivity.
	 *
	 * If we are doing an outer join, take that into account: the joinqual
	 * selectivity has to be clamped using the knowledge that the output must
	 * be at least as large as the non-nullable input.  However, any
	 * pushed-down quals are applied after the outer join, so their
	 * selectivity applies fully.
	 *
	 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
	 * of LHS rows that have matches, and we apply that straightforwardly.
	 */
	switch (jointype)
	{
		case JOIN_INNER:
			nrows = outer_rows * inner_rows * fkselec * jselec;
			/* pselec not used */
			break;
		case JOIN_LEFT:
			/* clamp: every outer row appears at least once */
			nrows = outer_rows * inner_rows * fkselec * jselec;
			if (nrows < outer_rows)
				nrows = outer_rows;
			nrows *= pselec;
			break;
		case JOIN_FULL:
			/* clamp: every row of both inputs appears at least once */
			nrows = outer_rows * inner_rows * fkselec * jselec;
			if (nrows < outer_rows)
				nrows = outer_rows;
			if (nrows < inner_rows)
				nrows = inner_rows;
			nrows *= pselec;
			break;
		case JOIN_SEMI:
			nrows = outer_rows * fkselec * jselec;
			/* pselec not used */
			break;
		case JOIN_ANTI:
			/* anti join passes the outer rows that have NO match */
			nrows = outer_rows * (1.0 - fkselec * jselec);
			nrows *= pselec;
			break;
		default:
			/* other values not expected here */
			elog(ERROR, "unrecognized join type: %d", (int) jointype);
			nrows = 0;			/* keep compiler quiet */
			break;
	}

	return clamp_row_est(nrows);
}
4694
/*
 * get_foreign_key_join_selectivity
 *		Estimate join selectivity for foreign-key-related clauses.
 *
 * Remove any clauses that can be matched to FK constraints from *restrictlist,
 * and return a substitute estimate of their selectivity.  1.0 is returned
 * when there are no such clauses.
 *
 * The reason for treating such clauses specially is that we can get better
 * estimates this way than by relying on clauselist_selectivity(), especially
 * for multi-column FKs where that function's assumption that the clauses are
 * independent falls down badly.  But even with single-column FKs, we may be
 * able to get a better answer when the pg_statistic stats are missing or out
 * of date.
 */
static Selectivity
get_foreign_key_join_selectivity(PlannerInfo *root,
								 Relids outer_relids,
								 Relids inner_relids,
								 SpecialJoinInfo *sjinfo,
								 List **restrictlist)
{
	Selectivity fkselec = 1.0;
	JoinType	jointype = sjinfo->jointype;
	List	   *worklist = *restrictlist;
	ListCell   *lc;

	/* Consider each FK constraint that is known to match the query */
	foreach(lc, root->fkey_list)
	{
		ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
		bool		ref_is_outer;
		List	   *removedlist;
		ListCell   *cell;
		ListCell   *prev;
		ListCell   *next;

		/*
		 * This FK is not relevant unless it connects a baserel on one side of
		 * this join to a baserel on the other side.
		 */
		if (bms_is_member(fkinfo->con_relid, outer_relids) &&
			bms_is_member(fkinfo->ref_relid, inner_relids))
			ref_is_outer = false;
		else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
				 bms_is_member(fkinfo->con_relid, inner_relids))
			ref_is_outer = true;
		else
			continue;

		/*
		 * If we're dealing with a semi/anti join, and the FK's referenced
		 * relation is on the outside, then knowledge of the FK doesn't help
		 * us figure out what we need to know (which is the fraction of outer
		 * rows that have matches).  On the other hand, if the referenced rel
		 * is on the inside, then all outer rows must have matches in the
		 * referenced table (ignoring nulls).  But any restriction or join
		 * clauses that filter that table will reduce the fraction of matches.
		 * We can account for restriction clauses, but it's too hard to guess
		 * how many table rows would get through a join that's inside the RHS.
		 * Hence, if either case applies, punt and ignore the FK.
		 */
		if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
			(ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
			continue;

		/*
		 * Modify the restrictlist by removing clauses that match the FK (and
		 * putting them into removedlist instead).  It seems unsafe to modify
		 * the originally-passed List structure, so we make a shallow copy the
		 * first time through.
		 */
		if (worklist == *restrictlist)
			worklist = list_copy(worklist);

		/* Walk worklist, deleting matched cells; prev feeds list_delete_cell */
		removedlist = NIL;
		prev = NULL;
		for (cell = list_head(worklist); cell; cell = next)
		{
			RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
			bool		remove_it = false;
			int			i;

			/* grab next link now, since the cell may be deleted below */
			next = lnext(cell);
			/* Drop this clause if it matches any column of the FK */
			for (i = 0; i < fkinfo->nkeys; i++)
			{
				if (rinfo->parent_ec)
				{
					/*
					 * EC-derived clauses can only match by EC.  It is okay to
					 * consider any clause derived from the same EC as
					 * matching the FK: even if equivclass.c chose to generate
					 * a clause equating some other pair of Vars, it could
					 * have generated one equating the FK's Vars.  So for
					 * purposes of estimation, we can act as though it did so.
					 *
					 * Note: checking parent_ec is a bit of a cheat because
					 * there are EC-derived clauses that don't have parent_ec
					 * set; but such clauses must compare expressions that
					 * aren't just Vars, so they cannot match the FK anyway.
					 */
					if (fkinfo->eclass[i] == rinfo->parent_ec)
					{
						remove_it = true;
						break;
					}
				}
				else
				{
					/*
					 * Otherwise, see if rinfo was previously matched to FK as
					 * a "loose" clause.
					 */
					if (list_member_ptr(fkinfo->rinfos[i], rinfo))
					{
						remove_it = true;
						break;
					}
				}
			}
			if (remove_it)
			{
				worklist = list_delete_cell(worklist, cell, prev);
				removedlist = lappend(removedlist, rinfo);
			}
			else
				prev = cell;
		}

		/*
		 * If we failed to remove all the matching clauses we expected to
		 * find, chicken out and ignore this FK; applying its selectivity
		 * might result in double-counting.  Put any clauses we did manage to
		 * remove back into the worklist.
		 *
		 * Since the matching clauses are known not outerjoin-delayed, they
		 * should certainly have appeared in the initial joinclause list.  If
		 * we didn't find them, they must have been matched to, and removed
		 * by, some other FK in a previous iteration of this loop.  (A likely
		 * case is that two FKs are matched to the same EC; there will be only
		 * one EC-derived clause in the initial list, so the first FK will
		 * consume it.)  Applying both FKs' selectivity independently risks
		 * underestimating the join size; in particular, this would undo one
		 * of the main things that ECs were invented for, namely to avoid
		 * double-counting the selectivity of redundant equality conditions.
		 * Later we might think of a reasonable way to combine the estimates,
		 * but for now, just punt, since this is a fairly uncommon situation.
		 */
		if (list_length(removedlist) !=
			(fkinfo->nmatched_ec + fkinfo->nmatched_ri))
		{
			worklist = list_concat(worklist, removedlist);
			continue;
		}

		/*
		 * Finally we get to the payoff: estimate selectivity using the
		 * knowledge that each referencing row will match exactly one row in
		 * the referenced table.
		 *
		 * XXX that's not true in the presence of nulls in the referencing
		 * column(s), so in principle we should derate the estimate for those.
		 * However (1) if there are any strict restriction clauses for the
		 * referencing column(s) elsewhere in the query, derating here would
		 * be double-counting the null fraction, and (2) it's not very clear
		 * how to combine null fractions for multiple referencing columns.  So
		 * we do nothing for now about correcting for nulls.
		 *
		 * XXX another point here is that if either side of an FK constraint
		 * is an inheritance parent, we estimate as though the constraint
		 * covers all its children as well.  This is not an unreasonable
		 * assumption for a referencing table, ie the user probably applied
		 * identical constraints to all child tables (though perhaps we ought
		 * to check that).  But it's not possible to have done that for a
		 * referenced table.  Fortunately, precisely because that doesn't
		 * work, it is uncommon in practice to have an FK referencing a parent
		 * table.  So, at least for now, disregard inheritance here.
		 */
		if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
		{
			/*
			 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
			 * referenced table is exactly the inside of the join.  The join
			 * selectivity is defined as the fraction of LHS rows that have
			 * matches.  The FK implies that every LHS row has a match *in the
			 * referenced table*; but any restriction clauses on it will
			 * reduce the number of matches.  Hence we take the join
			 * selectivity as equal to the selectivity of the table's
			 * restriction clauses, which is rows / tuples; but we must guard
			 * against tuples == 0.
			 */
			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
			double		ref_tuples = Max(ref_rel->tuples, 1.0);

			fkselec *= ref_rel->rows / ref_tuples;
		}
		else
		{
			/*
			 * Otherwise, selectivity is exactly 1/referenced-table-size; but
			 * guard against tuples == 0.  Note we should use the raw table
			 * tuple count, not any estimate of its filtered or joined size.
			 */
			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
			double		ref_tuples = Max(ref_rel->tuples, 1.0);

			fkselec *= 1.0 / ref_tuples;
		}
	}

	/* hand back the (possibly copied-and-trimmed) clause list */
	*restrictlist = worklist;
	return fkselec;
}
4909
/*
 * set_subquery_size_estimates
 *		Set the size estimates for a base relation that is a subquery.
 *
 * The rel's targetlist and restrictinfo list must have been constructed
 * already, and the Paths for the subquery must have been completed.
 * We look at the subquery's PlannerInfo to extract data.
 *
 * We set the same fields as set_baserel_size_estimates.
 */
void
set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
{
	PlannerInfo *subroot = rel->subroot;
	RelOptInfo *sub_final_rel;
	ListCell   *lc;

	/* Should only be applied to base relations that are subqueries */
	Assert(rel->relid > 0);
	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);

	/*
	 * Copy raw number of output rows from subquery.  All of its paths should
	 * have the same output rowcount, so just look at cheapest-total.
	 */
	sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
	rel->tuples = sub_final_rel->cheapest_total_path->rows;

	/*
	 * Compute per-output-column width estimates by examining the subquery's
	 * targetlist.  For any output that is a plain Var, get the width estimate
	 * that was made while planning the subquery.  Otherwise, we leave it to
	 * set_rel_width to fill in a datatype-based default estimate.
	 */
	foreach(lc, subroot->parse->targetList)
	{
		TargetEntry *te = lfirst_node(TargetEntry, lc);
		Node	   *texpr = (Node *) te->expr;
		int32		item_width = 0; /* 0 = "unknown"; set_rel_width fixes it */

		/* junk columns aren't visible to upper query */
		if (te->resjunk)
			continue;

		/*
		 * The subquery could be an expansion of a view that's had columns
		 * added to it since the current query was parsed, so that there are
		 * non-junk tlist columns in it that don't correspond to any column
		 * visible at our query level.  Ignore such columns.
		 */
		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
			continue;

		/*
		 * XXX This currently doesn't work for subqueries containing set
		 * operations, because the Vars in their tlists are bogus references
		 * to the first leaf subquery, which wouldn't give the right answer
		 * even if we could still get to its PlannerInfo.
		 *
		 * Also, the subquery could be an appendrel for which all branches are
		 * known empty due to constraint exclusion, in which case
		 * set_append_rel_pathlist will have left the attr_widths set to zero.
		 *
		 * In either case, we just leave the width estimate zero until
		 * set_rel_width fixes it.
		 */
		if (IsA(texpr, Var) &&
			subroot->parse->setOperations == NULL)
		{
			Var		   *var = (Var *) texpr;
			RelOptInfo *subrel = find_base_rel(subroot, var->varno);

			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
		}
		rel->attr_widths[te->resno - rel->min_attr] = item_width;
	}

	/* Now estimate number of output rows, etc */
	set_baserel_size_estimates(root, rel);
}
4990
4991 /*
4992 * set_function_size_estimates
4993 * Set the size estimates for a base relation that is a function call.
4994 *
4995 * The rel's targetlist and restrictinfo list must have been constructed
4996 * already.
4997 *
4998 * We set the same fields as set_baserel_size_estimates.
4999 */
5000 void
set_function_size_estimates(PlannerInfo * root,RelOptInfo * rel)5001 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5002 {
5003 RangeTblEntry *rte;
5004 ListCell *lc;
5005
5006 /* Should only be applied to base relations that are functions */
5007 Assert(rel->relid > 0);
5008 rte = planner_rt_fetch(rel->relid, root);
5009 Assert(rte->rtekind == RTE_FUNCTION);
5010
5011 /*
5012 * Estimate number of rows the functions will return. The rowcount of the
5013 * node is that of the largest function result.
5014 */
5015 rel->tuples = 0;
5016 foreach(lc, rte->functions)
5017 {
5018 RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5019 double ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5020
5021 if (ntup > rel->tuples)
5022 rel->tuples = ntup;
5023 }
5024
5025 /* Now estimate number of output rows, etc */
5026 set_baserel_size_estimates(root, rel);
5027 }
5028
/*
 * set_tablefunc_size_estimates
 *		Set the size estimates for a base relation that is a table function.
 *
 * The rel's targetlist and restrictinfo list must have been constructed
 * already.
 *
 * We set the same fields as set_baserel_size_estimates.
 */
void
set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
{
	/* Should only be applied to base relations that are table functions */
	Assert(rel->relid > 0);
	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);

	/* arbitrary default estimate; nothing here gives us a better number */
	rel->tuples = 100;

	/* Now estimate number of output rows, etc */
	set_baserel_size_estimates(root, rel);
}
5050
5051 /*
5052 * set_values_size_estimates
5053 * Set the size estimates for a base relation that is a values list.
5054 *
5055 * The rel's targetlist and restrictinfo list must have been constructed
5056 * already.
5057 *
5058 * We set the same fields as set_baserel_size_estimates.
5059 */
5060 void
set_values_size_estimates(PlannerInfo * root,RelOptInfo * rel)5061 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5062 {
5063 RangeTblEntry *rte;
5064
5065 /* Should only be applied to base relations that are values lists */
5066 Assert(rel->relid > 0);
5067 rte = planner_rt_fetch(rel->relid, root);
5068 Assert(rte->rtekind == RTE_VALUES);
5069
5070 /*
5071 * Estimate number of rows the values list will return. We know this
5072 * precisely based on the list length (well, barring set-returning
5073 * functions in list items, but that's a refinement not catered for
5074 * anywhere else either).
5075 */
5076 rel->tuples = list_length(rte->values_lists);
5077
5078 /* Now estimate number of output rows, etc */
5079 set_baserel_size_estimates(root, rel);
5080 }
5081
5082 /*
5083 * set_cte_size_estimates
5084 * Set the size estimates for a base relation that is a CTE reference.
5085 *
5086 * The rel's targetlist and restrictinfo list must have been constructed
5087 * already, and we need an estimate of the number of rows returned by the CTE
5088 * (if a regular CTE) or the non-recursive term (if a self-reference).
5089 *
5090 * We set the same fields as set_baserel_size_estimates.
5091 */
5092 void
set_cte_size_estimates(PlannerInfo * root,RelOptInfo * rel,double cte_rows)5093 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5094 {
5095 RangeTblEntry *rte;
5096
5097 /* Should only be applied to base relations that are CTE references */
5098 Assert(rel->relid > 0);
5099 rte = planner_rt_fetch(rel->relid, root);
5100 Assert(rte->rtekind == RTE_CTE);
5101
5102 if (rte->self_reference)
5103 {
5104 /*
5105 * In a self-reference, arbitrarily assume the average worktable size
5106 * is about 10 times the nonrecursive term's size.
5107 */
5108 rel->tuples = 10 * cte_rows;
5109 }
5110 else
5111 {
5112 /* Otherwise just believe the CTE's rowcount estimate */
5113 rel->tuples = cte_rows;
5114 }
5115
5116 /* Now estimate number of output rows, etc */
5117 set_baserel_size_estimates(root, rel);
5118 }
5119
5120 /*
5121 * set_namedtuplestore_size_estimates
5122 * Set the size estimates for a base relation that is a tuplestore reference.
5123 *
5124 * The rel's targetlist and restrictinfo list must have been constructed
5125 * already.
5126 *
5127 * We set the same fields as set_baserel_size_estimates.
5128 */
5129 void
set_namedtuplestore_size_estimates(PlannerInfo * root,RelOptInfo * rel)5130 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5131 {
5132 RangeTblEntry *rte;
5133
5134 /* Should only be applied to base relations that are tuplestore references */
5135 Assert(rel->relid > 0);
5136 rte = planner_rt_fetch(rel->relid, root);
5137 Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5138
5139 /*
5140 * Use the estimate provided by the code which is generating the named
5141 * tuplestore. In some cases, the actual number might be available; in
5142 * others the same plan will be re-used, so a "typical" value might be
5143 * estimated and used.
5144 */
5145 rel->tuples = rte->enrtuples;
5146 if (rel->tuples < 0)
5147 rel->tuples = 1000;
5148
5149 /* Now estimate number of output rows, etc */
5150 set_baserel_size_estimates(root, rel);
5151 }
5152
5153 /*
5154 * set_result_size_estimates
5155 * Set the size estimates for an RTE_RESULT base relation
5156 *
5157 * The rel's targetlist and restrictinfo list must have been constructed
5158 * already.
5159 *
5160 * We set the same fields as set_baserel_size_estimates.
5161 */
5162 void
set_result_size_estimates(PlannerInfo * root,RelOptInfo * rel)5163 set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5164 {
5165 /* Should only be applied to RTE_RESULT base relations */
5166 Assert(rel->relid > 0);
5167 Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5168
5169 /* RTE_RESULT always generates a single row, natively */
5170 rel->tuples = 1;
5171
5172 /* Now estimate number of output rows, etc */
5173 set_baserel_size_estimates(root, rel);
5174 }
5175
5176 /*
5177 * set_foreign_size_estimates
5178 * Set the size estimates for a base relation that is a foreign table.
5179 *
5180 * There is not a whole lot that we can do here; the foreign-data wrapper
5181 * is responsible for producing useful estimates. We can do a decent job
5182 * of estimating baserestrictcost, so we set that, and we also set up width
5183 * using what will be purely datatype-driven estimates from the targetlist.
5184 * There is no way to do anything sane with the rows value, so we just put
5185 * a default estimate and hope that the wrapper can improve on it. The
5186 * wrapper's GetForeignRelSize function will be called momentarily.
5187 *
5188 * The rel's targetlist and restrictinfo list must have been constructed
5189 * already.
5190 */
5191 void
set_foreign_size_estimates(PlannerInfo * root,RelOptInfo * rel)5192 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5193 {
5194 /* Should only be applied to base relations */
5195 Assert(rel->relid > 0);
5196
5197 rel->rows = 1000; /* entirely bogus default estimate */
5198
5199 cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5200
5201 set_rel_width(root, rel);
5202 }
5203
5204
/*
 * set_rel_width
 *		Set the estimated output width of a base relation.
 *
 * The estimated output width is the sum of the per-attribute width estimates
 * for the actually-referenced columns, plus any PHVs or other expressions
 * that have to be calculated at this relation.  This is the amount of data
 * we'd need to pass upwards in case of a sort, hash, etc.
 *
 * This function also sets reltarget->cost, so it's a bit misnamed now.
 *
 * NB: this works best on plain relations because it prefers to look at
 * real Vars.  For subqueries, set_subquery_size_estimates will already have
 * copied up whatever per-column estimates were made within the subquery,
 * and for other types of rels there isn't much we can do anyway.  We fall
 * back on (fairly stupid) datatype-based width estimates if we can't get
 * any better number.
 *
 * The per-attribute width estimates are cached for possible re-use while
 * building join relations or post-scan/join pathtargets.
 */
static void
set_rel_width(PlannerInfo *root, RelOptInfo *rel)
{
	/* InvalidOid means the RTE isn't a plain relation, so no catalog stats */
	Oid			reloid = planner_rt_fetch(rel->relid, root)->relid;
	int32		tuple_width = 0;
	bool		have_wholerow_var = false;
	ListCell   *lc;

	/* Vars are assumed to have cost zero, but other exprs do not */
	rel->reltarget->cost.startup = 0;
	rel->reltarget->cost.per_tuple = 0;

	foreach(lc, rel->reltarget->exprs)
	{
		Node	   *node = (Node *) lfirst(lc);

		/*
		 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
		 * but there are corner cases involving LATERAL references where that
		 * isn't so.  If the Var has the wrong varno, fall through to the
		 * generic case (it doesn't seem worth the trouble to be any smarter).
		 */
		if (IsA(node, Var) &&
			((Var *) node)->varno == rel->relid)
		{
			Var		   *var = (Var *) node;
			int			ndx;
			int32		item_width;

			Assert(var->varattno >= rel->min_attr);
			Assert(var->varattno <= rel->max_attr);

			/* attr_widths[] is indexed from min_attr (system columns < 0) */
			ndx = var->varattno - rel->min_attr;

			/*
			 * If it's a whole-row Var, we'll deal with it below after we have
			 * already cached as many attr widths as possible.
			 */
			if (var->varattno == 0)
			{
				have_wholerow_var = true;
				continue;
			}

			/*
			 * The width may have been cached already (especially if it's a
			 * subquery), so don't duplicate effort.
			 */
			if (rel->attr_widths[ndx] > 0)
			{
				tuple_width += rel->attr_widths[ndx];
				continue;
			}

			/* Try to get column width from statistics */
			if (reloid != InvalidOid && var->varattno > 0)
			{
				item_width = get_attavgwidth(reloid, var->varattno);
				if (item_width > 0)
				{
					/* cache it so later lookups (and joins) can reuse it */
					rel->attr_widths[ndx] = item_width;
					tuple_width += item_width;
					continue;
				}
			}

			/*
			 * Not a plain relation, or can't find statistics for it. Estimate
			 * using just the type info.
			 */
			item_width = get_typavgwidth(var->vartype, var->vartypmod);
			Assert(item_width > 0);
			rel->attr_widths[ndx] = item_width;
			tuple_width += item_width;
		}
		else if (IsA(node, PlaceHolderVar))
		{
			/*
			 * We will need to evaluate the PHV's contained expression while
			 * scanning this rel, so be sure to include it in reltarget->cost.
			 */
			PlaceHolderVar *phv = (PlaceHolderVar *) node;
			PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
			QualCost	cost;

			/* the PHV's width was estimated when its PlaceHolderInfo was built */
			tuple_width += phinfo->ph_width;
			cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
			rel->reltarget->cost.startup += cost.startup;
			rel->reltarget->cost.per_tuple += cost.per_tuple;
		}
		else
		{
			/*
			 * We could be looking at an expression pulled up from a subquery,
			 * or a ROW() representing a whole-row child Var, etc.  Do what we
			 * can using the expression type information.
			 */
			int32		item_width;
			QualCost	cost;

			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
			Assert(item_width > 0);
			tuple_width += item_width;
			/* Not entirely clear if we need to account for cost, but do so */
			cost_qual_eval_node(&cost, node, root);
			rel->reltarget->cost.startup += cost.startup;
			rel->reltarget->cost.per_tuple += cost.per_tuple;
		}
	}

	/*
	 * If we have a whole-row reference, estimate its width as the sum of
	 * per-column widths plus heap tuple header overhead.
	 */
	if (have_wholerow_var)
	{
		int32		wholerow_width = MAXALIGN(SizeofHeapTupleHeader);

		if (reloid != InvalidOid)
		{
			/* Real relation, so estimate true tuple width */
			wholerow_width += get_relation_data_width(reloid,
													  rel->attr_widths - rel->min_attr);
		}
		else
		{
			/* Do what we can with info for a phony rel */
			AttrNumber	i;

			/* sum whatever per-column widths were cached above */
			for (i = 1; i <= rel->max_attr; i++)
				wholerow_width += rel->attr_widths[i - rel->min_attr];
		}

		rel->attr_widths[0 - rel->min_attr] = wholerow_width;

		/*
		 * Include the whole-row Var as part of the output tuple.  Yes, that
		 * really is what happens at runtime.
		 */
		tuple_width += wholerow_width;
	}

	Assert(tuple_width >= 0);
	rel->reltarget->width = tuple_width;
}
5371
5372 /*
5373 * set_pathtarget_cost_width
5374 * Set the estimated eval cost and output width of a PathTarget tlist.
5375 *
5376 * As a notational convenience, returns the same PathTarget pointer passed in.
5377 *
5378 * Most, though not quite all, uses of this function occur after we've run
5379 * set_rel_width() for base relations; so we can usually obtain cached width
5380 * estimates for Vars. If we can't, fall back on datatype-based width
5381 * estimates. Present early-planning uses of PathTargets don't need accurate
5382 * widths badly enough to justify going to the catalogs for better data.
5383 */
5384 PathTarget *
set_pathtarget_cost_width(PlannerInfo * root,PathTarget * target)5385 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5386 {
5387 int32 tuple_width = 0;
5388 ListCell *lc;
5389
5390 /* Vars are assumed to have cost zero, but other exprs do not */
5391 target->cost.startup = 0;
5392 target->cost.per_tuple = 0;
5393
5394 foreach(lc, target->exprs)
5395 {
5396 Node *node = (Node *) lfirst(lc);
5397
5398 if (IsA(node, Var))
5399 {
5400 Var *var = (Var *) node;
5401 int32 item_width;
5402
5403 /* We should not see any upper-level Vars here */
5404 Assert(var->varlevelsup == 0);
5405
5406 /* Try to get data from RelOptInfo cache */
5407 if (var->varno < root->simple_rel_array_size)
5408 {
5409 RelOptInfo *rel = root->simple_rel_array[var->varno];
5410
5411 if (rel != NULL &&
5412 var->varattno >= rel->min_attr &&
5413 var->varattno <= rel->max_attr)
5414 {
5415 int ndx = var->varattno - rel->min_attr;
5416
5417 if (rel->attr_widths[ndx] > 0)
5418 {
5419 tuple_width += rel->attr_widths[ndx];
5420 continue;
5421 }
5422 }
5423 }
5424
5425 /*
5426 * No cached data available, so estimate using just the type info.
5427 */
5428 item_width = get_typavgwidth(var->vartype, var->vartypmod);
5429 Assert(item_width > 0);
5430 tuple_width += item_width;
5431 }
5432 else
5433 {
5434 /*
5435 * Handle general expressions using type info.
5436 */
5437 int32 item_width;
5438 QualCost cost;
5439
5440 item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5441 Assert(item_width > 0);
5442 tuple_width += item_width;
5443
5444 /* Account for cost, too */
5445 cost_qual_eval_node(&cost, node, root);
5446 target->cost.startup += cost.startup;
5447 target->cost.per_tuple += cost.per_tuple;
5448 }
5449 }
5450
5451 Assert(tuple_width >= 0);
5452 target->width = tuple_width;
5453
5454 return target;
5455 }
5456
5457 /*
5458 * relation_byte_size
5459 * Estimate the storage space in bytes for a given number of tuples
5460 * of a given width (size in bytes).
5461 */
5462 static double
relation_byte_size(double tuples,int width)5463 relation_byte_size(double tuples, int width)
5464 {
5465 return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5466 }
5467
5468 /*
5469 * page_size
5470 * Returns an estimate of the number of pages covered by a given
5471 * number of tuples of a given width (size in bytes).
5472 */
5473 static double
page_size(double tuples,int width)5474 page_size(double tuples, int width)
5475 {
5476 return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5477 }
5478
5479 /*
5480 * Estimate the fraction of the work that each worker will do given the
5481 * number of workers budgeted for the path.
5482 */
5483 static double
get_parallel_divisor(Path * path)5484 get_parallel_divisor(Path *path)
5485 {
5486 double parallel_divisor = path->parallel_workers;
5487
5488 /*
5489 * Early experience with parallel query suggests that when there is only
5490 * one worker, the leader often makes a very substantial contribution to
5491 * executing the parallel portion of the plan, but as more workers are
5492 * added, it does less and less, because it's busy reading tuples from the
5493 * workers and doing whatever non-parallel post-processing is needed. By
5494 * the time we reach 4 workers, the leader no longer makes a meaningful
5495 * contribution. Thus, for now, estimate that the leader spends 30% of
5496 * its time servicing each worker, and the remainder executing the
5497 * parallel plan.
5498 */
5499 if (parallel_leader_participation)
5500 {
5501 double leader_contribution;
5502
5503 leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5504 if (leader_contribution > 0)
5505 parallel_divisor += leader_contribution;
5506 }
5507
5508 return parallel_divisor;
5509 }
5510
/*
 * compute_bitmap_pages
 *
 * compute number of pages fetched from heap in bitmap heap scan.
 *
 * 'bitmapqual' is the tree of IndexPaths and/or BitmapAnd/BitmapOr nodes
 * that will produce the bitmap.
 * 'loop_count' is the expected number of repetitions of the scan (> 1 when
 * the scan is on the inside of a nestloop join).
 * If 'cost' isn't NULL, it receives the total cost of constructing the
 * bitmap.  If 'tuple' isn't NULL, it receives the estimated number of
 * tuples the heap scan will visit (inflated when pages go lossy).
 *
 * The result is the estimated number of heap pages fetched per scan.
 */
double
compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
					 int loop_count, Cost *cost, double *tuple)
{
	Cost		indexTotalCost;
	Selectivity indexSelectivity;
	double		T;
	double		pages_fetched;
	double		tuples_fetched;
	double		heap_pages;
	long		maxentries;

	/*
	 * Fetch total cost of obtaining the bitmap, as well as its total
	 * selectivity.
	 */
	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);

	/*
	 * Estimate number of main-table pages fetched.
	 */
	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);

	/* T = # pages in table, but don't let it be zero */
	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;

	/*
	 * For a single scan, the number of heap pages that need to be fetched is
	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
	 * re-reads needed).
	 */
	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);

	/*
	 * Calculate the number of pages fetched from the heap.  Then based on
	 * current work_mem estimate get the estimated maxentries in the bitmap.
	 * (Note that we always do this calculation based on the number of pages
	 * that would be fetched in a single iteration, even if loop_count > 1.
	 * That's correct, because only that number of entries will be stored in
	 * the bitmap at one time.)
	 */
	heap_pages = Min(pages_fetched, baserel->pages);
	/* work_mem is in kB; tbm_calculate_entries wants bytes */
	maxentries = tbm_calculate_entries(work_mem * 1024L);

	if (loop_count > 1)
	{
		/*
		 * For repeated bitmap scans, scale up the number of tuples fetched in
		 * the Mackert and Lohman formula by the number of scans, so that we
		 * estimate the number of pages fetched by all the scans.  Then
		 * pro-rate for one scan.
		 */
		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
											baserel->pages,
											get_indexpath_pages(bitmapqual),
											root);
		pages_fetched /= loop_count;
	}

	/* clamp to table size, else round up to a whole page */
	if (pages_fetched >= T)
		pages_fetched = T;
	else
		pages_fetched = ceil(pages_fetched);

	/* if the bitmap can't hold one entry per heap page, some pages go lossy */
	if (maxentries < heap_pages)
	{
		double		exact_pages;
		double		lossy_pages;

		/*
		 * Crude approximation of the number of lossy pages.  Because of the
		 * way tbm_lossify() is coded, the number of lossy pages increases
		 * very sharply as soon as we run short of memory; this formula has
		 * that property and seems to perform adequately in testing, but it's
		 * possible we could do better somehow.
		 */
		lossy_pages = Max(0, heap_pages - maxentries / 2);
		exact_pages = heap_pages - lossy_pages;

		/*
		 * If there are lossy pages then recompute the number of tuples
		 * processed by the bitmap heap node.  We assume here that the chance
		 * of a given tuple coming from an exact page is the same as the
		 * chance that a given page is exact.  This might not be true, but
		 * it's not clear how we can do any better.
		 */
		if (lossy_pages > 0)
			tuples_fetched =
				clamp_row_est(indexSelectivity *
							  (exact_pages / heap_pages) * baserel->tuples +
							  (lossy_pages / heap_pages) * baserel->tuples);
	}

	if (cost)
		*cost = indexTotalCost;
	if (tuple)
		*tuple = tuples_fetched;

	return pages_fetched;
}
5615