1 /*-------------------------------------------------------------------------
2  *
3  * costsize.c
4  *	  Routines to compute (and set) relation sizes and path costs
5  *
6  * Path costs are measured in arbitrary units established by these basic
7  * parameters:
8  *
9  *	seq_page_cost		Cost of a sequential page fetch
10  *	random_page_cost	Cost of a non-sequential page fetch
11  *	cpu_tuple_cost		Cost of typical CPU time to process a tuple
12  *	cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
13  *	cpu_operator_cost	Cost of CPU time to execute an operator or function
14  *	parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15  *	parallel_setup_cost Cost of setting up shared memory for parallelism
16  *
17  * We expect that the kernel will typically do some amount of read-ahead
18  * optimization; this in conjunction with seek costs means that seq_page_cost
19  * is normally considerably less than random_page_cost.  (However, if the
20  * database is fully cached in RAM, it is reasonable to set them equal.)
21  *
22  * We also use a rough estimate "effective_cache_size" of the number of
23  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
24  * NBuffers for this purpose because that would ignore the effects of
25  * the kernel's disk cache.)
26  *
27  * Obviously, taking constants for these values is an oversimplification,
28  * but it's tough enough to get any useful estimates even at this level of
29  * detail.  Note that all of these parameters are user-settable, in case
30  * the default values are drastically off for a particular platform.
31  *
32  * seq_page_cost and random_page_cost can also be overridden for an individual
33  * tablespace, in case some data is on a fast disk and other data is on a slow
34  * disk.  Per-tablespace overrides never apply to temporary work files such as
35  * an external sort or a materialize node that overflows work_mem.
36  *
37  * We compute two separate costs for each path:
38  *		total_cost: total estimated cost to fetch all tuples
39  *		startup_cost: cost that is expended before first tuple is fetched
40  * In some scenarios, such as when there is a LIMIT or we are implementing
41  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42  * path's result.  A caller can estimate the cost of fetching a partial
43  * result by interpolating between startup_cost and total_cost.  In detail:
44  *		actual_cost = startup_cost +
45  *			(total_cost - startup_cost) * tuples_to_fetch / path->rows;
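 * For example (illustrative numbers only): with startup_cost = 10,
 * total_cost = 110 and path->rows = 1000, fetching the first 100 tuples
 * would be estimated to cost 10 + (110 - 10) * 100 / 1000 = 20.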
46  * Note that a base relation's rows count (and, by extension, plan_rows for
47  * plan nodes below the LIMIT node) are set without regard to any LIMIT, so
48  * that this equation works properly.  (Note: while path->rows is never zero
49  * for ordinary relations, it is zero for paths for provably-empty relations,
50  * so beware of division-by-zero.)	The LIMIT is applied as a top-level
51  * plan node.
52  *
53  * For largely historical reasons, most of the routines in this module use
54  * the passed result Path only to store their results (rows, startup_cost and
55  * total_cost) into.  All the input data they need is passed as separate
56  * parameters, even though much of it could be extracted from the Path.
57  * An exception is made for the cost_XXXjoin() routines, which expect all
58  * the other fields of the passed XXXPath to be filled in, and similarly
59  * cost_index() assumes the passed IndexPath is valid except for its output
60  * values.
61  *
62  *
63  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
64  * Portions Copyright (c) 1994, Regents of the University of California
65  *
66  * IDENTIFICATION
67  *	  src/backend/optimizer/path/costsize.c
68  *
69  *-------------------------------------------------------------------------
70  */
71 
72 #include "postgres.h"
73 
74 #include <math.h>
75 
76 #include "access/amapi.h"
77 #include "access/htup_details.h"
78 #include "access/tsmapi.h"
79 #include "executor/executor.h"
80 #include "executor/nodeHash.h"
81 #include "miscadmin.h"
82 #include "nodes/makefuncs.h"
83 #include "nodes/nodeFuncs.h"
84 #include "optimizer/clauses.h"
85 #include "optimizer/cost.h"
86 #include "optimizer/optimizer.h"
87 #include "optimizer/pathnode.h"
88 #include "optimizer/paths.h"
89 #include "optimizer/placeholder.h"
90 #include "optimizer/plancat.h"
91 #include "optimizer/planmain.h"
92 #include "optimizer/restrictinfo.h"
93 #include "parser/parsetree.h"
94 #include "utils/lsyscache.h"
95 #include "utils/selfuncs.h"
96 #include "utils/spccache.h"
97 #include "utils/tuplesort.h"
98 
99 
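/*
 * 0.693147180559945 is ln(2), so LOG2() yields a base-2 logarithm;
 * for example, LOG2(8.0) evaluates to 3.0.
 */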
100 #define LOG2(x)  (log(x) / 0.693147180559945)
101 
102 /*
103  * Append and MergeAppend nodes are less expensive than some other operations
104  * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
105  * per-tuple cost as cpu_tuple_cost multiplied by this value.
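 *
 * For example, with the default cpu_tuple_cost of 0.01 this charges
 * 0.01 * 0.5 = 0.005 per row passed through an Append or MergeAppend.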
106  */
107 #define APPEND_CPU_COST_MULTIPLIER 0.5
108 
109 
110 double		seq_page_cost = DEFAULT_SEQ_PAGE_COST;
111 double		random_page_cost = DEFAULT_RANDOM_PAGE_COST;
112 double		cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
113 double		cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
114 double		cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
115 double		parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
116 double		parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
117 
118 int			effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
119 
120 Cost		disable_cost = 1.0e10;
121 
122 int			max_parallel_workers_per_gather = 2;
123 
124 bool		enable_seqscan = true;
125 bool		enable_indexscan = true;
126 bool		enable_indexonlyscan = true;
127 bool		enable_bitmapscan = true;
128 bool		enable_tidscan = true;
129 bool		enable_sort = true;
130 bool		enable_hashagg = true;
131 bool		enable_nestloop = true;
132 bool		enable_material = true;
133 bool		enable_mergejoin = true;
134 bool		enable_hashjoin = true;
135 bool		enable_gathermerge = true;
136 bool		enable_partitionwise_join = false;
137 bool		enable_partitionwise_aggregate = false;
138 bool		enable_parallel_append = true;
139 bool		enable_parallel_hash = true;
140 bool		enable_partition_pruning = true;
141 
142 typedef struct
143 {
144 	PlannerInfo *root;
145 	QualCost	total;
146 } cost_qual_eval_context;
147 
148 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
149 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
150 										 RestrictInfo *rinfo,
151 										 PathKey *pathkey);
152 static void cost_rescan(PlannerInfo *root, Path *path,
153 						Cost *rescan_startup_cost, Cost *rescan_total_cost);
154 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
155 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
156 									  ParamPathInfo *param_info,
157 									  QualCost *qpqual_cost);
158 static bool has_indexed_join_quals(NestPath *joinpath);
159 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
160 								 List *quals);
161 static double calc_joinrel_size_estimate(PlannerInfo *root,
162 										 RelOptInfo *joinrel,
163 										 RelOptInfo *outer_rel,
164 										 RelOptInfo *inner_rel,
165 										 double outer_rows,
166 										 double inner_rows,
167 										 SpecialJoinInfo *sjinfo,
168 										 List *restrictlist);
169 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
170 													Relids outer_relids,
171 													Relids inner_relids,
172 													SpecialJoinInfo *sjinfo,
173 													List **restrictlist);
174 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
175 								   int parallel_workers);
176 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
177 static double relation_byte_size(double tuples, int width);
178 static double page_size(double tuples, int width);
179 static double get_parallel_divisor(Path *path);
180 
181 
182 /*
183  * clamp_row_est
184  *		Force a row-count estimate to a sane value.
185  */
186 double
clamp_row_est(double nrows)
188 {
189 	/*
190 	 * Force estimate to be at least one row, to make explain output look
191 	 * better and to avoid possible divide-by-zero when interpolating costs.
192 	 * Make it an integer, too.
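	 *
	 * For example, an estimate of 0.3 is clamped to 1.0, while 10.7 is
	 * rounded to 11.0 by rint().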
193 	 */
194 	if (nrows <= 1.0)
195 		nrows = 1.0;
196 	else
197 		nrows = rint(nrows);
198 
199 	return nrows;
200 }
201 
202 
203 /*
204  * cost_seqscan
205  *	  Determines and returns the cost of scanning a relation sequentially.
206  *
207  * 'baserel' is the relation to be scanned
208  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
209  */
210 void
cost_seqscan(Path *path, PlannerInfo *root,
212 			 RelOptInfo *baserel, ParamPathInfo *param_info)
213 {
214 	Cost		startup_cost = 0;
215 	Cost		cpu_run_cost;
216 	Cost		disk_run_cost;
217 	double		spc_seq_page_cost;
218 	QualCost	qpqual_cost;
219 	Cost		cpu_per_tuple;
220 
221 	/* Should only be applied to base relations */
222 	Assert(baserel->relid > 0);
223 	Assert(baserel->rtekind == RTE_RELATION);
224 
225 	/* Mark the path with the correct row estimate */
226 	if (param_info)
227 		path->rows = param_info->ppi_rows;
228 	else
229 		path->rows = baserel->rows;
230 
231 	if (!enable_seqscan)
232 		startup_cost += disable_cost;
233 
234 	/* fetch estimated page cost for tablespace containing table */
235 	get_tablespace_page_costs(baserel->reltablespace,
236 							  NULL,
237 							  &spc_seq_page_cost);
238 
239 	/*
240 	 * disk costs
241 	 */
242 	disk_run_cost = spc_seq_page_cost * baserel->pages;
243 
244 	/* CPU costs */
245 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
246 
247 	startup_cost += qpqual_cost.startup;
248 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
249 	cpu_run_cost = cpu_per_tuple * baserel->tuples;
250 	/* tlist eval costs are paid per output row, not per tuple scanned */
251 	startup_cost += path->pathtarget->cost.startup;
252 	cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
253 
254 	/* Adjust costing for parallelism, if used. */
255 	if (path->parallel_workers > 0)
256 	{
257 		double		parallel_divisor = get_parallel_divisor(path);
258 
259 		/* The CPU cost is divided among all the workers. */
260 		cpu_run_cost /= parallel_divisor;
261 
262 		/*
263 		 * It may be possible to amortize some of the I/O cost, but probably
264 		 * not very much, because most operating systems already do aggressive
265 		 * prefetching.  For now, we assume that the disk run cost can't be
266 		 * amortized at all.
267 		 */
268 
269 		/*
270 		 * In the case of a parallel plan, the row count needs to represent
271 		 * the number of tuples processed per worker.
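		 *
		 * For example (illustrative, assuming the leader also participates),
		 * with 2 workers get_parallel_divisor() comes out to roughly 2.4, so
		 * a 1000-row estimate becomes about 417 rows per process.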
272 		 */
273 		path->rows = clamp_row_est(path->rows / parallel_divisor);
274 	}
275 
276 	path->startup_cost = startup_cost;
277 	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
278 }
279 
280 /*
281  * cost_samplescan
282  *	  Determines and returns the cost of scanning a relation using sampling.
283  *
284  * 'baserel' is the relation to be scanned
285  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
286  */
287 void
cost_samplescan(Path *path, PlannerInfo *root,
289 				RelOptInfo *baserel, ParamPathInfo *param_info)
290 {
291 	Cost		startup_cost = 0;
292 	Cost		run_cost = 0;
293 	RangeTblEntry *rte;
294 	TableSampleClause *tsc;
295 	TsmRoutine *tsm;
296 	double		spc_seq_page_cost,
297 				spc_random_page_cost,
298 				spc_page_cost;
299 	QualCost	qpqual_cost;
300 	Cost		cpu_per_tuple;
301 
302 	/* Should only be applied to base relations with tablesample clauses */
303 	Assert(baserel->relid > 0);
304 	rte = planner_rt_fetch(baserel->relid, root);
305 	Assert(rte->rtekind == RTE_RELATION);
306 	tsc = rte->tablesample;
307 	Assert(tsc != NULL);
308 	tsm = GetTsmRoutine(tsc->tsmhandler);
309 
310 	/* Mark the path with the correct row estimate */
311 	if (param_info)
312 		path->rows = param_info->ppi_rows;
313 	else
314 		path->rows = baserel->rows;
315 
316 	/* fetch estimated page cost for tablespace containing table */
317 	get_tablespace_page_costs(baserel->reltablespace,
318 							  &spc_random_page_cost,
319 							  &spc_seq_page_cost);
320 
321 	/* if NextSampleBlock is used, assume random access, else sequential */
322 	spc_page_cost = (tsm->NextSampleBlock != NULL) ?
323 		spc_random_page_cost : spc_seq_page_cost;
324 
325 	/*
326 	 * disk costs (recall that baserel->pages has already been set to the
327 	 * number of pages the sampling method will visit)
328 	 */
329 	run_cost += spc_page_cost * baserel->pages;
330 
331 	/*
332 	 * CPU costs (recall that baserel->tuples has already been set to the
333 	 * number of tuples the sampling method will select).  Note that we ignore
334 	 * execution cost of the TABLESAMPLE parameter expressions; they will be
335 	 * evaluated only once per scan, and in most usages they'll likely be
336 	 * simple constants anyway.  We also don't charge anything for the
337 	 * calculations the sampling method might do internally.
338 	 */
339 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
340 
341 	startup_cost += qpqual_cost.startup;
342 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
343 	run_cost += cpu_per_tuple * baserel->tuples;
344 	/* tlist eval costs are paid per output row, not per tuple scanned */
345 	startup_cost += path->pathtarget->cost.startup;
346 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
347 
348 	path->startup_cost = startup_cost;
349 	path->total_cost = startup_cost + run_cost;
350 }
351 
352 /*
353  * cost_gather
354  *	  Determines and returns the cost of gather path.
355  *
356  * 'rel' is the relation to be operated upon
357  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
358  * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
359  * both 'rel' and 'param_info'.  This is useful when the path doesn't exactly
360  * correspond to any particular RelOptInfo.
361  */
362 void
cost_gather(GatherPath *path, PlannerInfo *root,
364 			RelOptInfo *rel, ParamPathInfo *param_info,
365 			double *rows)
366 {
367 	Cost		startup_cost = 0;
368 	Cost		run_cost = 0;
369 
370 	/* Mark the path with the correct row estimate */
371 	if (rows)
372 		path->path.rows = *rows;
373 	else if (param_info)
374 		path->path.rows = param_info->ppi_rows;
375 	else
376 		path->path.rows = rel->rows;
377 
378 	startup_cost = path->subpath->startup_cost;
379 
380 	run_cost = path->subpath->total_cost - path->subpath->startup_cost;
381 
382 	/* Parallel setup and communication cost. */
383 	startup_cost += parallel_setup_cost;
384 	run_cost += parallel_tuple_cost * path->path.rows;
385 
386 	path->path.startup_cost = startup_cost;
387 	path->path.total_cost = (startup_cost + run_cost);
388 }
389 
390 /*
391  * cost_gather_merge
392  *	  Determines and returns the cost of gather merge path.
393  *
394  * GatherMerge merges several pre-sorted input streams, using a heap that at
395  * any given instant holds the next tuple from each stream. If there are N
396  * streams, we need about N*log2(N) tuple comparisons to construct the heap at
397  * startup, and then for each output tuple, about log2(N) comparisons to
398  * replace the top heap entry with the next tuple from the same stream.
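 *
 * For example (illustrative numbers only), with 2 workers plus the leader
 * there are N = 3 streams: building the heap costs about 3 * log2(3) ~= 4.8
 * comparisons, and each output tuple then costs about log2(3) ~= 1.6
 * comparisons.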
399  */
400 void
cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
402 				  RelOptInfo *rel, ParamPathInfo *param_info,
403 				  Cost input_startup_cost, Cost input_total_cost,
404 				  double *rows)
405 {
406 	Cost		startup_cost = 0;
407 	Cost		run_cost = 0;
408 	Cost		comparison_cost;
409 	double		N;
410 	double		logN;
411 
412 	/* Mark the path with the correct row estimate */
413 	if (rows)
414 		path->path.rows = *rows;
415 	else if (param_info)
416 		path->path.rows = param_info->ppi_rows;
417 	else
418 		path->path.rows = rel->rows;
419 
420 	if (!enable_gathermerge)
421 		startup_cost += disable_cost;
422 
423 	/*
424 	 * Add one to the number of workers to account for the leader.  This might
425 	 * be overgenerous since the leader will do less work than other workers
426 	 * in typical cases, but we'll go with it for now.
427 	 */
428 	Assert(path->num_workers > 0);
429 	N = (double) path->num_workers + 1;
430 	logN = LOG2(N);
431 
432 	/* Assumed cost per tuple comparison */
433 	comparison_cost = 2.0 * cpu_operator_cost;
434 
435 	/* Heap creation cost */
436 	startup_cost += comparison_cost * N * logN;
437 
438 	/* Per-tuple heap maintenance cost */
439 	run_cost += path->path.rows * comparison_cost * logN;
440 
441 	/* small cost for heap management, like cost_merge_append */
442 	run_cost += cpu_operator_cost * path->path.rows;
443 
444 	/*
445 	 * Parallel setup and communication cost.  Since Gather Merge, unlike
446 	 * Gather, requires us to block until a tuple is available from every
447 	 * worker, we bump the IPC cost up a little bit as compared with Gather.
448 	 * For lack of a better idea, charge an extra 5%.
449 	 */
450 	startup_cost += parallel_setup_cost;
451 	run_cost += parallel_tuple_cost * path->path.rows * 1.05;
452 
453 	path->path.startup_cost = startup_cost + input_startup_cost;
454 	path->path.total_cost = (startup_cost + run_cost + input_total_cost);
455 }
456 
457 /*
458  * cost_index
459  *	  Determines and returns the cost of scanning a relation using an index.
460  *
461  * 'path' describes the indexscan under consideration, and is complete
462  *		except for the fields to be set by this routine
463  * 'loop_count' is the number of repetitions of the indexscan to factor into
464  *		estimates of caching behavior
465  *
466  * In addition to rows, startup_cost and total_cost, cost_index() sets the
467  * path's indextotalcost and indexselectivity fields.  These values will be
468  * needed if the IndexPath is used in a BitmapIndexScan.
469  *
470  * NOTE: path->indexquals must contain only clauses usable as index
471  * restrictions.  Any additional quals evaluated as qpquals may reduce the
472  * number of returned tuples, but they won't reduce the number of tuples
473  * we have to fetch from the table, so they don't reduce the scan cost.
474  */
475 void
cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
477 		   bool partial_path)
478 {
479 	IndexOptInfo *index = path->indexinfo;
480 	RelOptInfo *baserel = index->rel;
481 	bool		indexonly = (path->path.pathtype == T_IndexOnlyScan);
482 	amcostestimate_function amcostestimate;
483 	List	   *qpquals;
484 	Cost		startup_cost = 0;
485 	Cost		run_cost = 0;
486 	Cost		cpu_run_cost = 0;
487 	Cost		indexStartupCost;
488 	Cost		indexTotalCost;
489 	Selectivity indexSelectivity;
490 	double		indexCorrelation,
491 				csquared;
492 	double		spc_seq_page_cost,
493 				spc_random_page_cost;
494 	Cost		min_IO_cost,
495 				max_IO_cost;
496 	QualCost	qpqual_cost;
497 	Cost		cpu_per_tuple;
498 	double		tuples_fetched;
499 	double		pages_fetched;
500 	double		rand_heap_pages;
501 	double		index_pages;
502 
503 	/* Should only be applied to base relations */
504 	Assert(IsA(baserel, RelOptInfo) &&
505 		   IsA(index, IndexOptInfo));
506 	Assert(baserel->relid > 0);
507 	Assert(baserel->rtekind == RTE_RELATION);
508 
509 	/*
510 	 * Mark the path with the correct row estimate, and identify which quals
511 	 * will need to be enforced as qpquals.  We need not check any quals that
512 	 * are implied by the index's predicate, so we can use indrestrictinfo not
513 	 * baserestrictinfo as the list of relevant restriction clauses for the
514 	 * rel.
515 	 */
516 	if (path->path.param_info)
517 	{
518 		path->path.rows = path->path.param_info->ppi_rows;
519 		/* qpquals come from the rel's restriction clauses and ppi_clauses */
520 		qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
521 														  path->indexclauses),
522 							  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
523 														  path->indexclauses));
524 	}
525 	else
526 	{
527 		path->path.rows = baserel->rows;
528 		/* qpquals come from just the rel's restriction clauses */
529 		qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
530 											  path->indexclauses);
531 	}
532 
533 	if (!enable_indexscan)
534 		startup_cost += disable_cost;
535 	/* we don't need to check enable_indexonlyscan; indxpath.c does that */
536 
537 	/*
538 	 * Call index-access-method-specific code to estimate the processing cost
539 	 * for scanning the index, as well as the selectivity of the index (ie,
540 	 * the fraction of main-table tuples we will have to retrieve) and its
541 	 * correlation to the main-table tuple order.  We need a cast here because
542 	 * pathnodes.h uses a weak function type to avoid including amapi.h.
543 	 */
544 	amcostestimate = (amcostestimate_function) index->amcostestimate;
545 	amcostestimate(root, path, loop_count,
546 				   &indexStartupCost, &indexTotalCost,
547 				   &indexSelectivity, &indexCorrelation,
548 				   &index_pages);
549 
550 	/*
551 	 * Save amcostestimate's results for possible use in bitmap scan planning.
552 	 * We don't bother to save indexStartupCost or indexCorrelation, because a
553 	 * bitmap scan doesn't care about either.
554 	 */
555 	path->indextotalcost = indexTotalCost;
556 	path->indexselectivity = indexSelectivity;
557 
558 	/* all costs for touching index itself included here */
559 	startup_cost += indexStartupCost;
560 	run_cost += indexTotalCost - indexStartupCost;
561 
562 	/* estimate number of main-table tuples fetched */
563 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
564 
565 	/* fetch estimated page costs for tablespace containing table */
566 	get_tablespace_page_costs(baserel->reltablespace,
567 							  &spc_random_page_cost,
568 							  &spc_seq_page_cost);
569 
570 	/*----------
571 	 * Estimate number of main-table pages fetched, and compute I/O cost.
572 	 *
573 	 * When the index ordering is uncorrelated with the table ordering,
574 	 * we use an approximation proposed by Mackert and Lohman (see
575 	 * index_pages_fetched() for details) to compute the number of pages
576 	 * fetched, and then charge spc_random_page_cost per page fetched.
577 	 *
578 	 * When the index ordering is exactly correlated with the table ordering
579 	 * (just after a CLUSTER, for example), the number of pages fetched should
580 	 * be exactly selectivity * table_size.  What's more, all but the first
581 	 * will be sequential fetches, not the random fetches that occur in the
582 	 * uncorrelated case.  So if the number of pages is more than 1, we
583 	 * ought to charge
584 	 *		spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
585 	 * For partially-correlated indexes, we ought to charge somewhere between
586 	 * these two estimates.  We currently interpolate linearly between the
587 	 * estimates based on the correlation squared (XXX is that appropriate?).
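	 * For example (illustrative numbers only), an indexCorrelation of 0.5
	 * gives csquared = 0.25, so we charge
	 *		max_IO_cost + 0.25 * (min_IO_cost - max_IO_cost)
	 * which is a 75/25 blend of the uncorrelated and correlated estimates.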
588 	 *
589 	 * If it's an index-only scan, then we will not need to fetch any heap
590 	 * pages for which the visibility map shows all tuples are visible.
591 	 * Hence, reduce the estimated number of heap fetches accordingly.
592 	 * We use the measured fraction of the entire heap that is all-visible,
593 	 * which might not be particularly relevant to the subset of the heap
594 	 * that this query will fetch; but it's not clear how to do better.
595 	 *----------
596 	 */
597 	if (loop_count > 1)
598 	{
599 		/*
600 		 * For repeated indexscans, the appropriate estimate for the
601 		 * uncorrelated case is to scale up the number of tuples fetched in
602 		 * the Mackert and Lohman formula by the number of scans, so that we
603 		 * estimate the number of pages fetched by all the scans; then
604 		 * pro-rate the costs for one scan.  In this case we assume all the
605 		 * fetches are random accesses.
606 		 */
607 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
608 											baserel->pages,
609 											(double) index->pages,
610 											root);
611 
612 		if (indexonly)
613 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
614 
615 		rand_heap_pages = pages_fetched;
616 
617 		max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
618 
619 		/*
620 		 * In the perfectly correlated case, the number of pages touched by
621 		 * each scan is selectivity * table_size, and we can use the Mackert
622 		 * and Lohman formula at the page level to estimate how much work is
623 		 * saved by caching across scans.  We still assume all the fetches are
624 		 * random, though, which is an overestimate that's hard to correct for
625 		 * without double-counting the cache effects.  (But in most cases
626 		 * where such a plan is actually interesting, only one page would get
627 		 * fetched per scan anyway, so it shouldn't matter much.)
628 		 */
629 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
630 
631 		pages_fetched = index_pages_fetched(pages_fetched * loop_count,
632 											baserel->pages,
633 											(double) index->pages,
634 											root);
635 
636 		if (indexonly)
637 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
638 
639 		min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
640 	}
641 	else
642 	{
643 		/*
644 		 * Normal case: apply the Mackert and Lohman formula, and then
645 		 * interpolate between that and the correlation-derived result.
646 		 */
647 		pages_fetched = index_pages_fetched(tuples_fetched,
648 											baserel->pages,
649 											(double) index->pages,
650 											root);
651 
652 		if (indexonly)
653 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
654 
655 		rand_heap_pages = pages_fetched;
656 
657 		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
658 		max_IO_cost = pages_fetched * spc_random_page_cost;
659 
660 		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
661 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
662 
663 		if (indexonly)
664 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
665 
666 		if (pages_fetched > 0)
667 		{
668 			min_IO_cost = spc_random_page_cost;
669 			if (pages_fetched > 1)
670 				min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
671 		}
672 		else
673 			min_IO_cost = 0;
674 	}
675 
676 	if (partial_path)
677 	{
678 		/*
679 		 * For index only scans compute workers based on number of index pages
680 		 * fetched; the number of heap pages we fetch might be so small as to
681 		 * effectively rule out parallelism, which we don't want to do.
682 		 */
683 		if (indexonly)
684 			rand_heap_pages = -1;
685 
686 		/*
		 * Estimate the number of parallel workers required to scan the
		 * index.  Use the number of heap pages computed above, bearing in
		 * mind that heap fetches won't be sequential: for parallel scans the
		 * pages are accessed in random order.
691 		 */
692 		path->path.parallel_workers = compute_parallel_worker(baserel,
693 															  rand_heap_pages,
694 															  index_pages,
695 															  max_parallel_workers_per_gather);
696 
697 		/*
698 		 * Fall out if workers can't be assigned for parallel scan, because in
699 		 * such a case this path will be rejected.  So there is no benefit in
700 		 * doing extra computation.
701 		 */
702 		if (path->path.parallel_workers <= 0)
703 			return;
704 
705 		path->path.parallel_aware = true;
706 	}
707 
708 	/*
709 	 * Now interpolate based on estimated index order correlation to get total
710 	 * disk I/O cost for main table accesses.
711 	 */
712 	csquared = indexCorrelation * indexCorrelation;
713 
714 	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
715 
716 	/*
717 	 * Estimate CPU costs per tuple.
718 	 *
719 	 * What we want here is cpu_tuple_cost plus the evaluation costs of any
720 	 * qual clauses that we have to evaluate as qpquals.
721 	 */
722 	cost_qual_eval(&qpqual_cost, qpquals, root);
723 
724 	startup_cost += qpqual_cost.startup;
725 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
726 
727 	cpu_run_cost += cpu_per_tuple * tuples_fetched;
728 
729 	/* tlist eval costs are paid per output row, not per tuple scanned */
730 	startup_cost += path->path.pathtarget->cost.startup;
731 	cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
732 
733 	/* Adjust costing for parallelism, if used. */
734 	if (path->path.parallel_workers > 0)
735 	{
736 		double		parallel_divisor = get_parallel_divisor(&path->path);
737 
738 		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
739 
740 		/* The CPU cost is divided among all the workers. */
741 		cpu_run_cost /= parallel_divisor;
742 	}
743 
744 	run_cost += cpu_run_cost;
745 
746 	path->path.startup_cost = startup_cost;
747 	path->path.total_cost = startup_cost + run_cost;
748 }
749 
750 /*
751  * extract_nonindex_conditions
752  *
753  * Given a list of quals to be enforced in an indexscan, extract the ones that
754  * will have to be applied as qpquals (ie, the index machinery won't handle
755  * them).  Here we detect only whether a qual clause is directly redundant
756  * with some indexclause.  If the index path is chosen for use, createplan.c
757  * will try a bit harder to get rid of redundant qual conditions; specifically
758  * it will see if quals can be proven to be implied by the indexquals.  But
759  * it does not seem worth the cycles to try to factor that in at this stage,
760  * since we're only trying to estimate qual eval costs.  Otherwise this must
761  * match the logic in create_indexscan_plan().
762  *
763  * qual_clauses, and the result, are lists of RestrictInfos.
764  * indexclauses is a list of IndexClauses.
765  */
766 static List *
extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
768 {
769 	List	   *result = NIL;
770 	ListCell   *lc;
771 
772 	foreach(lc, qual_clauses)
773 	{
774 		RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
775 
776 		if (rinfo->pseudoconstant)
777 			continue;			/* we may drop pseudoconstants here */
778 		if (is_redundant_with_indexclauses(rinfo, indexclauses))
779 			continue;			/* dup or derived from same EquivalenceClass */
780 		/* ... skip the predicate proof attempt createplan.c will try ... */
781 		result = lappend(result, rinfo);
782 	}
783 	return result;
784 }
785 
786 /*
787  * index_pages_fetched
788  *	  Estimate the number of pages actually fetched after accounting for
789  *	  cache effects.
790  *
791  * We use an approximation proposed by Mackert and Lohman, "Index Scans
792  * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
793  * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
794  * The Mackert and Lohman approximation is that the number of pages
795  * fetched is
796  *	PF =
797  *		min(2TNs/(2T+Ns), T)			when T <= b
798  *		2TNs/(2T+Ns)					when T > b and Ns <= 2Tb/(2T-b)
799  *		b + (Ns - 2Tb/(2T-b))*(T-b)/T	when T > b and Ns > 2Tb/(2T-b)
800  * where
801  *		T = # pages in table
802  *		N = # tuples in table
803  *		s = selectivity = fraction of table to be scanned
804  *		b = # buffer pages available (we include kernel space here)
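 *
 * For example (illustrative numbers only): with T = 100 pages, Ns = 50
 * tuples fetched, and b >= T, the first branch gives
 *	PF = min(2*100*50 / (2*100 + 50), 100) = min(40, 100) = 40 pages.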
805  *
806  * We assume that effective_cache_size is the total number of buffer pages
807  * available for the whole query, and pro-rate that space across all the
808  * tables in the query and the index currently under consideration.  (This
809  * ignores space needed for other indexes used by the query, but since we
810  * don't know which indexes will get used, we can't estimate that very well;
811  * and in any case counting all the tables may well be an overestimate, since
812  * depending on the join plan not all the tables may be scanned concurrently.)
813  *
814  * The product Ns is the number of tuples fetched; we pass in that
815  * product rather than calculating it here.  "pages" is the number of pages
816  * in the object under consideration (either an index or a table).
817  * "index_pages" is the amount to add to the total table space, which was
818  * computed for us by make_one_rel.
819  *
820  * Caller is expected to have ensured that tuples_fetched is greater than zero
821  * and rounded to integer (see clamp_row_est).  The result will likewise be
822  * greater than zero and integral.
823  */
824 double
index_pages_fetched(double tuples_fetched, BlockNumber pages,
826 					double index_pages, PlannerInfo *root)
827 {
828 	double		pages_fetched;
829 	double		total_pages;
830 	double		T,
831 				b;
832 
833 	/* T is # pages in table, but don't allow it to be zero */
834 	T = (pages > 1) ? (double) pages : 1.0;
835 
836 	/* Compute number of pages assumed to be competing for cache space */
837 	total_pages = root->total_table_pages + index_pages;
838 	total_pages = Max(total_pages, 1.0);
839 	Assert(T <= total_pages);
840 
841 	/* b is pro-rated share of effective_cache_size */
842 	b = (double) effective_cache_size * T / total_pages;
843 
844 	/* force it positive and integral */
845 	if (b <= 1.0)
846 		b = 1.0;
847 	else
848 		b = ceil(b);
849 
850 	/* This part is the Mackert and Lohman formula */
851 	if (T <= b)
852 	{
853 		pages_fetched =
854 			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
855 		if (pages_fetched >= T)
856 			pages_fetched = T;
857 		else
858 			pages_fetched = ceil(pages_fetched);
859 	}
860 	else
861 	{
862 		double		lim;
863 
864 		lim = (2.0 * T * b) / (2.0 * T - b);
865 		if (tuples_fetched <= lim)
866 		{
867 			pages_fetched =
868 				(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
869 		}
870 		else
871 		{
872 			pages_fetched =
873 				b + (tuples_fetched - lim) * (T - b) / T;
874 		}
875 		pages_fetched = ceil(pages_fetched);
876 	}
877 	return pages_fetched;
878 }
879 
880 /*
881  * get_indexpath_pages
882  *		Determine the total size of the indexes used in a bitmap index path.
883  *
884  * Note: if the same index is used more than once in a bitmap tree, we will
885  * count it multiple times, which perhaps is the wrong thing ... but it's
886  * not completely clear, and detecting duplicates is difficult, so ignore it
887  * for now.
888  */
889 static double
get_indexpath_pages(Path *bitmapqual)
891 {
892 	double		result = 0;
893 	ListCell   *l;
894 
895 	if (IsA(bitmapqual, BitmapAndPath))
896 	{
897 		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
898 
899 		foreach(l, apath->bitmapquals)
900 		{
901 			result += get_indexpath_pages((Path *) lfirst(l));
902 		}
903 	}
904 	else if (IsA(bitmapqual, BitmapOrPath))
905 	{
906 		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
907 
908 		foreach(l, opath->bitmapquals)
909 		{
910 			result += get_indexpath_pages((Path *) lfirst(l));
911 		}
912 	}
913 	else if (IsA(bitmapqual, IndexPath))
914 	{
915 		IndexPath  *ipath = (IndexPath *) bitmapqual;
916 
917 		result = (double) ipath->indexinfo->pages;
918 	}
919 	else
920 		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
921 
922 	return result;
923 }
924 
925 /*
926  * cost_bitmap_heap_scan
927  *	  Determines and returns the cost of scanning a relation using a bitmap
928  *	  index-then-heap plan.
929  *
930  * 'baserel' is the relation to be scanned
931  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
932  * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
933  * 'loop_count' is the number of repetitions of the indexscan to factor into
934  *		estimates of caching behavior
935  *
936  * Note: the component IndexPaths in bitmapqual should have been costed
937  * using the same loop_count.
938  */
939 void
cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
941 					  ParamPathInfo *param_info,
942 					  Path *bitmapqual, double loop_count)
943 {
944 	Cost		startup_cost = 0;
945 	Cost		run_cost = 0;
946 	Cost		indexTotalCost;
947 	QualCost	qpqual_cost;
948 	Cost		cpu_per_tuple;
949 	Cost		cost_per_page;
950 	Cost		cpu_run_cost;
951 	double		tuples_fetched;
952 	double		pages_fetched;
953 	double		spc_seq_page_cost,
954 				spc_random_page_cost;
955 	double		T;
956 
957 	/* Should only be applied to base relations */
958 	Assert(IsA(baserel, RelOptInfo));
959 	Assert(baserel->relid > 0);
960 	Assert(baserel->rtekind == RTE_RELATION);
961 
962 	/* Mark the path with the correct row estimate */
963 	if (param_info)
964 		path->rows = param_info->ppi_rows;
965 	else
966 		path->rows = baserel->rows;
967 
968 	if (!enable_bitmapscan)
969 		startup_cost += disable_cost;
970 
971 	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
972 										 loop_count, &indexTotalCost,
973 										 &tuples_fetched);
974 
975 	startup_cost += indexTotalCost;
976 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
977 
978 	/* Fetch estimated page costs for tablespace containing table. */
979 	get_tablespace_page_costs(baserel->reltablespace,
980 							  &spc_random_page_cost,
981 							  &spc_seq_page_cost);
982 
983 	/*
984 	 * For small numbers of pages we should charge spc_random_page_cost
985 	 * apiece, while if nearly all the table's pages are being read, it's more
986 	 * appropriate to charge spc_seq_page_cost apiece.  The effect is
987 	 * nonlinear, too. For lack of a better idea, interpolate like this to
988 	 * determine the cost per page.
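	 *
	 * As an illustration, with the default spc_random_page_cost = 4.0 and
	 * spc_seq_page_cost = 1.0, fetching one quarter of the table's pages
	 * gives cost_per_page = 4.0 - 3.0 * sqrt(0.25) = 2.5.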
989 	 */
990 	if (pages_fetched >= 2.0)
991 		cost_per_page = spc_random_page_cost -
992 			(spc_random_page_cost - spc_seq_page_cost)
993 			* sqrt(pages_fetched / T);
994 	else
995 		cost_per_page = spc_random_page_cost;
996 
997 	run_cost += pages_fetched * cost_per_page;
998 
999 	/*
1000 	 * Estimate CPU costs per tuple.
1001 	 *
1002 	 * Often the indexquals don't need to be rechecked at each tuple ... but
1003 	 * not always, especially not if there are enough tuples involved that the
1004 	 * bitmaps become lossy.  For the moment, just assume they will be
1005 	 * rechecked always.  This means we charge the full freight for all the
1006 	 * scan clauses.
1007 	 */
1008 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1009 
1010 	startup_cost += qpqual_cost.startup;
1011 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1012 	cpu_run_cost = cpu_per_tuple * tuples_fetched;
1013 
1014 	/* Adjust costing for parallelism, if used. */
1015 	if (path->parallel_workers > 0)
1016 	{
1017 		double		parallel_divisor = get_parallel_divisor(path);
1018 
1019 		/* The CPU cost is divided among all the workers. */
1020 		cpu_run_cost /= parallel_divisor;
1021 
1022 		path->rows = clamp_row_est(path->rows / parallel_divisor);
1023 	}
1024 
1025 
1026 	run_cost += cpu_run_cost;
1027 
1028 	/* tlist eval costs are paid per output row, not per tuple scanned */
1029 	startup_cost += path->pathtarget->cost.startup;
1030 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1031 
1032 	path->startup_cost = startup_cost;
1033 	path->total_cost = startup_cost + run_cost;
1034 }
1035 
1036 /*
1037  * cost_bitmap_tree_node
1038  *		Extract cost and selectivity from a bitmap tree node (index/and/or)
1039  */
1040 void
cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1042 {
1043 	if (IsA(path, IndexPath))
1044 	{
1045 		*cost = ((IndexPath *) path)->indextotalcost;
1046 		*selec = ((IndexPath *) path)->indexselectivity;
1047 
1048 		/*
1049 		 * Charge a small amount per retrieved tuple to reflect the costs of
1050 		 * manipulating the bitmap.  This is mostly to make sure that a bitmap
1051 		 * scan doesn't look to be the same cost as an indexscan to retrieve a
1052 		 * single tuple.
1053 		 */
1054 		*cost += 0.1 * cpu_operator_cost * path->rows;
1055 	}
1056 	else if (IsA(path, BitmapAndPath))
1057 	{
1058 		*cost = path->total_cost;
1059 		*selec = ((BitmapAndPath *) path)->bitmapselectivity;
1060 	}
1061 	else if (IsA(path, BitmapOrPath))
1062 	{
1063 		*cost = path->total_cost;
1064 		*selec = ((BitmapOrPath *) path)->bitmapselectivity;
1065 	}
1066 	else
1067 	{
1068 		elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1069 		*cost = *selec = 0;		/* keep compiler quiet */
1070 	}
1071 }
1072 
1073 /*
1074  * cost_bitmap_and_node
1075  *		Estimate the cost of a BitmapAnd node
1076  *
1077  * Note that this considers only the costs of index scanning and bitmap
1078  * creation, not the eventual heap access.  In that sense the object isn't
1079  * truly a Path, but it has enough path-like properties (costs in particular)
1080  * to warrant treating it as one.  We don't bother to set the path rows field,
1081  * however.
1082  */
1083 void
cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1085 {
1086 	Cost		totalCost;
1087 	Selectivity selec;
1088 	ListCell   *l;
1089 
1090 	/*
1091 	 * We estimate AND selectivity on the assumption that the inputs are
1092 	 * independent.  This is probably often wrong, but we don't have the info
1093 	 * to do better.
1094 	 *
1095 	 * The runtime cost of the BitmapAnd itself is estimated at 100x
1096 	 * cpu_operator_cost for each tbm_intersect needed.  Probably too small,
1097 	 * definitely too simplistic?
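	 *
	 * For example, two inputs with selectivities 0.1 and 0.2 yield a
	 * combined selectivity of 0.02, and the single tbm_intersect needed is
	 * charged 100 * cpu_operator_cost (0.25 with the default of 0.0025).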
1098 	 */
1099 	totalCost = 0.0;
1100 	selec = 1.0;
1101 	foreach(l, path->bitmapquals)
1102 	{
1103 		Path	   *subpath = (Path *) lfirst(l);
1104 		Cost		subCost;
1105 		Selectivity subselec;
1106 
1107 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1108 
1109 		selec *= subselec;
1110 
1111 		totalCost += subCost;
1112 		if (l != list_head(path->bitmapquals))
1113 			totalCost += 100.0 * cpu_operator_cost;
1114 	}
1115 	path->bitmapselectivity = selec;
1116 	path->path.rows = 0;		/* per above, not used */
1117 	path->path.startup_cost = totalCost;
1118 	path->path.total_cost = totalCost;
1119 }
1120 
1121 /*
1122  * cost_bitmap_or_node
1123  *		Estimate the cost of a BitmapOr node
1124  *
1125  * See comments for cost_bitmap_and_node.
1126  */
1127 void
cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1129 {
1130 	Cost		totalCost;
1131 	Selectivity selec;
1132 	ListCell   *l;
1133 
1134 	/*
1135 	 * We estimate OR selectivity on the assumption that the inputs are
1136 	 * non-overlapping, since that's often the case in "x IN (list)" type
1137 	 * situations.  Of course, we clamp to 1.0 at the end.
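	 *
	 * For example, inputs with selectivities 0.1 and 0.2 combine to 0.3,
	 * while 0.6 and 0.7 would sum to 1.3 and be clamped to 1.0.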
1138 	 *
1139 	 * The runtime cost of the BitmapOr itself is estimated at 100x
1140 	 * cpu_operator_cost for each tbm_union needed.  Probably too small,
1141 	 * definitely too simplistic?  We are aware that the tbm_unions are
1142 	 * optimized out when the inputs are BitmapIndexScans.
1143 	 */
1144 	totalCost = 0.0;
1145 	selec = 0.0;
1146 	foreach(l, path->bitmapquals)
1147 	{
1148 		Path	   *subpath = (Path *) lfirst(l);
1149 		Cost		subCost;
1150 		Selectivity subselec;
1151 
1152 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1153 
1154 		selec += subselec;
1155 
1156 		totalCost += subCost;
1157 		if (l != list_head(path->bitmapquals) &&
1158 			!IsA(subpath, IndexPath))
1159 			totalCost += 100.0 * cpu_operator_cost;
1160 	}
1161 	path->bitmapselectivity = Min(selec, 1.0);
1162 	path->path.rows = 0;		/* per above, not used */
1163 	path->path.startup_cost = totalCost;
1164 	path->path.total_cost = totalCost;
1165 }
1166 
1167 /*
1168  * cost_tidscan
1169  *	  Determines and returns the cost of scanning a relation using TIDs.
1170  *
1171  * 'baserel' is the relation to be scanned
1172  * 'tidquals' is the list of TID-checkable quals
1173  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1174  */
1175 void
cost_tidscan(Path *path, PlannerInfo *root,
1177 			 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1178 {
1179 	Cost		startup_cost = 0;
1180 	Cost		run_cost = 0;
1181 	bool		isCurrentOf = false;
1182 	QualCost	qpqual_cost;
1183 	Cost		cpu_per_tuple;
1184 	QualCost	tid_qual_cost;
1185 	int			ntuples;
1186 	ListCell   *l;
1187 	double		spc_random_page_cost;
1188 
1189 	/* Should only be applied to base relations */
1190 	Assert(baserel->relid > 0);
1191 	Assert(baserel->rtekind == RTE_RELATION);
1192 
1193 	/* Mark the path with the correct row estimate */
1194 	if (param_info)
1195 		path->rows = param_info->ppi_rows;
1196 	else
1197 		path->rows = baserel->rows;
1198 
1199 	/* Count how many tuples we expect to retrieve */
1200 	ntuples = 0;
1201 	foreach(l, tidquals)
1202 	{
1203 		RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1204 		Expr	   *qual = rinfo->clause;
1205 
1206 		if (IsA(qual, ScalarArrayOpExpr))
1207 		{
1208 			/* Each element of the array yields 1 tuple */
1209 			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1210 			Node	   *arraynode = (Node *) lsecond(saop->args);
1211 
1212 			ntuples += estimate_array_length(arraynode);
1213 		}
1214 		else if (IsA(qual, CurrentOfExpr))
1215 		{
1216 			/* CURRENT OF yields 1 tuple */
1217 			isCurrentOf = true;
1218 			ntuples++;
1219 		}
1220 		else
1221 		{
1222 			/* It's just CTID = something, count 1 tuple */
1223 			ntuples++;
1224 		}
1225 	}
1226 
1227 	/*
1228 	 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1229 	 * understands how to do it correctly.  Therefore, honor enable_tidscan
1230 	 * only when CURRENT OF isn't present.  Also note that cost_qual_eval
1231 	 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1232 	 * subtract off here; that's to prevent other plan types such as seqscan
1233 	 * from winning.
1234 	 */
1235 	if (isCurrentOf)
1236 	{
1237 		Assert(baserel->baserestrictcost.startup >= disable_cost);
1238 		startup_cost -= disable_cost;
1239 	}
1240 	else if (!enable_tidscan)
1241 		startup_cost += disable_cost;
1242 
1243 	/*
1244 	 * The TID qual expressions will be computed once, any other baserestrict
1245 	 * quals once per retrieved tuple.
1246 	 */
1247 	cost_qual_eval(&tid_qual_cost, tidquals, root);
1248 
1249 	/* fetch estimated page cost for tablespace containing table */
1250 	get_tablespace_page_costs(baserel->reltablespace,
1251 							  &spc_random_page_cost,
1252 							  NULL);
1253 
1254 	/* disk costs --- assume each tuple on a different page */
1255 	run_cost += spc_random_page_cost * ntuples;
1256 
1257 	/* Add scanning CPU costs */
1258 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1259 
1260 	/* XXX currently we assume TID quals are a subset of qpquals */
1261 	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1262 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1263 		tid_qual_cost.per_tuple;
1264 	run_cost += cpu_per_tuple * ntuples;
1265 
1266 	/* tlist eval costs are paid per output row, not per tuple scanned */
1267 	startup_cost += path->pathtarget->cost.startup;
1268 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1269 
1270 	path->startup_cost = startup_cost;
1271 	path->total_cost = startup_cost + run_cost;
1272 }
1273 
1274 /*
1275  * cost_subqueryscan
1276  *	  Determines and returns the cost of scanning a subquery RTE.
1277  *
1278  * 'baserel' is the relation to be scanned
1279  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1280  */
1281 void
cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1283 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1284 {
1285 	Cost		startup_cost;
1286 	Cost		run_cost;
1287 	QualCost	qpqual_cost;
1288 	Cost		cpu_per_tuple;
1289 
1290 	/* Should only be applied to base relations that are subqueries */
1291 	Assert(baserel->relid > 0);
1292 	Assert(baserel->rtekind == RTE_SUBQUERY);
1293 
1294 	/* Mark the path with the correct row estimate */
1295 	if (param_info)
1296 		path->path.rows = param_info->ppi_rows;
1297 	else
1298 		path->path.rows = baserel->rows;
1299 
1300 	/*
1301 	 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1302 	 * any restriction clauses and tlist that will be attached to the
1303 	 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1304 	 * projection overhead.
1305 	 */
1306 	path->path.startup_cost = path->subpath->startup_cost;
1307 	path->path.total_cost = path->subpath->total_cost;
1308 
1309 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1310 
1311 	startup_cost = qpqual_cost.startup;
1312 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1313 	run_cost = cpu_per_tuple * baserel->tuples;
1314 
1315 	/* tlist eval costs are paid per output row, not per tuple scanned */
1316 	startup_cost += path->path.pathtarget->cost.startup;
1317 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1318 
1319 	path->path.startup_cost += startup_cost;
1320 	path->path.total_cost += startup_cost + run_cost;
1321 }
1322 
1323 /*
1324  * cost_functionscan
1325  *	  Determines and returns the cost of scanning a function RTE.
1326  *
1327  * 'baserel' is the relation to be scanned
1328  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1329  */
1330 void
cost_functionscan(Path *path, PlannerInfo *root,
1332 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1333 {
1334 	Cost		startup_cost = 0;
1335 	Cost		run_cost = 0;
1336 	QualCost	qpqual_cost;
1337 	Cost		cpu_per_tuple;
1338 	RangeTblEntry *rte;
1339 	QualCost	exprcost;
1340 
1341 	/* Should only be applied to base relations that are functions */
1342 	Assert(baserel->relid > 0);
1343 	rte = planner_rt_fetch(baserel->relid, root);
1344 	Assert(rte->rtekind == RTE_FUNCTION);
1345 
1346 	/* Mark the path with the correct row estimate */
1347 	if (param_info)
1348 		path->rows = param_info->ppi_rows;
1349 	else
1350 		path->rows = baserel->rows;
1351 
1352 	/*
1353 	 * Estimate costs of executing the function expression(s).
1354 	 *
1355 	 * Currently, nodeFunctionscan.c always executes the functions to
1356 	 * completion before returning any rows, and caches the results in a
1357 	 * tuplestore.  So the function eval cost is all startup cost, and per-row
1358 	 * costs are minimal.
1359 	 *
1360 	 * XXX in principle we ought to charge tuplestore spill costs if the
1361 	 * number of rows is large.  However, given how phony our rowcount
1362 	 * estimates for functions tend to be, there's not a lot of point in that
1363 	 * refinement right now.
1364 	 */
1365 	cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1366 
1367 	startup_cost += exprcost.startup + exprcost.per_tuple;
1368 
1369 	/* Add scanning CPU costs */
1370 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1371 
1372 	startup_cost += qpqual_cost.startup;
1373 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1374 	run_cost += cpu_per_tuple * baserel->tuples;
1375 
1376 	/* tlist eval costs are paid per output row, not per tuple scanned */
1377 	startup_cost += path->pathtarget->cost.startup;
1378 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1379 
1380 	path->startup_cost = startup_cost;
1381 	path->total_cost = startup_cost + run_cost;
1382 }
1383 
1384 /*
1385  * cost_tablefuncscan
1386  *	  Determines and returns the cost of scanning a table function.
1387  *
1388  * 'baserel' is the relation to be scanned
1389  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1390  */
1391 void
cost_tablefuncscan(Path *path, PlannerInfo *root,
1393 				   RelOptInfo *baserel, ParamPathInfo *param_info)
1394 {
1395 	Cost		startup_cost = 0;
1396 	Cost		run_cost = 0;
1397 	QualCost	qpqual_cost;
1398 	Cost		cpu_per_tuple;
1399 	RangeTblEntry *rte;
1400 	QualCost	exprcost;
1401 
1402 	/* Should only be applied to base relations that are functions */
1403 	Assert(baserel->relid > 0);
1404 	rte = planner_rt_fetch(baserel->relid, root);
1405 	Assert(rte->rtekind == RTE_TABLEFUNC);
1406 
1407 	/* Mark the path with the correct row estimate */
1408 	if (param_info)
1409 		path->rows = param_info->ppi_rows;
1410 	else
1411 		path->rows = baserel->rows;
1412 
1413 	/*
1414 	 * Estimate costs of executing the table func expression(s).
1415 	 *
1416 	 * XXX in principle we ought to charge tuplestore spill costs if the
1417 	 * number of rows is large.  However, given how phony our rowcount
1418 	 * estimates for tablefuncs tend to be, there's not a lot of point in that
1419 	 * refinement right now.
1420 	 */
1421 	cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1422 
1423 	startup_cost += exprcost.startup + exprcost.per_tuple;
1424 
1425 	/* Add scanning CPU costs */
1426 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1427 
1428 	startup_cost += qpqual_cost.startup;
1429 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1430 	run_cost += cpu_per_tuple * baserel->tuples;
1431 
1432 	/* tlist eval costs are paid per output row, not per tuple scanned */
1433 	startup_cost += path->pathtarget->cost.startup;
1434 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1435 
1436 	path->startup_cost = startup_cost;
1437 	path->total_cost = startup_cost + run_cost;
1438 }
1439 
1440 /*
1441  * cost_valuesscan
1442  *	  Determines and returns the cost of scanning a VALUES RTE.
1443  *
1444  * 'baserel' is the relation to be scanned
1445  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1446  */
1447 void
cost_valuesscan(Path *path, PlannerInfo *root,
1449 				RelOptInfo *baserel, ParamPathInfo *param_info)
1450 {
1451 	Cost		startup_cost = 0;
1452 	Cost		run_cost = 0;
1453 	QualCost	qpqual_cost;
1454 	Cost		cpu_per_tuple;
1455 
1456 	/* Should only be applied to base relations that are values lists */
1457 	Assert(baserel->relid > 0);
1458 	Assert(baserel->rtekind == RTE_VALUES);
1459 
1460 	/* Mark the path with the correct row estimate */
1461 	if (param_info)
1462 		path->rows = param_info->ppi_rows;
1463 	else
1464 		path->rows = baserel->rows;
1465 
1466 	/*
1467 	 * For now, estimate list evaluation cost at one operator eval per list
1468 	 * (probably pretty bogus, but is it worth being smarter?)
1469 	 */
1470 	cpu_per_tuple = cpu_operator_cost;
1471 
1472 	/* Add scanning CPU costs */
1473 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1474 
1475 	startup_cost += qpqual_cost.startup;
1476 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1477 	run_cost += cpu_per_tuple * baserel->tuples;
1478 
1479 	/* tlist eval costs are paid per output row, not per tuple scanned */
1480 	startup_cost += path->pathtarget->cost.startup;
1481 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1482 
1483 	path->startup_cost = startup_cost;
1484 	path->total_cost = startup_cost + run_cost;
1485 }
1486 
1487 /*
1488  * cost_ctescan
1489  *	  Determines and returns the cost of scanning a CTE RTE.
1490  *
1491  * Note: this is used for both self-reference and regular CTEs; the
1492  * possible cost differences are below the threshold of what we could
1493  * estimate accurately anyway.  Note that the costs of evaluating the
1494  * referenced CTE query are added into the final plan as initplan costs,
1495  * and should NOT be counted here.
1496  */
1497 void
cost_ctescan(Path *path, PlannerInfo *root,
1499 			 RelOptInfo *baserel, ParamPathInfo *param_info)
1500 {
1501 	Cost		startup_cost = 0;
1502 	Cost		run_cost = 0;
1503 	QualCost	qpqual_cost;
1504 	Cost		cpu_per_tuple;
1505 
1506 	/* Should only be applied to base relations that are CTEs */
1507 	Assert(baserel->relid > 0);
1508 	Assert(baserel->rtekind == RTE_CTE);
1509 
1510 	/* Mark the path with the correct row estimate */
1511 	if (param_info)
1512 		path->rows = param_info->ppi_rows;
1513 	else
1514 		path->rows = baserel->rows;
1515 
1516 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1517 	cpu_per_tuple = cpu_tuple_cost;
1518 
1519 	/* Add scanning CPU costs */
1520 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1521 
1522 	startup_cost += qpqual_cost.startup;
1523 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1524 	run_cost += cpu_per_tuple * baserel->tuples;
1525 
1526 	/* tlist eval costs are paid per output row, not per tuple scanned */
1527 	startup_cost += path->pathtarget->cost.startup;
1528 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1529 
1530 	path->startup_cost = startup_cost;
1531 	path->total_cost = startup_cost + run_cost;
1532 }
1533 
1534 /*
1535  * cost_namedtuplestorescan
1536  *	  Determines and returns the cost of scanning a named tuplestore.
1537  */
1538 void
1539 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1540 						 RelOptInfo *baserel, ParamPathInfo *param_info)
1541 {
1542 	Cost		startup_cost = 0;
1543 	Cost		run_cost = 0;
1544 	QualCost	qpqual_cost;
1545 	Cost		cpu_per_tuple;
1546 
1547 	/* Should only be applied to base relations that are Tuplestores */
1548 	Assert(baserel->relid > 0);
1549 	Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1550 
1551 	/* Mark the path with the correct row estimate */
1552 	if (param_info)
1553 		path->rows = param_info->ppi_rows;
1554 	else
1555 		path->rows = baserel->rows;
1556 
1557 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1558 	cpu_per_tuple = cpu_tuple_cost;
1559 
1560 	/* Add scanning CPU costs */
1561 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1562 
1563 	startup_cost += qpqual_cost.startup;
1564 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1565 	run_cost += cpu_per_tuple * baserel->tuples;
1566 
1567 	path->startup_cost = startup_cost;
1568 	path->total_cost = startup_cost + run_cost;
1569 }
1570 
1571 /*
1572  * cost_resultscan
1573  *	  Determines and returns the cost of scanning an RTE_RESULT relation.
1574  */
1575 void
1576 cost_resultscan(Path *path, PlannerInfo *root,
1577 				RelOptInfo *baserel, ParamPathInfo *param_info)
1578 {
1579 	Cost		startup_cost = 0;
1580 	Cost		run_cost = 0;
1581 	QualCost	qpqual_cost;
1582 	Cost		cpu_per_tuple;
1583 
1584 	/* Should only be applied to RTE_RESULT base relations */
1585 	Assert(baserel->relid > 0);
1586 	Assert(baserel->rtekind == RTE_RESULT);
1587 
1588 	/* Mark the path with the correct row estimate */
1589 	if (param_info)
1590 		path->rows = param_info->ppi_rows;
1591 	else
1592 		path->rows = baserel->rows;
1593 
1594 	/* We charge qual cost plus cpu_tuple_cost */
1595 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1596 
1597 	startup_cost += qpqual_cost.startup;
1598 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1599 	run_cost += cpu_per_tuple * baserel->tuples;
1600 
1601 	path->startup_cost = startup_cost;
1602 	path->total_cost = startup_cost + run_cost;
1603 }
1604 
1605 /*
1606  * cost_recursive_union
1607  *	  Determines and returns the cost of performing a recursive union,
1608  *	  and also the estimated output size.
1609  *
1610  * We are given Paths for the nonrecursive and recursive terms.
1611  */
1612 void
1613 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1614 {
1615 	Cost		startup_cost;
1616 	Cost		total_cost;
1617 	double		total_rows;
1618 
1619 	/* We probably have decent estimates for the non-recursive term */
1620 	startup_cost = nrterm->startup_cost;
1621 	total_cost = nrterm->total_cost;
1622 	total_rows = nrterm->rows;
1623 
1624 	/*
1625 	 * We arbitrarily assume that about 10 recursive iterations will be
1626 	 * needed, and that we've managed to get a good fix on the cost and output
1627 	 * size of each one of them.  These are mighty shaky assumptions but it's
1628 	 * hard to see how to do better.
1629 	 */
1630 	total_cost += 10 * rterm->total_cost;
1631 	total_rows += 10 * rterm->rows;
1632 
1633 	/*
1634 	 * Also charge cpu_tuple_cost per row to account for the costs of
1635 	 * manipulating the tuplestores.  (We don't worry about possible
1636 	 * spill-to-disk costs.)
1637 	 */
1638 	total_cost += cpu_tuple_cost * total_rows;
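	/*
	 * For illustration: with a nonrecursive term estimated at 100 rows and
	 * total cost 10, and a recursive term at 20 rows and cost 5, the
	 * assumptions above give total_rows = 100 + 10 * 20 = 300 and, at the
	 * default cpu_tuple_cost of 0.01, total_cost = 10 + 10 * 5 + 0.01 * 300
	 * = 63.
	 */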
1639 
1640 	runion->startup_cost = startup_cost;
1641 	runion->total_cost = total_cost;
1642 	runion->rows = total_rows;
1643 	runion->pathtarget->width = Max(nrterm->pathtarget->width,
1644 									rterm->pathtarget->width);
1645 }
1646 
1647 /*
1648  * cost_sort
1649  *	  Determines and returns the cost of sorting a relation, including
1650  *	  the cost of reading the input data.
1651  *
1652  * If the total volume of data to sort is less than sort_mem, we will do
1653  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1654  * comparisons for t tuples.
1655  *
1656  * If the total volume exceeds sort_mem, we switch to a tape-style merge
1657  * algorithm.  There will still be about t*log2(t) tuple comparisons in
1658  * total, but we will also need to write and read each tuple once per
1659  * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
1660  * number of initial runs formed and M is the merge order used by tuplesort.c.
1661  * Since the average initial run should be about sort_mem, we have
1662  *		disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
1663  *		cpu = comparison_cost * t * log2(t)
1664  *
1665  * If the sort is bounded (i.e., only the first k result tuples are needed)
1666  * and k tuples can fit into sort_mem, we use a heap method that keeps only
1667  * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1668  *
1669  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1670  * accesses (XXX can't we refine that guess?)
1671  *
1672  * By default, we charge two operator evals per tuple comparison, which should
1673  * be in the right ballpark in most cases.  The caller can tweak this by
1674  * specifying nonzero comparison_cost; typically that's used for any extra
1675  * work that has to be done to prepare the inputs to the comparison operators.
1676  *
1677  * 'pathkeys' is a list of sort keys
1678  * 'input_cost' is the total cost for reading the input data
1679  * 'tuples' is the number of tuples in the relation
1680  * 'width' is the average tuple width in bytes
1681  * 'comparison_cost' is the extra cost per comparison, if any
1682  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1683  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1684  *
1685  * NOTE: some callers currently pass NIL for pathkeys because they
1686  * can't conveniently supply the sort keys.  Since this routine doesn't
1687  * currently do anything with pathkeys anyway, that doesn't matter...
1688  * but if it ever does, it should react gracefully to lack of key data.
1689  * (Actually, the thing we'd most likely be interested in is just the number
1690  * of sort keys, which all callers *could* supply.)
1691  */
1692 void
1693 cost_sort(Path *path, PlannerInfo *root,
1694 		  List *pathkeys, Cost input_cost, double tuples, int width,
1695 		  Cost comparison_cost, int sort_mem,
1696 		  double limit_tuples)
1697 {
1698 	Cost		startup_cost = input_cost;
1699 	Cost		run_cost = 0;
1700 	double		input_bytes = relation_byte_size(tuples, width);
1701 	double		output_bytes;
1702 	double		output_tuples;
1703 	long		sort_mem_bytes = sort_mem * 1024L;
1704 
1705 	if (!enable_sort)
1706 		startup_cost += disable_cost;
1707 
1708 	path->rows = tuples;
1709 
1710 	/*
1711 	 * We want to be sure the cost of a sort is never estimated as zero, even
1712 	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
1713 	 */
1714 	if (tuples < 2.0)
1715 		tuples = 2.0;
1716 
1717 	/* Include the default cost-per-comparison */
1718 	comparison_cost += 2.0 * cpu_operator_cost;
1719 
1720 	/* Do we have a useful LIMIT? */
1721 	if (limit_tuples > 0 && limit_tuples < tuples)
1722 	{
1723 		output_tuples = limit_tuples;
1724 		output_bytes = relation_byte_size(output_tuples, width);
1725 	}
1726 	else
1727 	{
1728 		output_tuples = tuples;
1729 		output_bytes = input_bytes;
1730 	}
1731 
1732 	if (output_bytes > sort_mem_bytes)
1733 	{
1734 		/*
1735 		 * We'll have to use a disk-based sort of all the tuples
1736 		 */
1737 		double		npages = ceil(input_bytes / BLCKSZ);
1738 		double		nruns = input_bytes / sort_mem_bytes;
1739 		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
1740 		double		log_runs;
1741 		double		npageaccesses;
1742 
1743 		/*
1744 		 * CPU costs
1745 		 *
1746 		 * Assume about N log2 N comparisons
1747 		 */
1748 		startup_cost += comparison_cost * tuples * LOG2(tuples);
1749 
1750 		/* Disk costs */
1751 
1752 		/* Compute logM(r) as log(r) / log(M) */
1753 		if (nruns > mergeorder)
1754 			log_runs = ceil(log(nruns) / log(mergeorder));
1755 		else
1756 			log_runs = 1.0;
1757 		npageaccesses = 2.0 * npages * log_runs;
1758 		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
1759 		startup_cost += npageaccesses *
1760 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
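		/*
		 * For illustration, the default seq_page_cost = 1.0 and
		 * random_page_cost = 4.0 make each page access cost
		 * 0.75 * 1.0 + 0.25 * 4.0 = 1.75.  Sorting ~100MB with a 4MB
		 * sort_mem forms about 25 initial runs; if that is within the merge
		 * order, a single merge pass (log_runs = 1.0) reads and writes each
		 * of the ~12800 pages once, adding about 2 * 12800 * 1.75 = 44800
		 * to startup_cost.
		 */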
1761 	}
1762 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1763 	{
1764 		/*
1765 		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1766 		 * a total number of tuple comparisons of N log2 K; but the constant
1767 		 * factor is a bit higher than for quicksort.  Tweak it so that the
1768 		 * cost curve is continuous at the crossover point.
1769 		 */
1770 		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
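		/*
		 * Note that at the crossover point tuples == 2 * output_tuples, so
		 * LOG2(2.0 * output_tuples) equals LOG2(tuples) and this charge
		 * matches the plain-quicksort branch below, as promised above.
		 */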
1771 	}
1772 	else
1773 	{
1774 		/* We'll use plain quicksort on all the input tuples */
1775 		startup_cost += comparison_cost * tuples * LOG2(tuples);
1776 	}
1777 
1778 	/*
1779 	 * Also charge a small amount (arbitrarily set equal to operator cost) per
1780 	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
1781 	 * doesn't do qual-checking or projection, so it has less overhead than
1782 	 * most plan nodes.  Note it's correct to use tuples not output_tuples
1783 	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1784 	 * counting the LIMIT otherwise.
1785 	 */
1786 	run_cost += cpu_operator_cost * tuples;
1787 
1788 	path->startup_cost = startup_cost;
1789 	path->total_cost = startup_cost + run_cost;
1790 }
1791 
1792 /*
1793  * append_nonpartial_cost
1794  *	  Estimate the cost of the non-partial paths in a Parallel Append.
1795  *	  The non-partial paths are assumed to be the first "numpaths" paths
1796  *	  from the subpaths list, and to be in order of decreasing cost.
1797  */
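/*
 * For example: with parallel_workers = 3 and five non-partial subpaths whose
 * total costs are 10, 8, 6, 4 and 2 (decreasing order), the first three are
 * claimed by separate workers, giving costarr = {10, 8, 6}.  The remaining
 * costs are added to the currently cheapest slot: {10, 8, 10}, then
 * {10, 10, 10}, so the function returns 10.
 */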
1798 static Cost
1799 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1800 {
1801 	Cost	   *costarr;
1802 	int			arrlen;
1803 	ListCell   *l;
1804 	ListCell   *cell;
1805 	int			i;
1806 	int			path_index;
1807 	int			min_index;
1808 	int			max_index;
1809 
1810 	if (numpaths == 0)
1811 		return 0;
1812 
1813 	/*
1814 	 * Array length is the number of workers or the number of relevant paths,
1815 	 * whichever is less.
1816 	 */
1817 	arrlen = Min(parallel_workers, numpaths);
1818 	costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1819 
1820 	/* The first few paths will each be claimed by a different worker. */
1821 	path_index = 0;
1822 	foreach(cell, subpaths)
1823 	{
1824 		Path	   *subpath = (Path *) lfirst(cell);
1825 
1826 		if (path_index == arrlen)
1827 			break;
1828 		costarr[path_index++] = subpath->total_cost;
1829 	}
1830 
1831 	/*
1832 	 * Since subpaths are sorted by decreasing cost, the last one will have
1833 	 * the minimum cost.
1834 	 */
1835 	min_index = arrlen - 1;
1836 
1837 	/*
1838 	 * For each of the remaining subpaths, add its cost to the array element
1839 	 * with minimum cost.
1840 	 */
1841 	for_each_cell(l, cell)
1842 	{
1843 		Path	   *subpath = (Path *) lfirst(l);
1844 		int			i;
1845 
1846 		/* Consider only the non-partial paths */
1847 		if (path_index++ == numpaths)
1848 			break;
1849 
1850 		costarr[min_index] += subpath->total_cost;
1851 
1852 		/* Update the new min cost array index */
1853 		for (min_index = i = 0; i < arrlen; i++)
1854 		{
1855 			if (costarr[i] < costarr[min_index])
1856 				min_index = i;
1857 		}
1858 	}
1859 
1860 	/* Return the highest cost from the array */
1861 	for (max_index = i = 0; i < arrlen; i++)
1862 	{
1863 		if (costarr[i] > costarr[max_index])
1864 			max_index = i;
1865 	}
1866 
1867 	return costarr[max_index];
1868 }
1869 
1870 /*
1871  * cost_append
1872  *	  Determines and returns the cost of an Append node.
1873  */
1874 void
1875 cost_append(AppendPath *apath)
1876 {
1877 	ListCell   *l;
1878 
1879 	apath->path.startup_cost = 0;
1880 	apath->path.total_cost = 0;
1881 	apath->path.rows = 0;
1882 
1883 	if (apath->subpaths == NIL)
1884 		return;
1885 
1886 	if (!apath->path.parallel_aware)
1887 	{
1888 		List	   *pathkeys = apath->path.pathkeys;
1889 
1890 		if (pathkeys == NIL)
1891 		{
1892 			Path	   *subpath = (Path *) linitial(apath->subpaths);
1893 
1894 			/*
1895 			 * For an unordered, non-parallel-aware Append we take the startup
1896 			 * cost as the startup cost of the first subpath.
1897 			 */
1898 			apath->path.startup_cost = subpath->startup_cost;
1899 
1900 			/* Compute rows and costs as sums of subplan rows and costs. */
1901 			foreach(l, apath->subpaths)
1902 			{
1903 				Path	   *subpath = (Path *) lfirst(l);
1904 
1905 				apath->path.rows += subpath->rows;
1906 				apath->path.total_cost += subpath->total_cost;
1907 			}
1908 		}
1909 		else
1910 		{
1911 			/*
1912 			 * For an ordered, non-parallel-aware Append we take the startup
1913 			 * cost as the sum of the subpath startup costs.  This ensures
1914 			 * that we don't underestimate the startup cost when a query's
1915 			 * LIMIT is such that several of the children have to be run to
1916 			 * satisfy it.  This might be overkill --- another plausible hack
1917 			 * would be to take the Append's startup cost as the maximum of
1918 			 * the child startup costs.  But we don't want to risk believing
1919 			 * that an ORDER BY LIMIT query can be satisfied at small cost
1920 			 * when the first child has small startup cost but later ones
1921 			 * don't.  (If we had the ability to deal with nonlinear cost
1922 			 * interpolation for partial retrievals, we would not need to be
1923 			 * so conservative about this.)
1924 			 *
1925 			 * This case is also different from the above in that we have to
1926 			 * account for possibly injecting sorts into subpaths that aren't
1927 			 * natively ordered.
1928 			 */
1929 			foreach(l, apath->subpaths)
1930 			{
1931 				Path	   *subpath = (Path *) lfirst(l);
1932 				Path		sort_path;	/* dummy for result of cost_sort */
1933 
1934 				if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
1935 				{
1936 					/*
1937 					 * We'll need to insert a Sort node, so include costs for
1938 					 * that.  We can use the parent's LIMIT if any, since we
1939 					 * certainly won't pull more than that many tuples from
1940 					 * any child.
1941 					 */
1942 					cost_sort(&sort_path,
1943 							  NULL, /* doesn't currently need root */
1944 							  pathkeys,
1945 							  subpath->total_cost,
1946 							  subpath->rows,
1947 							  subpath->pathtarget->width,
1948 							  0.0,
1949 							  work_mem,
1950 							  apath->limit_tuples);
1951 					subpath = &sort_path;
1952 				}
1953 
1954 				apath->path.rows += subpath->rows;
1955 				apath->path.startup_cost += subpath->startup_cost;
1956 				apath->path.total_cost += subpath->total_cost;
1957 			}
1958 		}
1959 	}
1960 	else						/* parallel-aware */
1961 	{
1962 		int			i = 0;
1963 		double		parallel_divisor = get_parallel_divisor(&apath->path);
1964 
1965 		/* Parallel-aware Append never produces ordered output. */
1966 		Assert(apath->path.pathkeys == NIL);
1967 
1968 		/* Calculate startup cost. */
1969 		foreach(l, apath->subpaths)
1970 		{
1971 			Path	   *subpath = (Path *) lfirst(l);
1972 
1973 			/*
1974 			 * Append will start returning tuples when the child node having
1975 			 * lowest startup cost is done setting up. We consider only the
1976 			 * first few subplans that immediately get a worker assigned.
1977 			 */
1978 			if (i == 0)
1979 				apath->path.startup_cost = subpath->startup_cost;
1980 			else if (i < apath->path.parallel_workers)
1981 				apath->path.startup_cost = Min(apath->path.startup_cost,
1982 											   subpath->startup_cost);
1983 
1984 			/*
1985 			 * Apply parallel divisor to subpaths.  Scale the number of rows
1986 			 * for each partial subpath based on the ratio of the parallel
1987 			 * divisor originally used for the subpath to the one we adopted.
1988 			 * Also add the cost of partial paths to the total cost, but
1989 			 * ignore non-partial paths for now.
1990 			 */
1991 			if (i < apath->first_partial_path)
1992 				apath->path.rows += subpath->rows / parallel_divisor;
1993 			else
1994 			{
1995 				double		subpath_parallel_divisor;
1996 
1997 				subpath_parallel_divisor = get_parallel_divisor(subpath);
1998 				apath->path.rows += subpath->rows * (subpath_parallel_divisor /
1999 													 parallel_divisor);
2000 				apath->path.total_cost += subpath->total_cost;
2001 			}
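			/*
			 * For example, a partial subpath originally planned for 2
			 * workers has a smaller parallel divisor than an Append running
			 * with 4 workers (roughly 2.4 vs. 4.0 under the usual
			 * leader-participation rule), so its per-worker row estimate is
			 * scaled down by about 2.4 / 4.0 here.
			 */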
2002 
2003 			apath->path.rows = clamp_row_est(apath->path.rows);
2004 
2005 			i++;
2006 		}
2007 
2008 		/* Add cost for non-partial subpaths. */
2009 		apath->path.total_cost +=
2010 			append_nonpartial_cost(apath->subpaths,
2011 								   apath->first_partial_path,
2012 								   apath->path.parallel_workers);
2013 	}
2014 
2015 	/*
2016 	 * Although Append does not do any selection or projection, it's not free;
2017 	 * add a small per-tuple overhead.
2018 	 */
2019 	apath->path.total_cost +=
2020 		cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2021 }
2022 
2023 /*
2024  * cost_merge_append
2025  *	  Determines and returns the cost of a MergeAppend node.
2026  *
2027  * MergeAppend merges several pre-sorted input streams, using a heap that
2028  * at any given instant holds the next tuple from each stream.  If there
2029  * are N streams, we need about N*log2(N) tuple comparisons to construct
2030  * the heap at startup, and then for each output tuple, about log2(N)
2031  * comparisons to replace the top entry.
2032  *
2033  * (The effective value of N will drop once some of the input streams are
2034  * exhausted, but it seems unlikely to be worth trying to account for that.)
2035  *
2036  * The heap is never spilled to disk, since we assume N is not very large.
2037  * So this is much simpler than cost_sort.
2038  *
2039  * As in cost_sort, we charge two operator evals per tuple comparison.
2040  *
2041  * 'pathkeys' is a list of sort keys
2042  * 'n_streams' is the number of input streams
2043  * 'input_startup_cost' is the sum of the input streams' startup costs
2044  * 'input_total_cost' is the sum of the input streams' total costs
2045  * 'tuples' is the number of tuples in all the streams
2046  */
2047 void
2048 cost_merge_append(Path *path, PlannerInfo *root,
2049 				  List *pathkeys, int n_streams,
2050 				  Cost input_startup_cost, Cost input_total_cost,
2051 				  double tuples)
2052 {
2053 	Cost		startup_cost = 0;
2054 	Cost		run_cost = 0;
2055 	Cost		comparison_cost;
2056 	double		N;
2057 	double		logN;
2058 
2059 	/*
2060 	 * Avoid log(0)...
2061 	 */
2062 	N = (n_streams < 2) ? 2.0 : (double) n_streams;
2063 	logN = LOG2(N);
2064 
2065 	/* Assumed cost per tuple comparison */
2066 	comparison_cost = 2.0 * cpu_operator_cost;
2067 
2068 	/* Heap creation cost */
2069 	startup_cost += comparison_cost * N * logN;
2070 
2071 	/* Per-tuple heap maintenance cost */
2072 	run_cost += tuples * comparison_cost * logN;
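	/*
	 * For example, merging N = 8 streams gives logN = 3, so heap creation
	 * costs about 8 * 3 = 24 comparisons and each output tuple about 3
	 * comparisons; at the default cpu_operator_cost of 0.0025 that is a
	 * startup charge of 0.12 and 0.015 per tuple.
	 */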
2073 
2074 	/*
2075 	 * Although MergeAppend does not do any selection or projection, it's not
2076 	 * free; add a small per-tuple overhead.
2077 	 */
2078 	run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
2079 
2080 	path->startup_cost = startup_cost + input_startup_cost;
2081 	path->total_cost = startup_cost + run_cost + input_total_cost;
2082 }
2083 
2084 /*
2085  * cost_material
2086  *	  Determines and returns the cost of materializing a relation, including
2087  *	  the cost of reading the input data.
2088  *
2089  * If the total volume of data to materialize exceeds work_mem, we will need
2090  * to write it to disk, so the cost is much higher in that case.
2091  *
2092  * Note that here we are estimating the costs for the first scan of the
2093  * relation, so the materialization is all overhead --- any savings will
2094  * occur only on rescan, which is estimated in cost_rescan.
2095  */
2096 void
2097 cost_material(Path *path,
2098 			  Cost input_startup_cost, Cost input_total_cost,
2099 			  double tuples, int width)
2100 {
2101 	Cost		startup_cost = input_startup_cost;
2102 	Cost		run_cost = input_total_cost - input_startup_cost;
2103 	double		nbytes = relation_byte_size(tuples, width);
2104 	long		work_mem_bytes = work_mem * 1024L;
2105 
2106 	path->rows = tuples;
2107 
2108 	/*
2109 	 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2110 	 * reflect bookkeeping overhead.  (This rate must be more than what
2111 	 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2112 	 * if it is exactly the same then there will be a cost tie between
2113 	 * nestloop with A outer, materialized B inner and nestloop with B outer,
2114 	 * materialized A inner.  The extra cost ensures we'll prefer
2115 	 * materializing the smaller rel.)	Note that this is normally a good deal
2116 	 * less than cpu_tuple_cost, which is OK because a Material plan node
2117 	 * doesn't do qual-checking or projection, so it's got less overhead than
2118 	 * most plan nodes.
2119 	 */
2120 	run_cost += 2 * cpu_operator_cost * tuples;
2121 
2122 	/*
2123 	 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2124 	 * This cost is assumed to be evenly spread through the plan run phase,
2125 	 * which isn't exactly accurate but our cost model doesn't allow for
2126 	 * nonuniform costs within the run phase.
2127 	 */
2128 	if (nbytes > work_mem_bytes)
2129 	{
2130 		double		npages = ceil(nbytes / BLCKSZ);
2131 
2132 		run_cost += seq_page_cost * npages;
2133 	}
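	/*
	 * For example, materializing one million rows of width 100 (roughly
	 * 100MB-plus once per-tuple overhead is included) with a 4MB work_mem
	 * spills to disk, so we charge seq_page_cost for each of the resulting
	 * 8KB pages (over ten thousand of them), on top of the
	 * 2 * 0.0025 * 1e6 = 5000 bookkeeping charge at the default
	 * cpu_operator_cost.
	 */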
2134 
2135 	path->startup_cost = startup_cost;
2136 	path->total_cost = startup_cost + run_cost;
2137 }
2138 
2139 /*
2140  * cost_agg
2141  *		Determines and returns the cost of performing an Agg plan node,
2142  *		including the cost of its input.
2143  *
2144  * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2145  * we are using a hashed Agg node just to do grouping).
2146  *
2147  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2148  * are for appropriately-sorted input.
2149  */
2150 void
2151 cost_agg(Path *path, PlannerInfo *root,
2152 		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2153 		 int numGroupCols, double numGroups,
2154 		 List *quals,
2155 		 Cost input_startup_cost, Cost input_total_cost,
2156 		 double input_tuples)
2157 {
2158 	double		output_tuples;
2159 	Cost		startup_cost;
2160 	Cost		total_cost;
2161 	AggClauseCosts dummy_aggcosts;
2162 
2163 	/* Use all-zero per-aggregate costs if NULL is passed */
2164 	if (aggcosts == NULL)
2165 	{
2166 		Assert(aggstrategy == AGG_HASHED);
2167 		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2168 		aggcosts = &dummy_aggcosts;
2169 	}
2170 
2171 	/*
2172 	 * The transCost.per_tuple component of aggcosts should be charged once
2173 	 * per input tuple, corresponding to the costs of evaluating the aggregate
2174 	 * transfns and their input expressions. The finalCost.per_tuple component
2175 	 * is charged once per output tuple, corresponding to the costs of
2176 	 * evaluating the finalfns.  Startup costs are of course charged but once.
2177 	 *
2178 	 * If we are grouping, we charge an additional cpu_operator_cost per
2179 	 * grouping column per input tuple for grouping comparisons.
2180 	 *
2181 	 * We will produce a single output tuple if not grouping, and a tuple per
2182 	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
2183 	 *
2184 	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2185 	 * same total CPU cost, but AGG_SORTED has lower startup cost.  If the
2186 	 * input path is already sorted appropriately, AGG_SORTED should be
2187 	 * preferred (since it has no risk of memory overflow).  This will happen
2188 	 * as long as the computed total costs are indeed exactly equal --- but if
2189 	 * there's roundoff error we might do the wrong thing.  So be sure that
2190 	 * the computations below form the same intermediate values in the same
2191 	 * order.
2192 	 */
2193 	if (aggstrategy == AGG_PLAIN)
2194 	{
2195 		startup_cost = input_total_cost;
2196 		startup_cost += aggcosts->transCost.startup;
2197 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2198 		startup_cost += aggcosts->finalCost.startup;
2199 		startup_cost += aggcosts->finalCost.per_tuple;
2200 		/* we aren't grouping */
2201 		total_cost = startup_cost + cpu_tuple_cost;
2202 		output_tuples = 1;
2203 	}
2204 	else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2205 	{
2206 		/* Here we are able to deliver output on-the-fly */
2207 		startup_cost = input_startup_cost;
2208 		total_cost = input_total_cost;
2209 		if (aggstrategy == AGG_MIXED && !enable_hashagg)
2210 		{
2211 			startup_cost += disable_cost;
2212 			total_cost += disable_cost;
2213 		}
2214 		/* calcs phrased this way to match HASHED case, see note above */
2215 		total_cost += aggcosts->transCost.startup;
2216 		total_cost += aggcosts->transCost.per_tuple * input_tuples;
2217 		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2218 		total_cost += aggcosts->finalCost.startup;
2219 		total_cost += aggcosts->finalCost.per_tuple * numGroups;
2220 		total_cost += cpu_tuple_cost * numGroups;
2221 		output_tuples = numGroups;
2222 	}
2223 	else
2224 	{
2225 		/* must be AGG_HASHED */
2226 		startup_cost = input_total_cost;
2227 		if (!enable_hashagg)
2228 			startup_cost += disable_cost;
2229 		startup_cost += aggcosts->transCost.startup;
2230 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2231 		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2232 		startup_cost += aggcosts->finalCost.startup;
2233 		total_cost = startup_cost;
2234 		total_cost += aggcosts->finalCost.per_tuple * numGroups;
2235 		total_cost += cpu_tuple_cost * numGroups;
2236 		output_tuples = numGroups;
2237 	}
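	/*
	 * Illustrative numbers for the hashed case: 10000 input tuples grouped
	 * on two columns into 100 groups add 0.0025 * 2 * 10000 = 50 to startup
	 * for the grouping comparisons (at the default cpu_operator_cost),
	 * besides the per-input-tuple transfn costs; output then costs
	 * 100 * (finalCost.per_tuple + cpu_tuple_cost) more.
	 */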
2238 
2239 	/*
2240 	 * If there are quals (HAVING quals), account for their cost and
2241 	 * selectivity.
2242 	 */
2243 	if (quals)
2244 	{
2245 		QualCost	qual_cost;
2246 
2247 		cost_qual_eval(&qual_cost, quals, root);
2248 		startup_cost += qual_cost.startup;
2249 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2250 
2251 		output_tuples = clamp_row_est(output_tuples *
2252 									  clauselist_selectivity(root,
2253 															 quals,
2254 															 0,
2255 															 JOIN_INNER,
2256 															 NULL));
2257 	}
2258 
2259 	path->rows = output_tuples;
2260 	path->startup_cost = startup_cost;
2261 	path->total_cost = total_cost;
2262 }
2263 
2264 /*
2265  * cost_windowagg
2266  *		Determines and returns the cost of performing a WindowAgg plan node,
2267  *		including the cost of its input.
2268  *
2269  * Input is assumed already properly sorted.
2270  */
2271 void
2272 cost_windowagg(Path *path, PlannerInfo *root,
2273 			   List *windowFuncs, int numPartCols, int numOrderCols,
2274 			   Cost input_startup_cost, Cost input_total_cost,
2275 			   double input_tuples)
2276 {
2277 	Cost		startup_cost;
2278 	Cost		total_cost;
2279 	ListCell   *lc;
2280 
2281 	startup_cost = input_startup_cost;
2282 	total_cost = input_total_cost;
2283 
2284 	/*
2285 	 * Window functions are assumed to cost their stated execution cost, plus
2286 	 * the cost of evaluating their input expressions, per tuple.  Since they
2287 	 * may in fact evaluate their inputs at multiple rows during each cycle,
2288 	 * this could be a drastic underestimate; but without a way to know how
2289 	 * many rows the window function will fetch, it's hard to do better.  In
2290 	 * any case, it's a good estimate for all the built-in window functions,
2291 	 * so we'll just do this for now.
2292 	 */
2293 	foreach(lc, windowFuncs)
2294 	{
2295 		WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2296 		Cost		wfunccost;
2297 		QualCost	argcosts;
2298 
2299 		argcosts.startup = argcosts.per_tuple = 0;
2300 		add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2301 						  &argcosts);
2302 		startup_cost += argcosts.startup;
2303 		wfunccost = argcosts.per_tuple;
2304 
2305 		/* also add the input expressions' cost to per-input-row costs */
2306 		cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2307 		startup_cost += argcosts.startup;
2308 		wfunccost += argcosts.per_tuple;
2309 
2310 		/*
2311 		 * Add the filter's cost to per-input-row costs.  XXX We should reduce
2312 		 * input expression costs according to filter selectivity.
2313 		 */
2314 		cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2315 		startup_cost += argcosts.startup;
2316 		wfunccost += argcosts.per_tuple;
2317 
2318 		total_cost += wfunccost * input_tuples;
2319 	}
2320 
2321 	/*
2322 	 * We also charge cpu_operator_cost per grouping column per tuple for
2323 	 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2324 	 * overhead.
2325 	 *
2326 	 * XXX this neglects costs of spooling the data to disk when it overflows
2327 	 * work_mem.  Sooner or later that should get accounted for.
2328 	 */
2329 	total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2330 	total_cost += cpu_tuple_cost * input_tuples;
2331 
2332 	path->rows = input_tuples;
2333 	path->startup_cost = startup_cost;
2334 	path->total_cost = total_cost;
2335 }
2336 
2337 /*
2338  * cost_group
2339  *		Determines and returns the cost of performing a Group plan node,
2340  *		including the cost of its input.
2341  *
2342  * Note: caller must ensure that input costs are for appropriately-sorted
2343  * input.
2344  */
2345 void
2346 cost_group(Path *path, PlannerInfo *root,
2347 		   int numGroupCols, double numGroups,
2348 		   List *quals,
2349 		   Cost input_startup_cost, Cost input_total_cost,
2350 		   double input_tuples)
2351 {
2352 	double		output_tuples;
2353 	Cost		startup_cost;
2354 	Cost		total_cost;
2355 
2356 	output_tuples = numGroups;
2357 	startup_cost = input_startup_cost;
2358 	total_cost = input_total_cost;
2359 
2360 	/*
2361 	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2362 	 * all columns get compared for most of the tuples.
2363 	 */
2364 	total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2365 
2366 	/*
2367 	 * If there are quals (HAVING quals), account for their cost and
2368 	 * selectivity.
2369 	 */
2370 	if (quals)
2371 	{
2372 		QualCost	qual_cost;
2373 
2374 		cost_qual_eval(&qual_cost, quals, root);
2375 		startup_cost += qual_cost.startup;
2376 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2377 
2378 		output_tuples = clamp_row_est(output_tuples *
2379 									  clauselist_selectivity(root,
2380 															 quals,
2381 															 0,
2382 															 JOIN_INNER,
2383 															 NULL));
2384 	}
2385 
2386 	path->rows = output_tuples;
2387 	path->startup_cost = startup_cost;
2388 	path->total_cost = total_cost;
2389 }
2390 
2391 /*
2392  * initial_cost_nestloop
2393  *	  Preliminary estimate of the cost of a nestloop join path.
2394  *
2395  * This must quickly produce lower-bound estimates of the path's startup and
2396  * total costs.  If we are unable to eliminate the proposed path from
2397  * consideration using the lower bounds, final_cost_nestloop will be called
2398  * to obtain the final estimates.
2399  *
2400  * The exact division of labor between this function and final_cost_nestloop
2401  * is private to them, and represents a tradeoff between speed of the initial
2402  * estimate and getting a tight lower bound.  We choose to not examine the
2403  * join quals here, since that's by far the most expensive part of the
2404  * calculations.  The end result is that CPU-cost considerations must be
2405  * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2406  * incorporation of the inner path's run cost.
2407  *
2408  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2409  *		other data to be used by final_cost_nestloop
2410  * 'jointype' is the type of join to be performed
2411  * 'outer_path' is the outer input to the join
2412  * 'inner_path' is the inner input to the join
2413  * 'extra' contains miscellaneous information about the join
2414  */
2415 void
2416 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2417 					  JoinType jointype,
2418 					  Path *outer_path, Path *inner_path,
2419 					  JoinPathExtraData *extra)
2420 {
2421 	Cost		startup_cost = 0;
2422 	Cost		run_cost = 0;
2423 	double		outer_path_rows = outer_path->rows;
2424 	Cost		inner_rescan_start_cost;
2425 	Cost		inner_rescan_total_cost;
2426 	Cost		inner_run_cost;
2427 	Cost		inner_rescan_run_cost;
2428 
2429 	/* estimate costs to rescan the inner relation */
2430 	cost_rescan(root, inner_path,
2431 				&inner_rescan_start_cost,
2432 				&inner_rescan_total_cost);
2433 
2434 	/* cost of source data */
2435 
2436 	/*
2437 	 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2438 	 * before we can start returning tuples, so the join's startup cost is
2439 	 * their sum.  We'll also pay the inner path's rescan startup cost
2440 	 * multiple times.
2441 	 */
2442 	startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2443 	run_cost += outer_path->total_cost - outer_path->startup_cost;
2444 	if (outer_path_rows > 1)
2445 		run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2446 
2447 	inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2448 	inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2449 
2450 	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2451 		extra->inner_unique)
2452 	{
2453 		/*
2454 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2455 		 * executor will stop after the first match.
2456 		 *
2457 		 * Getting decent estimates requires inspection of the join quals,
2458 		 * which we choose to postpone to final_cost_nestloop.
2459 		 */
2460 
2461 		/* Save private data for final_cost_nestloop */
2462 		workspace->inner_run_cost = inner_run_cost;
2463 		workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2464 	}
2465 	else
2466 	{
2467 		/* Normal case; we'll scan whole input rel for each outer row */
2468 		run_cost += inner_run_cost;
2469 		if (outer_path_rows > 1)
2470 			run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2471 	}
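	/*
	 * In the normal case, for example with 1000 outer rows, the charges
	 * above amount to one full inner scan (its startup and run cost) plus
	 * 999 rescans, each costed at the possibly cheaper startup and run
	 * figures returned by cost_rescan.
	 */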
2472 
2473 	/* CPU costs left for later */
2474 
2475 	/* Public result fields */
2476 	workspace->startup_cost = startup_cost;
2477 	workspace->total_cost = startup_cost + run_cost;
2478 	/* Save private data for final_cost_nestloop */
2479 	workspace->run_cost = run_cost;
2480 }
2481 
2482 /*
2483  * final_cost_nestloop
2484  *	  Final estimate of the cost and result size of a nestloop join path.
2485  *
2486  * 'path' is already filled in except for the rows and cost fields
2487  * 'workspace' is the result from initial_cost_nestloop
2488  * 'extra' contains miscellaneous information about the join
2489  */
2490 void
2491 final_cost_nestloop(PlannerInfo *root, NestPath *path,
2492 					JoinCostWorkspace *workspace,
2493 					JoinPathExtraData *extra)
2494 {
2495 	Path	   *outer_path = path->outerjoinpath;
2496 	Path	   *inner_path = path->innerjoinpath;
2497 	double		outer_path_rows = outer_path->rows;
2498 	double		inner_path_rows = inner_path->rows;
2499 	Cost		startup_cost = workspace->startup_cost;
2500 	Cost		run_cost = workspace->run_cost;
2501 	Cost		cpu_per_tuple;
2502 	QualCost	restrict_qual_cost;
2503 	double		ntuples;
2504 
2505 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2506 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2507 		outer_path_rows = 1;
2508 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2509 		inner_path_rows = 1;
2510 
2511 	/* Mark the path with the correct row estimate */
2512 	if (path->path.param_info)
2513 		path->path.rows = path->path.param_info->ppi_rows;
2514 	else
2515 		path->path.rows = path->path.parent->rows;
2516 
2517 	/* For partial paths, scale row estimate. */
2518 	if (path->path.parallel_workers > 0)
2519 	{
2520 		double		parallel_divisor = get_parallel_divisor(&path->path);
2521 
2522 		path->path.rows =
2523 			clamp_row_est(path->path.rows / parallel_divisor);
2524 	}
2525 
2526 	/*
2527 	 * We could include disable_cost in the preliminary estimate, but that
2528 	 * would amount to optimizing for the case where the join method is
2529 	 * disabled, which doesn't seem like the way to bet.
2530 	 */
2531 	if (!enable_nestloop)
2532 		startup_cost += disable_cost;
2533 
2534 	/* cost of inner-relation source data (we already dealt with outer rel) */
2535 
2536 	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
2537 		extra->inner_unique)
2538 	{
2539 		/*
2540 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2541 		 * executor will stop after the first match.
2542 		 */
2543 		Cost		inner_run_cost = workspace->inner_run_cost;
2544 		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
2545 		double		outer_matched_rows;
2546 		double		outer_unmatched_rows;
2547 		Selectivity inner_scan_frac;
2548 
2549 		/*
2550 		 * For an outer-rel row that has at least one match, we can expect the
2551 		 * inner scan to stop after a fraction 1/(match_count+1) of the inner
2552 		 * rows, if the matches are evenly distributed.  Since they probably
2553 		 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
2554 		 * that fraction.  (If we used a larger fuzz factor, we'd have to
2555 		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
2556 		 * least 1, no such clamp is needed now.)
2557 		 */
2558 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
2559 		outer_unmatched_rows = outer_path_rows - outer_matched_rows;
2560 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
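		/*
		 * For example, if each matched outer row is expected to find 3 inner
		 * matches (match_count = 3), inner_scan_frac = 2.0 / 4.0 = 0.5, i.e.
		 * we assume about half of the inner rows are scanned before the
		 * first match is found.
		 */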
2561 
2562 		/*
2563 		 * Compute number of tuples processed (not number emitted!).  First,
2564 		 * account for successfully-matched outer rows.
2565 		 */
2566 		ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
2567 
2568 		/*
2569 		 * Now we need to estimate the actual costs of scanning the inner
2570 		 * relation, which may be quite a bit less than N times inner_run_cost
2571 		 * due to early scan stops.  We consider two cases.  If the inner path
2572 		 * is an indexscan using all the joinquals as indexquals, then an
2573 		 * unmatched outer row results in an indexscan returning no rows,
2574 		 * which is probably quite cheap.  Otherwise, the executor will have
2575 		 * to scan the whole inner rel for an unmatched row; not so cheap.
2576 		 */
2577 		if (has_indexed_join_quals(path))
2578 		{
2579 			/*
2580 			 * Successfully-matched outer rows will only require scanning
2581 			 * inner_scan_frac of the inner relation.  In this case, we don't
2582 			 * need to charge the full inner_run_cost even when that's more
2583 			 * than inner_rescan_run_cost, because we can assume that none of
2584 			 * the inner scans ever scan the whole inner relation.  So it's
2585 			 * okay to assume that all the inner scan executions can be
2586 			 * fractions of the full cost, even if materialization is reducing
2587 			 * the rescan cost.  At this writing, it's impossible to get here
2588 			 * for a materialized inner scan, so inner_run_cost and
2589 			 * inner_rescan_run_cost will be the same anyway; but just in
2590 			 * case, use inner_run_cost for the first matched tuple and
2591 			 * inner_rescan_run_cost for additional ones.
2592 			 */
2593 			run_cost += inner_run_cost * inner_scan_frac;
2594 			if (outer_matched_rows > 1)
2595 				run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
2596 
2597 			/*
2598 			 * Add the cost of inner-scan executions for unmatched outer rows.
2599 			 * We estimate this as the same cost as returning the first tuple
2600 			 * of a nonempty scan.  We consider that these are all rescans,
2601 			 * since we used inner_run_cost once already.
2602 			 */
2603 			run_cost += outer_unmatched_rows *
2604 				inner_rescan_run_cost / inner_path_rows;
2605 
2606 			/*
2607 			 * We won't be evaluating any quals at all for unmatched rows, so
2608 			 * don't add them to ntuples.
2609 			 */
2610 		}
2611 		else
2612 		{
2613 			/*
2614 			 * Here, a complicating factor is that rescans may be cheaper than
2615 			 * first scans.  If we never scan all the way to the end of the
2616 			 * inner rel, it might be (depending on the plan type) that we'd
2617 			 * never pay the whole inner first-scan run cost.  However it is
2618 			 * difficult to estimate whether that will happen (and it could
2619 			 * not happen if there are any unmatched outer rows!), so be
2620 			 * conservative and always charge the whole first-scan cost once.
2621 			 * We consider this charge to correspond to the first unmatched
2622 			 * outer row, unless there isn't one in our estimate, in which
2623 			 * case blame it on the first matched row.
2624 			 */
2625 
2626 			/* First, count all unmatched join tuples as being processed */
2627 			ntuples += outer_unmatched_rows * inner_path_rows;
2628 
2629 			/* Now add the forced full scan, and decrement appropriate count */
2630 			run_cost += inner_run_cost;
2631 			if (outer_unmatched_rows >= 1)
2632 				outer_unmatched_rows -= 1;
2633 			else
2634 				outer_matched_rows -= 1;
2635 
2636 			/* Add inner run cost for additional outer tuples having matches */
2637 			if (outer_matched_rows > 0)
2638 				run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
2639 
2640 			/* Add inner run cost for additional unmatched outer tuples */
2641 			if (outer_unmatched_rows > 0)
2642 				run_cost += outer_unmatched_rows * inner_rescan_run_cost;
2643 		}
2644 	}
2645 	else
2646 	{
2647 		/* Normal-case source costs were included in preliminary estimate */
2648 
2649 		/* Compute number of tuples processed (not number emitted!) */
2650 		ntuples = outer_path_rows * inner_path_rows;
2651 	}
2652 
2653 	/* CPU costs */
2654 	cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
2655 	startup_cost += restrict_qual_cost.startup;
2656 	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
2657 	run_cost += cpu_per_tuple * ntuples;
2658 
2659 	/* tlist eval costs are paid per output row, not per tuple scanned */
2660 	startup_cost += path->path.pathtarget->cost.startup;
2661 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
2662 
2663 	path->path.startup_cost = startup_cost;
2664 	path->path.total_cost = startup_cost + run_cost;
2665 }
2666 
2667 /*
2668  * initial_cost_mergejoin
2669  *	  Preliminary estimate of the cost of a mergejoin path.
2670  *
2671  * This must quickly produce lower-bound estimates of the path's startup and
2672  * total costs.  If we are unable to eliminate the proposed path from
2673  * consideration using the lower bounds, final_cost_mergejoin will be called
2674  * to obtain the final estimates.
2675  *
2676  * The exact division of labor between this function and final_cost_mergejoin
2677  * is private to them, and represents a tradeoff between speed of the initial
2678  * estimate and getting a tight lower bound.  We choose to not examine the
2679  * join quals here, except for obtaining the scan selectivity estimate which
2680  * is really essential (but fortunately, use of caching keeps the cost of
2681  * getting that down to something reasonable).
2682  * We also assume that cost_sort is cheap enough to use here.
2683  *
2684  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2685  *		other data to be used by final_cost_mergejoin
2686  * 'jointype' is the type of join to be performed
2687  * 'mergeclauses' is the list of joinclauses to be used as merge clauses
2688  * 'outer_path' is the outer input to the join
2689  * 'inner_path' is the inner input to the join
2690  * 'outersortkeys' is the list of sort keys for the outer path
2691  * 'innersortkeys' is the list of sort keys for the inner path
2692  * 'extra' contains miscellaneous information about the join
2693  *
2694  * Note: outersortkeys and innersortkeys should be NIL if no explicit
2695  * sort is needed because the respective source path is already ordered.
2696  */
2697 void
2698 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
2699 					   JoinType jointype,
2700 					   List *mergeclauses,
2701 					   Path *outer_path, Path *inner_path,
2702 					   List *outersortkeys, List *innersortkeys,
2703 					   JoinPathExtraData *extra)
2704 {
2705 	Cost		startup_cost = 0;
2706 	Cost		run_cost = 0;
2707 	double		outer_path_rows = outer_path->rows;
2708 	double		inner_path_rows = inner_path->rows;
2709 	Cost		inner_run_cost;
2710 	double		outer_rows,
2711 				inner_rows,
2712 				outer_skip_rows,
2713 				inner_skip_rows;
2714 	Selectivity outerstartsel,
2715 				outerendsel,
2716 				innerstartsel,
2717 				innerendsel;
2718 	Path		sort_path;		/* dummy for result of cost_sort */
2719 
2720 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2721 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2722 		outer_path_rows = 1;
2723 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2724 		inner_path_rows = 1;
2725 
2726 	/*
2727 	 * A merge join will stop as soon as it exhausts either input stream
2728 	 * (unless it's an outer join, in which case the outer side has to be
2729 	 * scanned all the way anyway).  Estimate fraction of the left and right
2730 	 * inputs that will actually need to be scanned.  Likewise, we can
2731 	 * estimate the number of rows that will be skipped before the first join
2732 	 * pair is found, which should be factored into startup cost. We use only
2733 	 * the first (most significant) merge clause for this purpose. Since
2734 	 * mergejoinscansel() is a fairly expensive computation, we cache the
2735 	 * results in the merge clause RestrictInfo.
2736 	 */
2737 	if (mergeclauses && jointype != JOIN_FULL)
2738 	{
2739 		RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
2740 		List	   *opathkeys;
2741 		List	   *ipathkeys;
2742 		PathKey    *opathkey;
2743 		PathKey    *ipathkey;
2744 		MergeScanSelCache *cache;
2745 
2746 		/* Get the input pathkeys to determine the sort-order details */
2747 		opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
2748 		ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
2749 		Assert(opathkeys);
2750 		Assert(ipathkeys);
2751 		opathkey = (PathKey *) linitial(opathkeys);
2752 		ipathkey = (PathKey *) linitial(ipathkeys);
2753 		/* debugging check */
2754 		if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
2755 			opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
2756 			opathkey->pk_strategy != ipathkey->pk_strategy ||
2757 			opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
2758 			elog(ERROR, "left and right pathkeys do not match in mergejoin");
2759 
2760 		/* Get the selectivity with caching */
2761 		cache = cached_scansel(root, firstclause, opathkey);
2762 
2763 		if (bms_is_subset(firstclause->left_relids,
2764 						  outer_path->parent->relids))
2765 		{
2766 			/* left side of clause is outer */
2767 			outerstartsel = cache->leftstartsel;
2768 			outerendsel = cache->leftendsel;
2769 			innerstartsel = cache->rightstartsel;
2770 			innerendsel = cache->rightendsel;
2771 		}
2772 		else
2773 		{
2774 			/* left side of clause is inner */
2775 			outerstartsel = cache->rightstartsel;
2776 			outerendsel = cache->rightendsel;
2777 			innerstartsel = cache->leftstartsel;
2778 			innerendsel = cache->leftendsel;
2779 		}
2780 		if (jointype == JOIN_LEFT ||
2781 			jointype == JOIN_ANTI)
2782 		{
2783 			outerstartsel = 0.0;
2784 			outerendsel = 1.0;
2785 		}
2786 		else if (jointype == JOIN_RIGHT)
2787 		{
2788 			innerstartsel = 0.0;
2789 			innerendsel = 1.0;
2790 		}
2791 	}
2792 	else
2793 	{
2794 		/* cope with clauseless or full mergejoin */
2795 		outerstartsel = innerstartsel = 0.0;
2796 		outerendsel = innerendsel = 1.0;
2797 	}
2798 
2799 	/*
2800 	 * Convert selectivities to row counts.  We force outer_rows and
2801 	 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
2802 	 */
2803 	outer_skip_rows = rint(outer_path_rows * outerstartsel);
2804 	inner_skip_rows = rint(inner_path_rows * innerstartsel);
2805 	outer_rows = clamp_row_est(outer_path_rows * outerendsel);
2806 	inner_rows = clamp_row_est(inner_path_rows * innerendsel);
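	/*
	 * For instance, with 1000 outer rows and the first merge clause giving
	 * outerstartsel = 0.1 and outerendsel = 0.6, we expect to skip about 100
	 * outer rows before the first join pair and to stop after scanning about
	 * 600 of them.
	 */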
2807 
2808 	/* skip rows can become NaN when path rows has become infinite */
2809 	Assert(outer_skip_rows <= outer_rows || isnan(outer_skip_rows));
2810 	Assert(inner_skip_rows <= inner_rows || isnan(inner_skip_rows));
2811 
2812 	/*
2813 	 * Readjust scan selectivities to account for above rounding.  This is
2814 	 * normally an insignificant effect, but when there are only a few rows in
2815 	 * the inputs, failing to do this makes for a large percentage error.
2816 	 */
2817 	outerstartsel = outer_skip_rows / outer_path_rows;
2818 	innerstartsel = inner_skip_rows / inner_path_rows;
2819 	outerendsel = outer_rows / outer_path_rows;
2820 	innerendsel = inner_rows / inner_path_rows;
2821 
2822 	/* start sel can become NaN when path rows has become infinite */
2823 	Assert(outerstartsel <= outerendsel || isnan(outerstartsel));
2824 	Assert(innerstartsel <= innerendsel || isnan(innerstartsel));
2825 
2826 	/* cost of source data */
2827 
2828 	if (outersortkeys)			/* do we need to sort outer? */
2829 	{
2830 		cost_sort(&sort_path,
2831 				  root,
2832 				  outersortkeys,
2833 				  outer_path->total_cost,
2834 				  outer_path_rows,
2835 				  outer_path->pathtarget->width,
2836 				  0.0,
2837 				  work_mem,
2838 				  -1.0);
2839 		startup_cost += sort_path.startup_cost;
2840 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2841 			* outerstartsel;
2842 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
2843 			* (outerendsel - outerstartsel);
2844 	}
2845 	else
2846 	{
2847 		startup_cost += outer_path->startup_cost;
2848 		startup_cost += (outer_path->total_cost - outer_path->startup_cost)
2849 			* outerstartsel;
2850 		run_cost += (outer_path->total_cost - outer_path->startup_cost)
2851 			* (outerendsel - outerstartsel);
2852 	}
2853 
2854 	if (innersortkeys)			/* do we need to sort inner? */
2855 	{
2856 		cost_sort(&sort_path,
2857 				  root,
2858 				  innersortkeys,
2859 				  inner_path->total_cost,
2860 				  inner_path_rows,
2861 				  inner_path->pathtarget->width,
2862 				  0.0,
2863 				  work_mem,
2864 				  -1.0);
2865 		startup_cost += sort_path.startup_cost;
2866 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2867 			* innerstartsel;
2868 		inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
2869 			* (innerendsel - innerstartsel);
2870 	}
2871 	else
2872 	{
2873 		startup_cost += inner_path->startup_cost;
2874 		startup_cost += (inner_path->total_cost - inner_path->startup_cost)
2875 			* innerstartsel;
2876 		inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
2877 			* (innerendsel - innerstartsel);
2878 	}
2879 
2880 	/*
2881 	 * We can't yet determine whether rescanning occurs, or whether
2882 	 * materialization of the inner input should be done.  The minimum
2883 	 * possible inner input cost, regardless of rescan and materialization
2884 	 * considerations, is inner_run_cost.  We include that in
2885 	 * workspace->total_cost, but not yet in run_cost.
2886 	 */
2887 
2888 	/* CPU costs left for later */
2889 
2890 	/* Public result fields */
2891 	workspace->startup_cost = startup_cost;
2892 	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
2893 	/* Save private data for final_cost_mergejoin */
2894 	workspace->run_cost = run_cost;
2895 	workspace->inner_run_cost = inner_run_cost;
2896 	workspace->outer_rows = outer_rows;
2897 	workspace->inner_rows = inner_rows;
2898 	workspace->outer_skip_rows = outer_skip_rows;
2899 	workspace->inner_skip_rows = inner_skip_rows;
2900 }
2901 
2902 /*
2903  * final_cost_mergejoin
2904  *	  Final estimate of the cost and result size of a mergejoin path.
2905  *
2906  * Unlike other costsize functions, this routine makes two actual decisions:
2907  * whether the executor will need to do mark/restore, and whether we should
2908  * materialize the inner path.  It would be logically cleaner to build
2909  * separate paths testing these alternatives, but that would require repeating
2910  * most of the cost calculations, which are not all that cheap.  Since the
2911  * choice will not affect output pathkeys or startup cost, only total cost,
2912  * there is no possibility of wanting to keep more than one path.  So it seems
2913  * best to make the decisions here and record them in the path's
2914  * skip_mark_restore and materialize_inner fields.
2915  *
2916  * Mark/restore overhead is usually required, but can be skipped if we know
2917  * that the executor need find only one match per outer tuple, and that the
2918  * mergeclauses are sufficient to identify a match.
2919  *
2920  * We materialize the inner path if we need mark/restore and either the inner
2921  * path can't support mark/restore, or it's cheaper to use an interposed
2922  * Material node to handle mark/restore.
2923  *
2924  * 'path' is already filled in except for the rows and cost fields and
2925  *		skip_mark_restore and materialize_inner
2926  * 'workspace' is the result from initial_cost_mergejoin
2927  * 'extra' contains miscellaneous information about the join
2928  */
2929 void
2930 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
2931 					 JoinCostWorkspace *workspace,
2932 					 JoinPathExtraData *extra)
2933 {
2934 	Path	   *outer_path = path->jpath.outerjoinpath;
2935 	Path	   *inner_path = path->jpath.innerjoinpath;
2936 	double		inner_path_rows = inner_path->rows;
2937 	List	   *mergeclauses = path->path_mergeclauses;
2938 	List	   *innersortkeys = path->innersortkeys;
2939 	Cost		startup_cost = workspace->startup_cost;
2940 	Cost		run_cost = workspace->run_cost;
2941 	Cost		inner_run_cost = workspace->inner_run_cost;
2942 	double		outer_rows = workspace->outer_rows;
2943 	double		inner_rows = workspace->inner_rows;
2944 	double		outer_skip_rows = workspace->outer_skip_rows;
2945 	double		inner_skip_rows = workspace->inner_skip_rows;
2946 	Cost		cpu_per_tuple,
2947 				bare_inner_cost,
2948 				mat_inner_cost;
2949 	QualCost	merge_qual_cost;
2950 	QualCost	qp_qual_cost;
2951 	double		mergejointuples,
2952 				rescannedtuples;
2953 	double		rescanratio;
2954 
2955 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2956 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2957 		inner_path_rows = 1;
2958 
2959 	/* Mark the path with the correct row estimate */
2960 	if (path->jpath.path.param_info)
2961 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
2962 	else
2963 		path->jpath.path.rows = path->jpath.path.parent->rows;
2964 
2965 	/* For partial paths, scale row estimate. */
2966 	if (path->jpath.path.parallel_workers > 0)
2967 	{
2968 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
2969 
2970 		path->jpath.path.rows =
2971 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
2972 	}
2973 
2974 	/*
2975 	 * We could include disable_cost in the preliminary estimate, but that
2976 	 * would amount to optimizing for the case where the join method is
2977 	 * disabled, which doesn't seem like the way to bet.
2978 	 */
2979 	if (!enable_mergejoin)
2980 		startup_cost += disable_cost;
2981 
2982 	/*
2983 	 * Compute cost of the mergequals and qpquals (other restriction clauses)
2984 	 * separately.
2985 	 */
2986 	cost_qual_eval(&merge_qual_cost, mergeclauses, root);
2987 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
2988 	qp_qual_cost.startup -= merge_qual_cost.startup;
2989 	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
2990 
2991 	/*
2992 	 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2993 	 * executor will stop scanning for matches after the first match.  When
2994 	 * all the joinclauses are merge clauses, this means we don't ever need to
2995 	 * back up the merge, and so we can skip mark/restore overhead.
2996 	 */
2997 	if ((path->jpath.jointype == JOIN_SEMI ||
2998 		 path->jpath.jointype == JOIN_ANTI ||
2999 		 extra->inner_unique) &&
3000 		(list_length(path->jpath.joinrestrictinfo) ==
3001 		 list_length(path->path_mergeclauses)))
3002 		path->skip_mark_restore = true;
3003 	else
3004 		path->skip_mark_restore = false;
3005 
3006 	/*
3007 	 * Get approx # tuples passing the mergequals.  We use approx_tuple_count
3008 	 * here because we need an estimate done with JOIN_INNER semantics.
3009 	 */
3010 	mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3011 
3012 	/*
3013 	 * When there are equal merge keys in the outer relation, the mergejoin
3014 	 * must rescan any matching tuples in the inner relation. This means
3015 	 * re-fetching inner tuples; we have to estimate how often that happens.
3016 	 *
3017 	 * For regular inner and outer joins, the number of re-fetches can be
3018 	 * estimated approximately as size of merge join output minus size of
3019 	 * inner relation. Assume that the distinct key values are 1, 2, ..., and
3020 	 * denote the number of values of each key in the outer relation as m1,
3021 	 * m2, ...; in the inner relation, n1, n2, ...  Then we have
3022 	 *
3023 	 * size of join = m1 * n1 + m2 * n2 + ...
3024 	 *
3025 	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
3026 	 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
3027 	 * relation
3028 	 *
3029 	 * This equation works correctly for outer tuples having no inner match
3030 	 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3031 	 * are effectively subtracting those from the number of rescanned tuples,
3032 	 * when we should not.  Can we do better without expensive selectivity
3033 	 * computations?
3034 	 *
3035 	 * The whole issue is moot if we are working from a unique-ified outer
3036 	 * input, or if we know we don't need to mark/restore at all.
3037 	 */
3038 	if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3039 		rescannedtuples = 0;
3040 	else
3041 	{
3042 		rescannedtuples = mergejointuples - inner_path_rows;
3043 		/* Must clamp because of possible underestimate */
3044 		if (rescannedtuples < 0)
3045 			rescannedtuples = 0;
3046 	}
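	/*
	 * Illustration of the formula above (hypothetical numbers): with outer
	 * key multiplicities m1 = 3, m2 = 1 and inner multiplicities n1 = 2,
	 * n2 = 4, the join produces 3*2 + 1*4 = 10 rows while the inner side has
	 * 2 + 4 = 6 rows, so we expect about 10 - 6 = 4 re-fetched inner tuples.
	 */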
3047 
3048 	/*
3049 	 * We'll inflate various costs this much to account for rescanning.  Note
3050 	 * that this is to be multiplied by something involving inner_rows, or
3051 	 * another number related to the portion of the inner rel we'll scan.
3052 	 */
3053 	rescanratio = 1.0 + (rescannedtuples / inner_rows);
3054 
3055 	/*
3056 	 * Decide whether we want to materialize the inner input to shield it
3057 	 * from mark/restore and from re-fetch overhead.  Our cost model for regular
3058 	 * re-fetches is that a re-fetch costs the same as an original fetch,
3059 	 * which is probably an overestimate; but on the other hand we ignore the
3060 	 * bookkeeping costs of mark/restore.  Not clear if it's worth developing
3061 	 * a more refined model.  So we just need to inflate the inner run cost by
3062 	 * rescanratio.
3063 	 */
3064 	bare_inner_cost = inner_run_cost * rescanratio;
3065 
3066 	/*
3067 	 * When we interpose a Material node the re-fetch cost is assumed to be
3068 	 * just cpu_operator_cost per tuple, independently of the underlying
3069 	 * plan's cost; and we charge an extra cpu_operator_cost per original
3070 	 * fetch as well.  Note that we're assuming the materialize node will
3071 	 * never spill to disk, since it only has to remember tuples back to the
3072 	 * last mark.  (If there are a huge number of duplicates, our other cost
3073 	 * factors will make the path so expensive that it probably won't get
3074 	 * chosen anyway.)	So we don't use cost_rescan here.
3075 	 *
3076 	 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3077 	 * of the generated Material node.
3078 	 */
3079 	mat_inner_cost = inner_run_cost +
3080 		cpu_operator_cost * inner_rows * rescanratio;
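	/*
	 * For illustration only (hypothetical numbers; cpu_operator_cost at its
	 * default of 0.0025): with inner_run_cost = 100, inner_rows = 10000 and
	 * rescanratio = 1.5, bare_inner_cost is 100 * 1.5 = 150 while
	 * mat_inner_cost is 100 + 0.0025 * 10000 * 1.5 = 137.5, so materializing
	 * would look cheaper in the comparison below.
	 */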
3081 
3082 	/*
3083 	 * If we don't need mark/restore at all, we don't need materialization.
3084 	 */
3085 	if (path->skip_mark_restore)
3086 		path->materialize_inner = false;
3087 
3088 	/*
3089 	 * Prefer materializing if it looks cheaper, unless the user has asked to
3090 	 * suppress materialization.
3091 	 */
3092 	else if (enable_material && mat_inner_cost < bare_inner_cost)
3093 		path->materialize_inner = true;
3094 
3095 	/*
3096 	 * Even if materializing doesn't look cheaper, we *must* do it if the
3097 	 * inner path is to be used directly (without sorting) and it doesn't
3098 	 * support mark/restore.
3099 	 *
3100 	 * Since the inner side must be ordered, and only Sorts and IndexScans can
3101 	 * create order to begin with, and they both support mark/restore, you
3102 	 * might think there's no problem --- but you'd be wrong.  Nestloop and
3103 	 * merge joins can *preserve* the order of their inputs, so they can be
3104 	 * selected as the input of a mergejoin, and they don't support
3105 	 * mark/restore at present.
3106 	 *
3107 	 * We don't test the value of enable_material here, because
3108 	 * materialization is required for correctness in this case, and turning
3109 	 * it off does not entitle us to deliver an invalid plan.
3110 	 */
3111 	else if (innersortkeys == NIL &&
3112 			 !ExecSupportsMarkRestore(inner_path))
3113 		path->materialize_inner = true;
3114 
3115 	/*
3116 	 * Also, force materializing if the inner path is to be sorted and the
3117 	 * sort is expected to spill to disk.  This is because the final merge
3118 	 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3119 	 * We don't try to adjust the cost estimates for this consideration,
3120 	 * though.
3121 	 *
3122 	 * Since materialization is a performance optimization in this case,
3123 	 * rather than necessary for correctness, we skip it if enable_material is
3124 	 * off.
3125 	 */
3126 	else if (enable_material && innersortkeys != NIL &&
3127 			 relation_byte_size(inner_path_rows,
3128 								inner_path->pathtarget->width) >
3129 			 (work_mem * 1024L))
3130 		path->materialize_inner = true;
3131 	else
3132 		path->materialize_inner = false;
3133 
3134 	/* Charge the right incremental cost for the chosen case */
3135 	if (path->materialize_inner)
3136 		run_cost += mat_inner_cost;
3137 	else
3138 		run_cost += bare_inner_cost;
3139 
3140 	/* CPU costs */
3141 
3142 	/*
3143 	 * The number of tuple comparisons needed is approximately number of outer
3144 	 * rows plus number of inner rows plus number of rescanned tuples (can we
3145 	 * refine this?).  At each one, we need to evaluate the mergejoin quals.
3146 	 */
3147 	startup_cost += merge_qual_cost.startup;
3148 	startup_cost += merge_qual_cost.per_tuple *
3149 		(outer_skip_rows + inner_skip_rows * rescanratio);
3150 	run_cost += merge_qual_cost.per_tuple *
3151 		((outer_rows - outer_skip_rows) +
3152 		 (inner_rows - inner_skip_rows) * rescanratio);
3153 
3154 	/*
3155 	 * For each tuple that gets through the mergejoin proper, we charge
3156 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3157 	 * clauses that are to be applied at the join.  (This is pessimistic since
3158 	 * not all of the quals may get evaluated at each tuple.)
3159 	 *
3160 	 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3161 	 * evaluations here, but it's probably not worth the trouble.
3162 	 */
3163 	startup_cost += qp_qual_cost.startup;
3164 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3165 	run_cost += cpu_per_tuple * mergejointuples;
3166 
3167 	/* tlist eval costs are paid per output row, not per tuple scanned */
3168 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3169 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3170 
3171 	path->jpath.path.startup_cost = startup_cost;
3172 	path->jpath.path.total_cost = startup_cost + run_cost;
3173 }
3174 
3175 /*
3176  * run mergejoinscansel() with caching
3177  */
3178 static MergeScanSelCache *
3179 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3180 {
3181 	MergeScanSelCache *cache;
3182 	ListCell   *lc;
3183 	Selectivity leftstartsel,
3184 				leftendsel,
3185 				rightstartsel,
3186 				rightendsel;
3187 	MemoryContext oldcontext;
3188 
3189 	/* Do we have this result already? */
3190 	foreach(lc, rinfo->scansel_cache)
3191 	{
3192 		cache = (MergeScanSelCache *) lfirst(lc);
3193 		if (cache->opfamily == pathkey->pk_opfamily &&
3194 			cache->collation == pathkey->pk_eclass->ec_collation &&
3195 			cache->strategy == pathkey->pk_strategy &&
3196 			cache->nulls_first == pathkey->pk_nulls_first)
3197 			return cache;
3198 	}
3199 
3200 	/* Nope, do the computation */
3201 	mergejoinscansel(root,
3202 					 (Node *) rinfo->clause,
3203 					 pathkey->pk_opfamily,
3204 					 pathkey->pk_strategy,
3205 					 pathkey->pk_nulls_first,
3206 					 &leftstartsel,
3207 					 &leftendsel,
3208 					 &rightstartsel,
3209 					 &rightendsel);
3210 
3211 	/* Cache the result in suitably long-lived workspace */
3212 	oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3213 
3214 	cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3215 	cache->opfamily = pathkey->pk_opfamily;
3216 	cache->collation = pathkey->pk_eclass->ec_collation;
3217 	cache->strategy = pathkey->pk_strategy;
3218 	cache->nulls_first = pathkey->pk_nulls_first;
3219 	cache->leftstartsel = leftstartsel;
3220 	cache->leftendsel = leftendsel;
3221 	cache->rightstartsel = rightstartsel;
3222 	cache->rightendsel = rightendsel;
3223 
3224 	rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3225 
3226 	MemoryContextSwitchTo(oldcontext);
3227 
3228 	return cache;
3229 }
3230 
3231 /*
3232  * initial_cost_hashjoin
3233  *	  Preliminary estimate of the cost of a hashjoin path.
3234  *
3235  * This must quickly produce lower-bound estimates of the path's startup and
3236  * total costs.  If we are unable to eliminate the proposed path from
3237  * consideration using the lower bounds, final_cost_hashjoin will be called
3238  * to obtain the final estimates.
3239  *
3240  * The exact division of labor between this function and final_cost_hashjoin
3241  * is private to them, and represents a tradeoff between speed of the initial
3242  * estimate and getting a tight lower bound.  We choose to not examine the
3243  * join quals here (other than by counting the number of hash clauses),
3244  * so we can't do much with CPU costs.  We do assume that
3245  * ExecChooseHashTableSize is cheap enough to use here.
3246  *
3247  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3248  *		other data to be used by final_cost_hashjoin
3249  * 'jointype' is the type of join to be performed
3250  * 'hashclauses' is the list of joinclauses to be used as hash clauses
3251  * 'outer_path' is the outer input to the join
3252  * 'inner_path' is the inner input to the join
3253  * 'extra' contains miscellaneous information about the join
3254  * 'parallel_hash' indicates that inner_path is partial and that a shared
3255  *		hash table will be built in parallel
3256  */
3257 void
3258 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3259 					  JoinType jointype,
3260 					  List *hashclauses,
3261 					  Path *outer_path, Path *inner_path,
3262 					  JoinPathExtraData *extra,
3263 					  bool parallel_hash)
3264 {
3265 	Cost		startup_cost = 0;
3266 	Cost		run_cost = 0;
3267 	double		outer_path_rows = outer_path->rows;
3268 	double		inner_path_rows = inner_path->rows;
3269 	double		inner_path_rows_total = inner_path_rows;
3270 	int			num_hashclauses = list_length(hashclauses);
3271 	int			numbuckets;
3272 	int			numbatches;
3273 	int			num_skew_mcvs;
3274 	size_t		space_allowed;	/* unused */
3275 
3276 	/* cost of source data */
3277 	startup_cost += outer_path->startup_cost;
3278 	run_cost += outer_path->total_cost - outer_path->startup_cost;
3279 	startup_cost += inner_path->total_cost;
3280 
3281 	/*
3282 	 * Cost of computing hash function: must do it once per input tuple. We
3283 	 * charge one cpu_operator_cost for each column's hash function.  Also,
3284 	 * tack on one cpu_tuple_cost per inner row, to model the costs of
3285 	 * inserting the row into the hashtable.
3286 	 *
3287 	 * XXX when a hashclause is more complex than a single operator, we really
3288 	 * should charge the extra eval costs of the left or right side, as
3289 	 * appropriate, here.  This seems more work than it's worth at the moment.
3290 	 */
3291 	startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3292 		* inner_path_rows;
3293 	run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
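	/*
	 * For example (hypothetical numbers, default cpu_operator_cost = 0.0025
	 * and cpu_tuple_cost = 0.01): with two hashclauses, each inner row adds
	 * 2 * 0.0025 + 0.01 = 0.015 to startup_cost and each outer row adds
	 * 2 * 0.0025 = 0.005 to run_cost.
	 */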
3294 
3295 	/*
3296 	 * If this is a parallel hash build, then the value we have for
3297 	 * inner_rows_total currently refers only to the rows returned by each
3298 	 * participant.  For shared hash table size estimation, we need the total
3299 	 * number, so we need to undo the division.
3300 	 */
3301 	if (parallel_hash)
3302 		inner_path_rows_total *= get_parallel_divisor(inner_path);
3303 
3304 	/*
3305 	 * Get hash table size that executor would use for inner relation.
3306 	 *
3307 	 * XXX for the moment, always assume that skew optimization will be
3308 	 * performed.  As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
3309 	 * trying to determine that for sure.
3310 	 *
3311 	 * XXX at some point it might be interesting to try to account for skew
3312 	 * optimization in the cost estimate, but for now, we don't.
3313 	 */
3314 	ExecChooseHashTableSize(inner_path_rows_total,
3315 							inner_path->pathtarget->width,
3316 							true,	/* useskew */
3317 							parallel_hash,	/* try_combined_work_mem */
3318 							outer_path->parallel_workers,
3319 							&space_allowed,
3320 							&numbuckets,
3321 							&numbatches,
3322 							&num_skew_mcvs);
3323 
3324 	/*
3325 	 * If inner relation is too big then we will need to "batch" the join,
3326 	 * which implies writing and reading most of the tuples to disk an extra
3327 	 * time.  Charge seq_page_cost per page, since the I/O should be nice and
3328 	 * sequential.  Writing the inner rel counts as startup cost, all the rest
3329 	 * as run cost.
3330 	 */
3331 	if (numbatches > 1)
3332 	{
3333 		double		outerpages = page_size(outer_path_rows,
3334 										   outer_path->pathtarget->width);
3335 		double		innerpages = page_size(inner_path_rows,
3336 										   inner_path->pathtarget->width);
3337 
3338 		startup_cost += seq_page_cost * innerpages;
3339 		run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3340 	}
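	/*
	 * Illustration of the batching charge above (hypothetical numbers,
	 * default seq_page_cost = 1.0): with innerpages = 1000 and
	 * outerpages = 5000, batching adds 1000 to startup_cost and
	 * 1000 + 2 * 5000 = 11000 to run_cost.
	 */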
3341 
3342 	/* CPU costs left for later */
3343 
3344 	/* Public result fields */
3345 	workspace->startup_cost = startup_cost;
3346 	workspace->total_cost = startup_cost + run_cost;
3347 	/* Save private data for final_cost_hashjoin */
3348 	workspace->run_cost = run_cost;
3349 	workspace->numbuckets = numbuckets;
3350 	workspace->numbatches = numbatches;
3351 	workspace->inner_rows_total = inner_path_rows_total;
3352 }
3353 
3354 /*
3355  * final_cost_hashjoin
3356  *	  Final estimate of the cost and result size of a hashjoin path.
3357  *
3358  * Note: the numbatches estimate is also saved into 'path' for use later
3359  *
3360  * 'path' is already filled in except for the rows and cost fields and
3361  *		num_batches
3362  * 'workspace' is the result from initial_cost_hashjoin
3363  * 'extra' contains miscellaneous information about the join
3364  */
3365 void
3366 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3367 					JoinCostWorkspace *workspace,
3368 					JoinPathExtraData *extra)
3369 {
3370 	Path	   *outer_path = path->jpath.outerjoinpath;
3371 	Path	   *inner_path = path->jpath.innerjoinpath;
3372 	double		outer_path_rows = outer_path->rows;
3373 	double		inner_path_rows = inner_path->rows;
3374 	double		inner_path_rows_total = workspace->inner_rows_total;
3375 	List	   *hashclauses = path->path_hashclauses;
3376 	Cost		startup_cost = workspace->startup_cost;
3377 	Cost		run_cost = workspace->run_cost;
3378 	int			numbuckets = workspace->numbuckets;
3379 	int			numbatches = workspace->numbatches;
3380 	Cost		cpu_per_tuple;
3381 	QualCost	hash_qual_cost;
3382 	QualCost	qp_qual_cost;
3383 	double		hashjointuples;
3384 	double		virtualbuckets;
3385 	Selectivity innerbucketsize;
3386 	Selectivity innermcvfreq;
3387 	ListCell   *hcl;
3388 
3389 	/* Mark the path with the correct row estimate */
3390 	if (path->jpath.path.param_info)
3391 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3392 	else
3393 		path->jpath.path.rows = path->jpath.path.parent->rows;
3394 
3395 	/* For partial paths, scale row estimate. */
3396 	if (path->jpath.path.parallel_workers > 0)
3397 	{
3398 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
3399 
3400 		path->jpath.path.rows =
3401 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
3402 	}
3403 
3404 	/*
3405 	 * We could include disable_cost in the preliminary estimate, but that
3406 	 * would amount to optimizing for the case where the join method is
3407 	 * disabled, which doesn't seem like the way to bet.
3408 	 */
3409 	if (!enable_hashjoin)
3410 		startup_cost += disable_cost;
3411 
3412 	/* mark the path with estimated # of batches */
3413 	path->num_batches = numbatches;
3414 
3415 	/* store the total number of tuples (sum of partial row estimates) */
3416 	path->inner_rows_total = inner_path_rows_total;
3417 
3418 	/* and compute the number of "virtual" buckets in the whole join */
3419 	virtualbuckets = (double) numbuckets * (double) numbatches;
3420 
3421 	/*
3422 	 * Determine bucketsize fraction and MCV frequency for the inner relation.
3423 	 * We use the smallest bucketsize or MCV frequency estimated for any
3424 	 * individual hashclause; this is undoubtedly conservative.
3425 	 *
3426 	 * BUT: if inner relation has been unique-ified, we can assume it's good
3427 	 * for hashing.  This is important both because it's the right answer, and
3428 	 * because we avoid contaminating the cache with a value that's wrong for
3429 	 * non-unique-ified paths.
3430 	 */
3431 	if (IsA(inner_path, UniquePath))
3432 	{
3433 		innerbucketsize = 1.0 / virtualbuckets;
3434 		innermcvfreq = 0.0;
3435 	}
3436 	else
3437 	{
3438 		innerbucketsize = 1.0;
3439 		innermcvfreq = 1.0;
3440 		foreach(hcl, hashclauses)
3441 		{
3442 			RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3443 			Selectivity thisbucketsize;
3444 			Selectivity thismcvfreq;
3445 
3446 			/*
3447 			 * First we have to figure out which side of the hashjoin clause
3448 			 * is the inner side.
3449 			 *
3450 			 * Since we tend to visit the same clauses over and over when
3451 			 * planning a large query, we cache the bucket stats estimates in
3452 			 * the RestrictInfo node to avoid repeated lookups of statistics.
3453 			 */
3454 			if (bms_is_subset(restrictinfo->right_relids,
3455 							  inner_path->parent->relids))
3456 			{
3457 				/* righthand side is inner */
3458 				thisbucketsize = restrictinfo->right_bucketsize;
3459 				if (thisbucketsize < 0)
3460 				{
3461 					/* not cached yet */
3462 					estimate_hash_bucket_stats(root,
3463 											   get_rightop(restrictinfo->clause),
3464 											   virtualbuckets,
3465 											   &restrictinfo->right_mcvfreq,
3466 											   &restrictinfo->right_bucketsize);
3467 					thisbucketsize = restrictinfo->right_bucketsize;
3468 				}
3469 				thismcvfreq = restrictinfo->right_mcvfreq;
3470 			}
3471 			else
3472 			{
3473 				Assert(bms_is_subset(restrictinfo->left_relids,
3474 									 inner_path->parent->relids));
3475 				/* lefthand side is inner */
3476 				thisbucketsize = restrictinfo->left_bucketsize;
3477 				if (thisbucketsize < 0)
3478 				{
3479 					/* not cached yet */
3480 					estimate_hash_bucket_stats(root,
3481 											   get_leftop(restrictinfo->clause),
3482 											   virtualbuckets,
3483 											   &restrictinfo->left_mcvfreq,
3484 											   &restrictinfo->left_bucketsize);
3485 					thisbucketsize = restrictinfo->left_bucketsize;
3486 				}
3487 				thismcvfreq = restrictinfo->left_mcvfreq;
3488 			}
3489 
3490 			if (innerbucketsize > thisbucketsize)
3491 				innerbucketsize = thisbucketsize;
3492 			if (innermcvfreq > thismcvfreq)
3493 				innermcvfreq = thismcvfreq;
3494 		}
3495 	}
3496 
3497 	/*
3498 	 * If the bucket holding the inner MCV would exceed work_mem, we don't
3499 	 * want to hash unless there is really no other alternative, so apply
3500 	 * disable_cost.  (The executor normally copes with excessive memory usage
3501 	 * by splitting batches, but obviously it cannot separate equal values
3502 	 * that way, so it will be unable to drive the batch size below work_mem
3503 	 * when this is true.)
3504 	 */
3505 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3506 						   inner_path->pathtarget->width) >
3507 		(work_mem * 1024L))
3508 		startup_cost += disable_cost;
3509 
3510 	/*
3511 	 * Compute cost of the hashquals and qpquals (other restriction clauses)
3512 	 * separately.
3513 	 */
3514 	cost_qual_eval(&hash_qual_cost, hashclauses, root);
3515 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3516 	qp_qual_cost.startup -= hash_qual_cost.startup;
3517 	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
3518 
3519 	/* CPU costs */
3520 
3521 	if (path->jpath.jointype == JOIN_SEMI ||
3522 		path->jpath.jointype == JOIN_ANTI ||
3523 		extra->inner_unique)
3524 	{
3525 		double		outer_matched_rows;
3526 		Selectivity inner_scan_frac;
3527 
3528 		/*
3529 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3530 		 * executor will stop after the first match.
3531 		 *
3532 		 * For an outer-rel row that has at least one match, we can expect the
3533 		 * bucket scan to stop after a fraction 1/(match_count+1) of the
3534 		 * bucket's rows, if the matches are evenly distributed.  Since they
3535 		 * probably aren't quite evenly distributed, we apply a fuzz factor of
3536 		 * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
3537 		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
3538 		 * at least 1, no such clamp is needed now.)
3539 		 */
3540 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3541 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3542 
3543 		startup_cost += hash_qual_cost.startup;
3544 		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
3545 			clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
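		/*
		 * Example with hypothetical numbers: if match_count = 3 then
		 * inner_scan_frac = 2.0 / 4.0 = 0.5, so with inner_path_rows = 10000
		 * and innerbucketsize = 0.001 we charge the hash quals against about
		 * 10000 * 0.001 * 0.5 = 5 bucket entries per matched outer row,
		 * halved again by the 0.5 factor above.
		 */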
3546 
3547 		/*
3548 		 * For unmatched outer-rel rows, the picture is quite a lot different.
3549 		 * In the first place, there is no reason to assume that these rows
3550 		 * preferentially hit heavily-populated buckets; instead assume they
3551 		 * are uncorrelated with the inner distribution and so they see an
3552 		 * average bucket size of inner_path_rows / virtualbuckets.  In the
3553 		 * second place, it seems likely that they will have few if any exact
3554 		 * hash-code matches and so very few of the tuples in the bucket will
3555 		 * actually require eval of the hash quals.  We don't have any good
3556 		 * way to estimate how many will, but for the moment assume that the
3557 		 * effective cost per bucket entry is one-tenth what it is for
3558 		 * matchable tuples.
3559 		 */
3560 		run_cost += hash_qual_cost.per_tuple *
3561 			(outer_path_rows - outer_matched_rows) *
3562 			clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
3563 
3564 		/* Get # of tuples that will pass the basic join */
3565 		if (path->jpath.jointype == JOIN_ANTI)
3566 			hashjointuples = outer_path_rows - outer_matched_rows;
3567 		else
3568 			hashjointuples = outer_matched_rows;
3569 	}
3570 	else
3571 	{
3572 		/*
3573 		 * The number of tuple comparisons needed is the number of outer
3574 		 * tuples times the typical number of tuples in a hash bucket, which
3575 		 * is the inner relation size times its bucketsize fraction.  At each
3576 		 * one, we need to evaluate the hashjoin quals.  But actually,
3577 		 * charging the full qual eval cost at each tuple is pessimistic,
3578 		 * since we don't evaluate the quals unless the hash values match
3579 		 * exactly.  For lack of a better idea, halve the cost estimate to
3580 		 * allow for that.
3581 		 */
3582 		startup_cost += hash_qual_cost.startup;
3583 		run_cost += hash_qual_cost.per_tuple * outer_path_rows *
3584 			clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
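		/*
		 * For example (hypothetical numbers): with inner_path_rows = 100000
		 * and innerbucketsize = 0.0001, a typical bucket holds about 10
		 * tuples, so each outer row is charged the hash-qual cost for
		 * 10 * 0.5 = 5 comparisons.
		 */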
3585 
3586 		/*
3587 		 * Get approx # tuples passing the hashquals.  We use
3588 		 * approx_tuple_count here because we need an estimate done with
3589 		 * JOIN_INNER semantics.
3590 		 */
3591 		hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
3592 	}
3593 
3594 	/*
3595 	 * For each tuple that gets through the hashjoin proper, we charge
3596 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3597 	 * clauses that are to be applied at the join.  (This is pessimistic since
3598 	 * not all of the quals may get evaluated at each tuple.)
3599 	 */
3600 	startup_cost += qp_qual_cost.startup;
3601 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3602 	run_cost += cpu_per_tuple * hashjointuples;
3603 
3604 	/* tlist eval costs are paid per output row, not per tuple scanned */
3605 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3606 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3607 
3608 	path->jpath.path.startup_cost = startup_cost;
3609 	path->jpath.path.total_cost = startup_cost + run_cost;
3610 }
3611 
3612 
3613 /*
3614  * cost_subplan
3615  *		Figure the costs for a SubPlan (or initplan).
3616  *
3617  * Note: we could dig the subplan's Plan out of the root list, but in practice
3618  * all callers have it handy already, so we make them pass it.
3619  */
3620 void
3621 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3622 {
3623 	QualCost	sp_cost;
3624 
3625 	/* Figure any cost for evaluating the testexpr */
3626 	cost_qual_eval(&sp_cost,
3627 				   make_ands_implicit((Expr *) subplan->testexpr),
3628 				   root);
3629 
3630 	if (subplan->useHashTable)
3631 	{
3632 		/*
3633 		 * If we are using a hash table for the subquery outputs, then the
3634 		 * cost of evaluating the query is a one-time cost.  We charge one
3635 		 * cpu_operator_cost per tuple for the work of loading the hashtable,
3636 		 * too.
3637 		 */
3638 		sp_cost.startup += plan->total_cost +
3639 			cpu_operator_cost * plan->plan_rows;
3640 
3641 		/*
3642 		 * The per-tuple costs include the cost of evaluating the lefthand
3643 		 * expressions, plus the cost of probing the hashtable.  We already
3644 		 * accounted for the lefthand expressions as part of the testexpr, and
3645 		 * will also have counted one cpu_operator_cost for each comparison
3646 		 * operator.  That is probably too low for the probing cost, but it's
3647 		 * hard to make a better estimate, so live with it for now.
3648 		 */
3649 	}
3650 	else
3651 	{
3652 		/*
3653 		 * Otherwise we will be rescanning the subplan output on each
3654 		 * evaluation.  We need to estimate how much of the output we will
3655 		 * actually need to scan.  NOTE: this logic should agree with the
3656 		 * tuple_fraction estimates used by make_subplan() in
3657 		 * plan/subselect.c.
3658 		 */
3659 		Cost		plan_run_cost = plan->total_cost - plan->startup_cost;
3660 
3661 		if (subplan->subLinkType == EXISTS_SUBLINK)
3662 		{
3663 			/* we only need to fetch 1 tuple; clamp to avoid zero divide */
3664 			sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3665 		}
3666 		else if (subplan->subLinkType == ALL_SUBLINK ||
3667 				 subplan->subLinkType == ANY_SUBLINK)
3668 		{
3669 			/* assume we need 50% of the tuples */
3670 			sp_cost.per_tuple += 0.50 * plan_run_cost;
3671 			/* also charge a cpu_operator_cost per row examined */
3672 			sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3673 		}
3674 		else
3675 		{
3676 			/* assume we need all tuples */
3677 			sp_cost.per_tuple += plan_run_cost;
3678 		}
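		/*
		 * Illustration (hypothetical numbers): if the subplan's run cost is
		 * 200 over plan_rows = 50, an EXISTS sublink is charged 200 / 50 = 4
		 * per call, while an ANY sublink is charged 0.50 * 200 = 100 plus
		 * 0.50 * 50 * cpu_operator_cost per call.
		 */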
3679 
3680 		/*
3681 		 * Also account for subplan's startup cost. If the subplan is
3682 		 * uncorrelated or undirect correlated, AND its topmost node is one
3683 		 * that materializes its output, assume that we'll only need to pay
3684 		 * its startup cost once; otherwise assume we pay the startup cost
3685 		 * every time.
3686 		 */
3687 		if (subplan->parParam == NIL &&
3688 			ExecMaterializesOutput(nodeTag(plan)))
3689 			sp_cost.startup += plan->startup_cost;
3690 		else
3691 			sp_cost.per_tuple += plan->startup_cost;
3692 	}
3693 
3694 	subplan->startup_cost = sp_cost.startup;
3695 	subplan->per_call_cost = sp_cost.per_tuple;
3696 }
3697 
3698 
3699 /*
3700  * cost_rescan
3701  *		Given a finished Path, estimate the costs of rescanning it after
3702  *		having done so the first time.  For some Path types a rescan is
3703  *		cheaper than an original scan (if no parameters change), and this
3704  *		function embodies knowledge about that.  The default is to return
3705  *		the same costs stored in the Path.  (Note that the cost estimates
3706  *		actually stored in Paths are always for first scans.)
3707  *
3708  * This function is not currently intended to model effects such as rescans
3709  * being cheaper due to disk block caching; what we are concerned with is
3710  * plan types wherein the executor caches results explicitly, or doesn't
3711  * redo startup calculations, etc.
3712  */
3713 static void
3714 cost_rescan(PlannerInfo *root, Path *path,
3715 			Cost *rescan_startup_cost,	/* output parameters */
3716 			Cost *rescan_total_cost)
3717 {
3718 	switch (path->pathtype)
3719 	{
3720 		case T_FunctionScan:
3721 
3722 			/*
3723 			 * Currently, nodeFunctionscan.c always executes the function to
3724 			 * completion before returning any rows, and caches the results in
3725 			 * a tuplestore.  So the function eval cost is all startup cost
3726 			 * and isn't paid over again on rescans. However, all run costs
3727 			 * will be paid over again.
3728 			 */
3729 			*rescan_startup_cost = 0;
3730 			*rescan_total_cost = path->total_cost - path->startup_cost;
3731 			break;
3732 		case T_HashJoin:
3733 
3734 			/*
3735 			 * If it's a single-batch join, we don't need to rebuild the hash
3736 			 * table during a rescan.
3737 			 */
3738 			if (((HashPath *) path)->num_batches == 1)
3739 			{
3740 				/* Startup cost is exactly the cost of hash table building */
3741 				*rescan_startup_cost = 0;
3742 				*rescan_total_cost = path->total_cost - path->startup_cost;
3743 			}
3744 			else
3745 			{
3746 				/* Otherwise, no special treatment */
3747 				*rescan_startup_cost = path->startup_cost;
3748 				*rescan_total_cost = path->total_cost;
3749 			}
3750 			break;
3751 		case T_CteScan:
3752 		case T_WorkTableScan:
3753 			{
3754 				/*
3755 				 * These plan types materialize their final result in a
3756 				 * tuplestore or tuplesort object.  So the rescan cost is only
3757 				 * cpu_tuple_cost per tuple, unless the result is large enough
3758 				 * to spill to disk.
3759 				 */
3760 				Cost		run_cost = cpu_tuple_cost * path->rows;
3761 				double		nbytes = relation_byte_size(path->rows,
3762 														path->pathtarget->width);
3763 				long		work_mem_bytes = work_mem * 1024L;
3764 
3765 				if (nbytes > work_mem_bytes)
3766 				{
3767 					/* It will spill, so account for re-read cost */
3768 					double		npages = ceil(nbytes / BLCKSZ);
3769 
3770 					run_cost += seq_page_cost * npages;
3771 				}
3772 				*rescan_startup_cost = 0;
3773 				*rescan_total_cost = run_cost;
3774 			}
3775 			break;
3776 		case T_Material:
3777 		case T_Sort:
3778 			{
3779 				/*
3780 				 * These plan types not only materialize their results, but do
3781 				 * not implement qual filtering or projection.  So they are
3782 				 * even cheaper to rescan than the ones above.  We charge only
3783 				 * cpu_operator_cost per tuple.  (Note: keep that in sync with
3784 				 * the run_cost charge in cost_sort, and also see comments in
3785 				 * cost_material before you change it.)
3786 				 */
3787 				Cost		run_cost = cpu_operator_cost * path->rows;
3788 				double		nbytes = relation_byte_size(path->rows,
3789 														path->pathtarget->width);
3790 				long		work_mem_bytes = work_mem * 1024L;
3791 
3792 				if (nbytes > work_mem_bytes)
3793 				{
3794 					/* It will spill, so account for re-read cost */
3795 					double		npages = ceil(nbytes / BLCKSZ);
3796 
3797 					run_cost += seq_page_cost * npages;
3798 				}
3799 				*rescan_startup_cost = 0;
3800 				*rescan_total_cost = run_cost;
3801 			}
3802 			break;
3803 		default:
3804 			*rescan_startup_cost = path->startup_cost;
3805 			*rescan_total_cost = path->total_cost;
3806 			break;
3807 	}
3808 }
3809 
3810 
3811 /*
3812  * cost_qual_eval
3813  *		Estimate the CPU costs of evaluating a WHERE clause.
3814  *		The input can be either an implicitly-ANDed list of boolean
3815  *		expressions, or a list of RestrictInfo nodes.  (The latter is
3816  *		preferred since it allows caching of the results.)
3817  *		The result includes both a one-time (startup) component,
3818  *		and a per-evaluation component.
3819  */
3820 void
3821 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
3822 {
3823 	cost_qual_eval_context context;
3824 	ListCell   *l;
3825 
3826 	context.root = root;
3827 	context.total.startup = 0;
3828 	context.total.per_tuple = 0;
3829 
3830 	/* We don't charge any cost for the implicit ANDing at top level ... */
3831 
3832 	foreach(l, quals)
3833 	{
3834 		Node	   *qual = (Node *) lfirst(l);
3835 
3836 		cost_qual_eval_walker(qual, &context);
3837 	}
3838 
3839 	*cost = context.total;
3840 }
3841 
3842 /*
3843  * cost_qual_eval_node
3844  *		As above, for a single RestrictInfo or expression.
3845  */
3846 void
3847 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
3848 {
3849 	cost_qual_eval_context context;
3850 
3851 	context.root = root;
3852 	context.total.startup = 0;
3853 	context.total.per_tuple = 0;
3854 
3855 	cost_qual_eval_walker(qual, &context);
3856 
3857 	*cost = context.total;
3858 }
3859 
3860 static bool
3861 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
3862 {
3863 	if (node == NULL)
3864 		return false;
3865 
3866 	/*
3867 	 * RestrictInfo nodes contain an eval_cost field reserved for this
3868 	 * routine's use, so that it's not necessary to evaluate the qual clause's
3869 	 * cost more than once.  If the clause's cost hasn't been computed yet,
3870 	 * the field's startup value will contain -1.
3871 	 */
3872 	if (IsA(node, RestrictInfo))
3873 	{
3874 		RestrictInfo *rinfo = (RestrictInfo *) node;
3875 
3876 		if (rinfo->eval_cost.startup < 0)
3877 		{
3878 			cost_qual_eval_context locContext;
3879 
3880 			locContext.root = context->root;
3881 			locContext.total.startup = 0;
3882 			locContext.total.per_tuple = 0;
3883 
3884 			/*
3885 			 * For an OR clause, recurse into the marked-up tree so that we
3886 			 * set the eval_cost for contained RestrictInfos too.
3887 			 */
3888 			if (rinfo->orclause)
3889 				cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
3890 			else
3891 				cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
3892 
3893 			/*
3894 			 * If the RestrictInfo is marked pseudoconstant, it will be tested
3895 			 * only once, so treat its cost as all startup cost.
3896 			 */
3897 			if (rinfo->pseudoconstant)
3898 			{
3899 				/* count one execution during startup */
3900 				locContext.total.startup += locContext.total.per_tuple;
3901 				locContext.total.per_tuple = 0;
3902 			}
3903 			rinfo->eval_cost = locContext.total;
3904 		}
3905 		context->total.startup += rinfo->eval_cost.startup;
3906 		context->total.per_tuple += rinfo->eval_cost.per_tuple;
3907 		/* do NOT recurse into children */
3908 		return false;
3909 	}
3910 
3911 	/*
3912 	 * For each operator or function node in the given tree, we charge the
3913 	 * estimated execution cost given by pg_proc.procost (remember to multiply
3914 	 * this by cpu_operator_cost).
3915 	 *
3916 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
3917 	 * OR, NOT). Simplistic, but a lot better than no model at all.
3918 	 *
3919 	 * Should we try to account for the possibility of short-circuit
3920 	 * evaluation of AND/OR?  Probably *not*, because that would make the
3921 	 * results depend on the clause ordering, and we are not in any position
3922 	 * to expect that the current ordering of the clauses is the one that's
3923 	 * going to end up being used.  The above per-RestrictInfo caching would
3924 	 * not mix well with trying to re-order clauses anyway.
3925 	 *
3926 	 * Another issue that is entirely ignored here is that if a set-returning
3927 	 * function is below top level in the tree, the functions/operators above
3928 	 * it will need to be evaluated multiple times.  In practical use, such
3929 	 * cases arise so seldom as to not be worth the added complexity needed;
3930 	 * moreover, since our rowcount estimates for functions tend to be pretty
3931 	 * phony, the results would also be pretty phony.
3932 	 */
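	/*
	 * For instance, an OpExpr whose underlying function has the typical
	 * default procost of 1 adds one cpu_operator_cost (0.0025 by default) to
	 * the per-tuple total via add_function_cost below.
	 */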
3933 	if (IsA(node, FuncExpr))
3934 	{
3935 		add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
3936 						  &context->total);
3937 	}
3938 	else if (IsA(node, OpExpr) ||
3939 			 IsA(node, DistinctExpr) ||
3940 			 IsA(node, NullIfExpr))
3941 	{
3942 		/* rely on struct equivalence to treat these all alike */
3943 		set_opfuncid((OpExpr *) node);
3944 		add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
3945 						  &context->total);
3946 	}
3947 	else if (IsA(node, ScalarArrayOpExpr))
3948 	{
3949 		/*
3950 		 * Estimate that the operator will be applied to about half of the
3951 		 * array elements before the answer is determined.
3952 		 */
3953 		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
3954 		Node	   *arraynode = (Node *) lsecond(saop->args);
3955 		QualCost	sacosts;
3956 
3957 		set_sa_opfuncid(saop);
3958 		sacosts.startup = sacosts.per_tuple = 0;
3959 		add_function_cost(context->root, saop->opfuncid, NULL,
3960 						  &sacosts);
3961 		context->total.startup += sacosts.startup;
3962 		context->total.per_tuple += sacosts.per_tuple *
3963 			estimate_array_length(arraynode) * 0.5;
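		/*
		 * E.g., for a 10-element array constant we charge the operator's
		 * per-tuple cost 10 * 0.5 = 5 times, reflecting the expectation that
		 * evaluation stops about halfway through on average.
		 */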
3964 	}
3965 	else if (IsA(node, Aggref) ||
3966 			 IsA(node, WindowFunc))
3967 	{
3968 		/*
3969 		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
3970 		 * ie, zero execution cost in the current model, because they behave
3971 		 * essentially like Vars at execution.  We disregard the costs of
3972 		 * their input expressions for the same reason.  The actual execution
3973 		 * costs of the aggregate/window functions and their arguments have to
3974 		 * be factored into plan-node-specific costing of the Agg or WindowAgg
3975 		 * plan node.
3976 		 */
3977 		return false;			/* don't recurse into children */
3978 	}
3979 	else if (IsA(node, CoerceViaIO))
3980 	{
3981 		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
3982 		Oid			iofunc;
3983 		Oid			typioparam;
3984 		bool		typisvarlena;
3985 
3986 		/* check the result type's input function */
3987 		getTypeInputInfo(iocoerce->resulttype,
3988 						 &iofunc, &typioparam);
3989 		add_function_cost(context->root, iofunc, NULL,
3990 						  &context->total);
3991 		/* check the input type's output function */
3992 		getTypeOutputInfo(exprType((Node *) iocoerce->arg),
3993 						  &iofunc, &typisvarlena);
3994 		add_function_cost(context->root, iofunc, NULL,
3995 						  &context->total);
3996 	}
3997 	else if (IsA(node, ArrayCoerceExpr))
3998 	{
3999 		ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
4000 		QualCost	perelemcost;
4001 
4002 		cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4003 							context->root);
4004 		context->total.startup += perelemcost.startup;
4005 		if (perelemcost.per_tuple > 0)
4006 			context->total.per_tuple += perelemcost.per_tuple *
4007 				estimate_array_length((Node *) acoerce->arg);
4008 	}
4009 	else if (IsA(node, RowCompareExpr))
4010 	{
4011 		/* Conservatively assume we will check all the columns */
4012 		RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4013 		ListCell   *lc;
4014 
4015 		foreach(lc, rcexpr->opnos)
4016 		{
4017 			Oid			opid = lfirst_oid(lc);
4018 
4019 			add_function_cost(context->root, get_opcode(opid), NULL,
4020 							  &context->total);
4021 		}
4022 	}
4023 	else if (IsA(node, MinMaxExpr) ||
4024 			 IsA(node, SQLValueFunction) ||
4025 			 IsA(node, XmlExpr) ||
4026 			 IsA(node, CoerceToDomain) ||
4027 			 IsA(node, NextValueExpr))
4028 	{
4029 		/* Treat all these as having cost 1 */
4030 		context->total.per_tuple += cpu_operator_cost;
4031 	}
4032 	else if (IsA(node, CurrentOfExpr))
4033 	{
4034 		/* Report high cost to prevent selection of anything but TID scan */
4035 		context->total.startup += disable_cost;
4036 	}
4037 	else if (IsA(node, SubLink))
4038 	{
4039 		/* This routine should not be applied to un-planned expressions */
4040 		elog(ERROR, "cannot handle unplanned sub-select");
4041 	}
4042 	else if (IsA(node, SubPlan))
4043 	{
4044 		/*
4045 		 * A subplan node in an expression typically indicates that the
4046 		 * subplan will be executed on each evaluation, so charge accordingly.
4047 		 * (Sub-selects that can be executed as InitPlans have already been
4048 		 * removed from the expression.)
4049 		 */
4050 		SubPlan    *subplan = (SubPlan *) node;
4051 
4052 		context->total.startup += subplan->startup_cost;
4053 		context->total.per_tuple += subplan->per_call_cost;
4054 
4055 		/*
4056 		 * We don't want to recurse into the testexpr, because it was already
4057 		 * counted in the SubPlan node's costs.  So we're done.
4058 		 */
4059 		return false;
4060 	}
4061 	else if (IsA(node, AlternativeSubPlan))
4062 	{
4063 		/*
4064 		 * Arbitrarily use the first alternative plan for costing.  (We should
4065 		 * certainly only include one alternative, and we don't yet have
4066 		 * enough information to know which one the executor is most likely to
4067 		 * use.)
4068 		 */
4069 		AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4070 
4071 		return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4072 									 context);
4073 	}
4074 	else if (IsA(node, PlaceHolderVar))
4075 	{
4076 		/*
4077 		 * A PlaceHolderVar should be given cost zero when considering general
4078 		 * expression evaluation costs.  The expense of doing the contained
4079 		 * expression is charged as part of the tlist eval costs of the scan
4080 		 * or join where the PHV is first computed (see set_rel_width and
4081 		 * add_placeholders_to_joinrel).  If we charged it again here, we'd be
4082 		 * double-counting the cost for each level of plan that the PHV
4083 		 * bubbles up through.  Hence, return without recursing into the
4084 		 * phexpr.
4085 		 */
4086 		return false;
4087 	}
4088 
4089 	/* recurse into children */
4090 	return expression_tree_walker(node, cost_qual_eval_walker,
4091 								  (void *) context);
4092 }
4093 
4094 /*
4095  * get_restriction_qual_cost
4096  *	  Compute evaluation costs of a baserel's restriction quals, plus any
4097  *	  movable join quals that have been pushed down to the scan.
4098  *	  Results are returned into *qpqual_cost.
4099  *
4100  * This is a convenience subroutine that works for seqscans and other cases
4101  * where all the given quals will be evaluated the hard way.  It's not useful
4102  * for cost_index(), for example, where the index machinery takes care of
4103  * some of the quals.  We assume baserestrictcost was previously set by
4104  * set_baserel_size_estimates().
4105  */
4106 static void
4107 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4108 						  ParamPathInfo *param_info,
4109 						  QualCost *qpqual_cost)
4110 {
4111 	if (param_info)
4112 	{
4113 		/* Include costs of pushed-down clauses */
4114 		cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4115 
4116 		qpqual_cost->startup += baserel->baserestrictcost.startup;
4117 		qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4118 	}
4119 	else
4120 		*qpqual_cost = baserel->baserestrictcost;
4121 }
4122 
4123 
4124 /*
4125  * compute_semi_anti_join_factors
4126  *	  Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4127  *	  can be expected to scan.
4128  *
4129  * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4130  * inner rows as soon as it finds a match to the current outer row.
4131  * The same happens if we have detected the inner rel is unique.
4132  * We should therefore adjust some of the cost components for this effect.
4133  * This function computes some estimates needed for these adjustments.
4134  * These estimates will be the same regardless of the particular paths used
4135  * for the outer and inner relation, so we compute these once and then pass
4136  * them to all the join cost estimation functions.
4137  *
4138  * Input parameters:
4139  *	joinrel: join relation under consideration
4140  *	outerrel: outer relation under consideration
4141  *	innerrel: inner relation under consideration
4142  *	jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4143  *	sjinfo: SpecialJoinInfo relevant to this join
4144  *	restrictlist: join quals
4145  * Output parameters:
4146  *	*semifactors is filled in (see pathnodes.h for field definitions)
4147  */
4148 void
4149 compute_semi_anti_join_factors(PlannerInfo *root,
4150 							   RelOptInfo *joinrel,
4151 							   RelOptInfo *outerrel,
4152 							   RelOptInfo *innerrel,
4153 							   JoinType jointype,
4154 							   SpecialJoinInfo *sjinfo,
4155 							   List *restrictlist,
4156 							   SemiAntiJoinFactors *semifactors)
4157 {
4158 	Selectivity jselec;
4159 	Selectivity nselec;
4160 	Selectivity avgmatch;
4161 	SpecialJoinInfo norm_sjinfo;
4162 	List	   *joinquals;
4163 	ListCell   *l;
4164 
4165 	/*
4166 	 * In an ANTI join, we must ignore clauses that are "pushed down", since
4167 	 * those won't affect the match logic.  In a SEMI join, we do not
4168 	 * distinguish joinquals from "pushed down" quals, so just use the whole
4169 	 * restrictinfo list.  For other outer join types, we should consider only
4170 	 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4171 	 */
4172 	if (IS_OUTER_JOIN(jointype))
4173 	{
4174 		joinquals = NIL;
4175 		foreach(l, restrictlist)
4176 		{
4177 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4178 
4179 			if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4180 				joinquals = lappend(joinquals, rinfo);
4181 		}
4182 	}
4183 	else
4184 		joinquals = restrictlist;
4185 
4186 	/*
4187 	 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4188 	 */
4189 	jselec = clauselist_selectivity(root,
4190 									joinquals,
4191 									0,
4192 									(jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4193 									sjinfo);
4194 
4195 	/*
4196 	 * Also get the normal inner-join selectivity of the join clauses.
4197 	 */
4198 	norm_sjinfo.type = T_SpecialJoinInfo;
4199 	norm_sjinfo.min_lefthand = outerrel->relids;
4200 	norm_sjinfo.min_righthand = innerrel->relids;
4201 	norm_sjinfo.syn_lefthand = outerrel->relids;
4202 	norm_sjinfo.syn_righthand = innerrel->relids;
4203 	norm_sjinfo.jointype = JOIN_INNER;
4204 	/* we don't bother trying to make the remaining fields valid */
4205 	norm_sjinfo.lhs_strict = false;
4206 	norm_sjinfo.delay_upper_joins = false;
4207 	norm_sjinfo.semi_can_btree = false;
4208 	norm_sjinfo.semi_can_hash = false;
4209 	norm_sjinfo.semi_operators = NIL;
4210 	norm_sjinfo.semi_rhs_exprs = NIL;
4211 
4212 	nselec = clauselist_selectivity(root,
4213 									joinquals,
4214 									0,
4215 									JOIN_INNER,
4216 									&norm_sjinfo);
4217 
4218 	/* Avoid leaking a lot of ListCells */
4219 	if (IS_OUTER_JOIN(jointype))
4220 		list_free(joinquals);
4221 
4222 	/*
4223 	 * jselec can be interpreted as the fraction of outer-rel rows that have
4224 	 * any matches (this is true for both SEMI and ANTI cases).  And nselec is
4225 	 * the fraction of the Cartesian product that matches.  So, the average
4226 	 * number of matches for each outer-rel row that has at least one match is
4227 	 * nselec * inner_rows / jselec.
4228 	 *
4229 	 * Note: it is correct to use the inner rel's "rows" count here, even
4230 	 * though we might later be considering a parameterized inner path with
4231 	 * fewer rows.  This is because we have included all the join clauses in
4232 	 * the selectivity estimate.
4233 	 */
4234 	if (jselec > 0)				/* protect against zero divide */
4235 	{
4236 		avgmatch = nselec * innerrel->rows / jselec;
4237 		/* Clamp to sane range */
4238 		avgmatch = Max(1.0, avgmatch);
4239 	}
4240 	else
4241 		avgmatch = 1.0;
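	/*
	 * Worked example (hypothetical numbers): with nselec = 0.01,
	 * innerrel->rows = 1000 and jselec = 0.2, about 20% of outer rows have a
	 * match, and each such row has on average 0.01 * 1000 / 0.2 = 50
	 * matching inner rows.
	 */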
4242 
4243 	semifactors->outer_match_frac = jselec;
4244 	semifactors->match_count = avgmatch;
4245 }
4246 
4247 /*
4248  * has_indexed_join_quals
4249  *	  Check whether all the joinquals of a nestloop join are used as
4250  *	  inner index quals.
4251  *
4252  * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4253  * indexscan) that uses all the joinquals as indexquals, we can assume that an
4254  * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4255  * expensive.
4256  */
4257 static bool
4258 has_indexed_join_quals(NestPath *joinpath)
4259 {
4260 	Relids		joinrelids = joinpath->path.parent->relids;
4261 	Path	   *innerpath = joinpath->innerjoinpath;
4262 	List	   *indexclauses;
4263 	bool		found_one;
4264 	ListCell   *lc;
4265 
4266 	/* If join still has quals to evaluate, it's not fast */
4267 	if (joinpath->joinrestrictinfo != NIL)
4268 		return false;
4269 	/* Nor if the inner path isn't parameterized at all */
4270 	if (innerpath->param_info == NULL)
4271 		return false;
4272 
4273 	/* Find the indexclauses list for the inner scan */
4274 	switch (innerpath->pathtype)
4275 	{
4276 		case T_IndexScan:
4277 		case T_IndexOnlyScan:
4278 			indexclauses = ((IndexPath *) innerpath)->indexclauses;
4279 			break;
4280 		case T_BitmapHeapScan:
4281 			{
4282 				/* Accept only a simple bitmap scan, not AND/OR cases */
4283 				Path	   *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4284 
4285 				if (IsA(bmqual, IndexPath))
4286 					indexclauses = ((IndexPath *) bmqual)->indexclauses;
4287 				else
4288 					return false;
4289 				break;
4290 			}
4291 		default:
4292 
4293 			/*
4294 			 * If it's not a simple indexscan, it probably doesn't run quickly
4295 			 * for zero rows out, even if it's a parameterized path using all
4296 			 * the joinquals.
4297 			 */
4298 			return false;
4299 	}
4300 
4301 	/*
4302 	 * Examine the inner path's param clauses.  Any that are from the outer
4303 	 * path must be found in the indexclauses list, either exactly or in an
4304 	 * equivalent form generated by equivclass.c.  Also, we must find at least
4305 	 * one such clause, else it's a clauseless join which isn't fast.
4306 	 */
4307 	found_one = false;
4308 	foreach(lc, innerpath->param_info->ppi_clauses)
4309 	{
4310 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4311 
4312 		if (join_clause_is_movable_into(rinfo,
4313 										innerpath->parent->relids,
4314 										joinrelids))
4315 		{
4316 			if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4317 				return false;
4318 			found_one = true;
4319 		}
4320 	}
4321 	return found_one;
4322 }
4323 
4324 
4325 /*
4326  * approx_tuple_count
4327  *		Quick-and-dirty estimation of the number of join rows passing
4328  *		a set of qual conditions.
4329  *
4330  * The quals can be either an implicitly-ANDed list of boolean expressions,
4331  * or a list of RestrictInfo nodes (typically the latter).
4332  *
4333  * We intentionally compute the selectivity under JOIN_INNER rules, even
4334  * if it's some type of outer join.  This is appropriate because we are
4335  * trying to figure out how many tuples pass the initial merge or hash
4336  * join step.
4337  *
4338  * This is quick-and-dirty because we bypass clauselist_selectivity, and
4339  * simply multiply the independent clause selectivities together.  Now
4340  * clauselist_selectivity often can't do any better than that anyhow, but
4341  * for some situations (such as range constraints) it is smarter.  However,
4342  * we can't effectively cache the results of clauselist_selectivity, whereas
4343  * the individual clause selectivities can be and are cached.
4344  *
4345  * Since we are only using the results to estimate how many potential
4346  * output tuples are generated and passed through qpqual checking, it
4347  * seems OK to live with the approximation.
4348  */
4349 static double
4350 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4351 {
4352 	double		tuples;
4353 	double		outer_tuples = path->outerjoinpath->rows;
4354 	double		inner_tuples = path->innerjoinpath->rows;
4355 	SpecialJoinInfo sjinfo;
4356 	Selectivity selec = 1.0;
4357 	ListCell   *l;
4358 
4359 	/*
4360 	 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4361 	 */
4362 	sjinfo.type = T_SpecialJoinInfo;
4363 	sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4364 	sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4365 	sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4366 	sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4367 	sjinfo.jointype = JOIN_INNER;
4368 	/* we don't bother trying to make the remaining fields valid */
4369 	sjinfo.lhs_strict = false;
4370 	sjinfo.delay_upper_joins = false;
4371 	sjinfo.semi_can_btree = false;
4372 	sjinfo.semi_can_hash = false;
4373 	sjinfo.semi_operators = NIL;
4374 	sjinfo.semi_rhs_exprs = NIL;
4375 
4376 	/* Get the approximate selectivity */
4377 	foreach(l, quals)
4378 	{
4379 		Node	   *qual = (Node *) lfirst(l);
4380 
4381 		/* Note that clause_selectivity will be able to cache its result */
4382 		selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4383 	}
4384 
4385 	/* Apply it to the input relation sizes */
4386 	tuples = selec * outer_tuples * inner_tuples;
4387 
4388 	return clamp_row_est(tuples);
4389 }
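
/*
 * A worked example with made-up numbers: given two quals whose cached
 * selectivities are 0.1 and 0.05, and input path sizes of 1000 outer and
 * 2000 inner rows, selec = 0.1 * 0.05 = 0.005 and the estimate is
 * 0.005 * 1000 * 2000 = 10000 rows (before clamp_row_est).
 */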
4390 
4391 
4392 /*
4393  * set_baserel_size_estimates
4394  *		Set the size estimates for the given base relation.
4395  *
4396  * The rel's targetlist and restrictinfo list must have been constructed
4397  * already, and rel->tuples must be set.
4398  *
4399  * We set the following fields of the rel node:
4400  *	rows: the estimated number of output tuples (after applying
4401  *		  restriction clauses).
4402  *	width: the estimated average output tuple width in bytes.
4403  *	baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4404  */
4405 void
4406 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4407 {
4408 	double		nrows;
4409 
4410 	/* Should only be applied to base relations */
4411 	Assert(rel->relid > 0);
4412 
4413 	nrows = rel->tuples *
4414 		clauselist_selectivity(root,
4415 							   rel->baserestrictinfo,
4416 							   0,
4417 							   JOIN_INNER,
4418 							   NULL);
4419 
4420 	rel->rows = clamp_row_est(nrows);
4421 
4422 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4423 
4424 	set_rel_width(root, rel);
4425 }
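
/*
 * For instance (hypothetical numbers): a relation with rel->tuples = 100000
 * and a baserestrictinfo list whose combined selectivity is 0.01 gets
 * rel->rows = clamp_row_est(100000 * 0.01) = 1000.
 */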
4426 
4427 /*
4428  * get_parameterized_baserel_size
4429  *		Make a size estimate for a parameterized scan of a base relation.
4430  *
4431  * 'param_clauses' lists the additional join clauses to be used.
4432  *
4433  * set_baserel_size_estimates must have been applied already.
4434  */
4435 double
4436 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4437 							   List *param_clauses)
4438 {
4439 	List	   *allclauses;
4440 	double		nrows;
4441 
4442 	/*
4443 	 * Estimate the number of rows returned by the parameterized scan, knowing
4444 	 * that it will apply all the extra join clauses as well as the rel's own
4445 	 * restriction clauses.  Note that we force the clauses to be treated as
4446 	 * non-join clauses during selectivity estimation.
4447 	 */
4448 	allclauses = list_concat(list_copy(param_clauses),
4449 							 rel->baserestrictinfo);
4450 	nrows = rel->tuples *
4451 		clauselist_selectivity(root,
4452 							   allclauses,
4453 							   rel->relid,	/* do not use 0! */
4454 							   JOIN_INNER,
4455 							   NULL);
4456 	nrows = clamp_row_est(nrows);
4457 	/* For safety, make sure result is not more than the base estimate */
4458 	if (nrows > rel->rows)
4459 		nrows = rel->rows;
4460 	return nrows;
4461 }
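
/*
 * Continuing the hypothetical example above: if clauselist_selectivity over
 * the rel's own quals plus the pushed-down join clauses comes out to
 * 0.00001, the parameterized estimate is 100000 * 0.00001 = 1 row, which is
 * also well under the rel->rows cap of 1000.
 */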
4462 
4463 /*
4464  * set_joinrel_size_estimates
4465  *		Set the size estimates for the given join relation.
4466  *
4467  * The rel's targetlist must have been constructed already, and a
4468  * restriction clause list that matches the given component rels must
4469  * be provided.
4470  *
4471  * Since there is more than one way to make a joinrel for more than two
4472  * base relations, the results we get here could depend on which component
4473  * rel pair is provided.  In theory we should get the same answers no matter
4474  * which pair is provided; in practice, since the selectivity estimation
4475  * routines don't handle all cases equally well, we might not.  But there's
4476  * not much to be done about it.  (Would it make sense to repeat the
4477  * calculations for each pair of input rels that's encountered, and somehow
4478  * average the results?  Probably way more trouble than it's worth, and
4479  * anyway we must keep the rowcount estimate the same for all paths for the
4480  * joinrel.)
4481  *
4482  * We set only the rows field here.  The reltarget field was already set by
4483  * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4484  */
4485 void
4486 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4487 						   RelOptInfo *outer_rel,
4488 						   RelOptInfo *inner_rel,
4489 						   SpecialJoinInfo *sjinfo,
4490 						   List *restrictlist)
4491 {
4492 	rel->rows = calc_joinrel_size_estimate(root,
4493 										   rel,
4494 										   outer_rel,
4495 										   inner_rel,
4496 										   outer_rel->rows,
4497 										   inner_rel->rows,
4498 										   sjinfo,
4499 										   restrictlist);
4500 }
4501 
4502 /*
4503  * get_parameterized_joinrel_size
4504  *		Make a size estimate for a parameterized scan of a join relation.
4505  *
4506  * 'rel' is the joinrel under consideration.
4507  * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4508  *		produce the relations being joined.
4509  * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4510  * 'restrict_clauses' lists the join clauses that need to be applied at the
4511  * join node (including any movable clauses that were moved down to this join,
4512  * and not including any movable clauses that were pushed down into the
4513  * child paths).
4514  *
4515  * set_joinrel_size_estimates must have been applied already.
4516  */
4517 double
4518 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4519 							   Path *outer_path,
4520 							   Path *inner_path,
4521 							   SpecialJoinInfo *sjinfo,
4522 							   List *restrict_clauses)
4523 {
4524 	double		nrows;
4525 
4526 	/*
4527 	 * Estimate the number of rows returned by the parameterized join as the
4528 	 * sizes of the input paths times the selectivity of the clauses that have
4529 	 * ended up at this join node.
4530 	 *
4531 	 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4532 	 * on the pair of input paths provided, though ideally we'd get the same
4533 	 * estimate for any pair with the same parameterization.
4534 	 */
4535 	nrows = calc_joinrel_size_estimate(root,
4536 									   rel,
4537 									   outer_path->parent,
4538 									   inner_path->parent,
4539 									   outer_path->rows,
4540 									   inner_path->rows,
4541 									   sjinfo,
4542 									   restrict_clauses);
4543 	/* For safety, make sure result is not more than the base estimate */
4544 	if (nrows > rel->rows)
4545 		nrows = rel->rows;
4546 	return nrows;
4547 }
4548 
4549 /*
4550  * calc_joinrel_size_estimate
4551  *		Workhorse for set_joinrel_size_estimates and
4552  *		get_parameterized_joinrel_size.
4553  *
4554  * outer_rel/inner_rel are the relations being joined, but they should be
4555  * assumed to have sizes outer_rows/inner_rows; those numbers might be less
4556  * than what rel->rows says, when we are considering parameterized paths.
4557  */
4558 static double
4559 calc_joinrel_size_estimate(PlannerInfo *root,
4560 						   RelOptInfo *joinrel,
4561 						   RelOptInfo *outer_rel,
4562 						   RelOptInfo *inner_rel,
4563 						   double outer_rows,
4564 						   double inner_rows,
4565 						   SpecialJoinInfo *sjinfo,
4566 						   List *restrictlist_in)
4567 {
4568 	/* This apparently-useless variable dodges a compiler bug in VS2013: */
4569 	List	   *restrictlist = restrictlist_in;
4570 	JoinType	jointype = sjinfo->jointype;
4571 	Selectivity fkselec;
4572 	Selectivity jselec;
4573 	Selectivity pselec;
4574 	double		nrows;
4575 
4576 	/*
4577 	 * Compute joinclause selectivity.  Note that we are only considering
4578 	 * clauses that become restriction clauses at this join level; we are not
4579 	 * double-counting them because they were not considered in estimating the
4580 	 * sizes of the component rels.
4581 	 *
4582 	 * First, see whether any of the joinclauses can be matched to known FK
4583 	 * constraints.  If so, drop those clauses from the restrictlist, and
4584 	 * instead estimate their selectivity using FK semantics.  (We do this
4585 	 * without regard to whether said clauses are local or "pushed down".
4586 	 * Probably, an FK-matching clause could never be seen as pushed down at
4587 	 * an outer join, since it would be strict and hence would be grounds for
4588 	 * join strength reduction.)  fkselec gets the net selectivity for
4589 	 * FK-matching clauses, or 1.0 if there are none.
4590 	 */
4591 	fkselec = get_foreign_key_join_selectivity(root,
4592 											   outer_rel->relids,
4593 											   inner_rel->relids,
4594 											   sjinfo,
4595 											   &restrictlist);
4596 
4597 	/*
4598 	 * For an outer join, we have to distinguish the selectivity of the join's
4599 	 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
4600 	 * down".  For inner joins we just count them all as joinclauses.
4601 	 */
4602 	if (IS_OUTER_JOIN(jointype))
4603 	{
4604 		List	   *joinquals = NIL;
4605 		List	   *pushedquals = NIL;
4606 		ListCell   *l;
4607 
4608 		/* Grovel through the clauses to separate into two lists */
4609 		foreach(l, restrictlist)
4610 		{
4611 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4612 
4613 			if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4614 				pushedquals = lappend(pushedquals, rinfo);
4615 			else
4616 				joinquals = lappend(joinquals, rinfo);
4617 		}
4618 
4619 		/* Get the separate selectivities */
4620 		jselec = clauselist_selectivity(root,
4621 										joinquals,
4622 										0,
4623 										jointype,
4624 										sjinfo);
4625 		pselec = clauselist_selectivity(root,
4626 										pushedquals,
4627 										0,
4628 										jointype,
4629 										sjinfo);
4630 
4631 		/* Avoid leaking a lot of ListCells */
4632 		list_free(joinquals);
4633 		list_free(pushedquals);
4634 	}
4635 	else
4636 	{
4637 		jselec = clauselist_selectivity(root,
4638 										restrictlist,
4639 										0,
4640 										jointype,
4641 										sjinfo);
4642 		pselec = 0.0;			/* not used, keep compiler quiet */
4643 	}
4644 
4645 	/*
4646 	 * Basically, we multiply size of Cartesian product by selectivity.
4647 	 *
4648 	 * If we are doing an outer join, take that into account: the joinqual
4649 	 * selectivity has to be clamped using the knowledge that the output must
4650 	 * be at least as large as the non-nullable input.  However, any
4651 	 * pushed-down quals are applied after the outer join, so their
4652 	 * selectivity applies fully.
4653 	 *
4654 	 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
4655 	 * of LHS rows that have matches, and we apply that straightforwardly.
4656 	 */
4657 	switch (jointype)
4658 	{
4659 		case JOIN_INNER:
4660 			nrows = outer_rows * inner_rows * fkselec * jselec;
4661 			/* pselec not used */
4662 			break;
4663 		case JOIN_LEFT:
4664 			nrows = outer_rows * inner_rows * fkselec * jselec;
4665 			if (nrows < outer_rows)
4666 				nrows = outer_rows;
4667 			nrows *= pselec;
4668 			break;
4669 		case JOIN_FULL:
4670 			nrows = outer_rows * inner_rows * fkselec * jselec;
4671 			if (nrows < outer_rows)
4672 				nrows = outer_rows;
4673 			if (nrows < inner_rows)
4674 				nrows = inner_rows;
4675 			nrows *= pselec;
4676 			break;
4677 		case JOIN_SEMI:
4678 			nrows = outer_rows * fkselec * jselec;
4679 			/* pselec not used */
4680 			break;
4681 		case JOIN_ANTI:
4682 			nrows = outer_rows * (1.0 - fkselec * jselec);
4683 			nrows *= pselec;
4684 			break;
4685 		default:
4686 			/* other values not expected here */
4687 			elog(ERROR, "unrecognized join type: %d", (int) jointype);
4688 			nrows = 0;			/* keep compiler quiet */
4689 			break;
4690 	}
4691 
4692 	return clamp_row_est(nrows);
4693 }
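
/*
 * A worked example of the outer-join clamping above (all numbers
 * hypothetical): for a JOIN_LEFT with outer_rows = 1000, inner_rows = 500,
 * fkselec = 1.0, jselec = 0.001 and pselec = 0.5, the raw product is
 * 1000 * 500 * 0.001 = 500, which is clamped up to the 1000 non-nullable
 * outer rows; the pushed-down quals then apply fully, giving
 * 1000 * 0.5 = 500 rows.
 */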
4694 
4695 /*
4696  * get_foreign_key_join_selectivity
4697  *		Estimate join selectivity for foreign-key-related clauses.
4698  *
4699  * Remove any clauses that can be matched to FK constraints from *restrictlist,
4700  * and return a substitute estimate of their selectivity.  1.0 is returned
4701  * when there are no such clauses.
4702  *
4703  * The reason for treating such clauses specially is that we can get better
4704  * estimates this way than by relying on clauselist_selectivity(), especially
4705  * for multi-column FKs where that function's assumption that the clauses are
4706  * independent falls down badly.  But even with single-column FKs, we may be
4707  * able to get a better answer when the pg_statistic stats are missing or out
4708  * of date.
4709  */
4710 static Selectivity
4711 get_foreign_key_join_selectivity(PlannerInfo *root,
4712 								 Relids outer_relids,
4713 								 Relids inner_relids,
4714 								 SpecialJoinInfo *sjinfo,
4715 								 List **restrictlist)
4716 {
4717 	Selectivity fkselec = 1.0;
4718 	JoinType	jointype = sjinfo->jointype;
4719 	List	   *worklist = *restrictlist;
4720 	ListCell   *lc;
4721 
4722 	/* Consider each FK constraint that is known to match the query */
4723 	foreach(lc, root->fkey_list)
4724 	{
4725 		ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
4726 		bool		ref_is_outer;
4727 		List	   *removedlist;
4728 		ListCell   *cell;
4729 		ListCell   *prev;
4730 		ListCell   *next;
4731 
4732 		/*
4733 		 * This FK is not relevant unless it connects a baserel on one side of
4734 		 * this join to a baserel on the other side.
4735 		 */
4736 		if (bms_is_member(fkinfo->con_relid, outer_relids) &&
4737 			bms_is_member(fkinfo->ref_relid, inner_relids))
4738 			ref_is_outer = false;
4739 		else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
4740 				 bms_is_member(fkinfo->con_relid, inner_relids))
4741 			ref_is_outer = true;
4742 		else
4743 			continue;
4744 
4745 		/*
4746 		 * If we're dealing with a semi/anti join, and the FK's referenced
4747 		 * relation is on the outside, then knowledge of the FK doesn't help
4748 		 * us figure out what we need to know (which is the fraction of outer
4749 		 * rows that have matches).  On the other hand, if the referenced rel
4750 		 * is on the inside, then all outer rows must have matches in the
4751 		 * referenced table (ignoring nulls).  But any restriction or join
4752 		 * clauses that filter that table will reduce the fraction of matches.
4753 		 * We can account for restriction clauses, but it's too hard to guess
4754 		 * how many table rows would get through a join that's inside the RHS.
4755 		 * Hence, if either case applies, punt and ignore the FK.
4756 		 */
4757 		if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
4758 			(ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
4759 			continue;
4760 
4761 		/*
4762 		 * Modify the restrictlist by removing clauses that match the FK (and
4763 		 * putting them into removedlist instead).  It seems unsafe to modify
4764 		 * the originally-passed List structure, so we make a shallow copy the
4765 		 * first time through.
4766 		 */
4767 		if (worklist == *restrictlist)
4768 			worklist = list_copy(worklist);
4769 
4770 		removedlist = NIL;
4771 		prev = NULL;
4772 		for (cell = list_head(worklist); cell; cell = next)
4773 		{
4774 			RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
4775 			bool		remove_it = false;
4776 			int			i;
4777 
4778 			next = lnext(cell);
4779 			/* Drop this clause if it matches any column of the FK */
4780 			for (i = 0; i < fkinfo->nkeys; i++)
4781 			{
4782 				if (rinfo->parent_ec)
4783 				{
4784 					/*
4785 					 * EC-derived clauses can only match by EC.  It is okay to
4786 					 * consider any clause derived from the same EC as
4787 					 * matching the FK: even if equivclass.c chose to generate
4788 					 * a clause equating some other pair of Vars, it could
4789 					 * have generated one equating the FK's Vars.  So for
4790 					 * purposes of estimation, we can act as though it did so.
4791 					 *
4792 					 * Note: checking parent_ec is a bit of a cheat because
4793 					 * there are EC-derived clauses that don't have parent_ec
4794 					 * set; but such clauses must compare expressions that
4795 					 * aren't just Vars, so they cannot match the FK anyway.
4796 					 */
4797 					if (fkinfo->eclass[i] == rinfo->parent_ec)
4798 					{
4799 						remove_it = true;
4800 						break;
4801 					}
4802 				}
4803 				else
4804 				{
4805 					/*
4806 					 * Otherwise, see if rinfo was previously matched to FK as
4807 					 * a "loose" clause.
4808 					 */
4809 					if (list_member_ptr(fkinfo->rinfos[i], rinfo))
4810 					{
4811 						remove_it = true;
4812 						break;
4813 					}
4814 				}
4815 			}
4816 			if (remove_it)
4817 			{
4818 				worklist = list_delete_cell(worklist, cell, prev);
4819 				removedlist = lappend(removedlist, rinfo);
4820 			}
4821 			else
4822 				prev = cell;
4823 		}
4824 
4825 		/*
4826 		 * If we failed to remove all the matching clauses we expected to
4827 		 * find, chicken out and ignore this FK; applying its selectivity
4828 		 * might result in double-counting.  Put any clauses we did manage to
4829 		 * remove back into the worklist.
4830 		 *
4831 		 * Since the matching clauses are known not outerjoin-delayed, they
4832 		 * should certainly have appeared in the initial joinclause list.  If
4833 		 * we didn't find them, they must have been matched to, and removed
4834 		 * by, some other FK in a previous iteration of this loop.  (A likely
4835 		 * case is that two FKs are matched to the same EC; there will be only
4836 		 * one EC-derived clause in the initial list, so the first FK will
4837 		 * consume it.)  Applying both FKs' selectivity independently risks
4838 		 * underestimating the join size; in particular, this would undo one
4839 		 * of the main things that ECs were invented for, namely to avoid
4840 		 * double-counting the selectivity of redundant equality conditions.
4841 		 * Later we might think of a reasonable way to combine the estimates,
4842 		 * but for now, just punt, since this is a fairly uncommon situation.
4843 		 */
4844 		if (list_length(removedlist) !=
4845 			(fkinfo->nmatched_ec + fkinfo->nmatched_ri))
4846 		{
4847 			worklist = list_concat(worklist, removedlist);
4848 			continue;
4849 		}
4850 
4851 		/*
4852 		 * Finally we get to the payoff: estimate selectivity using the
4853 		 * knowledge that each referencing row will match exactly one row in
4854 		 * the referenced table.
4855 		 *
4856 		 * XXX that's not true in the presence of nulls in the referencing
4857 		 * column(s), so in principle we should derate the estimate for those.
4858 		 * However (1) if there are any strict restriction clauses for the
4859 		 * referencing column(s) elsewhere in the query, derating here would
4860 		 * be double-counting the null fraction, and (2) it's not very clear
4861 		 * how to combine null fractions for multiple referencing columns. So
4862 		 * we do nothing for now about correcting for nulls.
4863 		 *
4864 		 * XXX another point here is that if either side of an FK constraint
4865 		 * is an inheritance parent, we estimate as though the constraint
4866 		 * covers all its children as well.  This is not an unreasonable
4867 		 * assumption for a referencing table, ie the user probably applied
4868 		 * identical constraints to all child tables (though perhaps we ought
4869 		 * to check that).  But it's not possible to have done that for a
4870 		 * referenced table.  Fortunately, precisely because that doesn't
4871 		 * work, it is uncommon in practice to have an FK referencing a parent
4872 		 * table.  So, at least for now, disregard inheritance here.
4873 		 */
4874 		if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
4875 		{
4876 			/*
4877 			 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
4878 			 * referenced table is exactly the inside of the join.  The join
4879 			 * selectivity is defined as the fraction of LHS rows that have
4880 			 * matches.  The FK implies that every LHS row has a match *in the
4881 			 * referenced table*; but any restriction clauses on it will
4882 			 * reduce the number of matches.  Hence we take the join
4883 			 * selectivity as equal to the selectivity of the table's
4884 			 * restriction clauses, which is rows / tuples; but we must guard
4885 			 * against tuples == 0.
4886 			 */
4887 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4888 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
4889 
4890 			fkselec *= ref_rel->rows / ref_tuples;
4891 		}
4892 		else
4893 		{
4894 			/*
4895 			 * Otherwise, selectivity is exactly 1/referenced-table-size; but
4896 			 * guard against tuples == 0.  Note we should use the raw table
4897 			 * tuple count, not any estimate of its filtered or joined size.
4898 			 */
4899 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4900 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
4901 
4902 			fkselec *= 1.0 / ref_tuples;
4903 		}
4904 	}
4905 
4906 	*restrictlist = worklist;
4907 	return fkselec;
4908 }
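
/*
 * By way of illustration (FK and table sizes hypothetical): for an inner
 * join over a single-column FK whose referenced table has 50000 tuples, the
 * matched equality clause is removed from the restrictlist and fkselec
 * becomes 1.0 / 50000 = 0.00002.  With no other join clauses (jselec = 1.0),
 * calc_joinrel_size_estimate then computes
 * 10000 referencing rows * 50000 * 0.00002 = 10000 output rows, which is
 * exactly what the FK guarantees.
 */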
4909 
4910 /*
4911  * set_subquery_size_estimates
4912  *		Set the size estimates for a base relation that is a subquery.
4913  *
4914  * The rel's targetlist and restrictinfo list must have been constructed
4915  * already, and the Paths for the subquery must have been completed.
4916  * We look at the subquery's PlannerInfo to extract data.
4917  *
4918  * We set the same fields as set_baserel_size_estimates.
4919  */
4920 void
4921 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4922 {
4923 	PlannerInfo *subroot = rel->subroot;
4924 	RelOptInfo *sub_final_rel;
4925 	ListCell   *lc;
4926 
4927 	/* Should only be applied to base relations that are subqueries */
4928 	Assert(rel->relid > 0);
4929 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
4930 
4931 	/*
4932 	 * Copy raw number of output rows from subquery.  All of its paths should
4933 	 * have the same output rowcount, so just look at cheapest-total.
4934 	 */
4935 	sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
4936 	rel->tuples = sub_final_rel->cheapest_total_path->rows;
4937 
4938 	/*
4939 	 * Compute per-output-column width estimates by examining the subquery's
4940 	 * targetlist.  For any output that is a plain Var, get the width estimate
4941 	 * that was made while planning the subquery.  Otherwise, we leave it to
4942 	 * set_rel_width to fill in a datatype-based default estimate.
4943 	 */
4944 	foreach(lc, subroot->parse->targetList)
4945 	{
4946 		TargetEntry *te = lfirst_node(TargetEntry, lc);
4947 		Node	   *texpr = (Node *) te->expr;
4948 		int32		item_width = 0;
4949 
4950 		/* junk columns aren't visible to upper query */
4951 		if (te->resjunk)
4952 			continue;
4953 
4954 		/*
4955 		 * The subquery could be an expansion of a view that's had columns
4956 		 * added to it since the current query was parsed, so that there are
4957 		 * non-junk tlist columns in it that don't correspond to any column
4958 		 * visible at our query level.  Ignore such columns.
4959 		 */
4960 		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
4961 			continue;
4962 
4963 		/*
4964 		 * XXX This currently doesn't work for subqueries containing set
4965 		 * operations, because the Vars in their tlists are bogus references
4966 		 * to the first leaf subquery, which wouldn't give the right answer
4967 		 * even if we could still get to its PlannerInfo.
4968 		 *
4969 		 * Also, the subquery could be an appendrel for which all branches are
4970 		 * known empty due to constraint exclusion, in which case
4971 		 * set_append_rel_pathlist will have left the attr_widths set to zero.
4972 		 *
4973 		 * In either case, we just leave the width estimate zero until
4974 		 * set_rel_width fixes it.
4975 		 */
4976 		if (IsA(texpr, Var) &&
4977 			subroot->parse->setOperations == NULL)
4978 		{
4979 			Var		   *var = (Var *) texpr;
4980 			RelOptInfo *subrel = find_base_rel(subroot, var->varno);
4981 
4982 			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
4983 		}
4984 		rel->attr_widths[te->resno - rel->min_attr] = item_width;
4985 	}
4986 
4987 	/* Now estimate number of output rows, etc */
4988 	set_baserel_size_estimates(root, rel);
4989 }
4990 
4991 /*
4992  * set_function_size_estimates
4993  *		Set the size estimates for a base relation that is a function call.
4994  *
4995  * The rel's targetlist and restrictinfo list must have been constructed
4996  * already.
4997  *
4998  * We set the same fields as set_baserel_size_estimates.
4999  */
5000 void
5001 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5002 {
5003 	RangeTblEntry *rte;
5004 	ListCell   *lc;
5005 
5006 	/* Should only be applied to base relations that are functions */
5007 	Assert(rel->relid > 0);
5008 	rte = planner_rt_fetch(rel->relid, root);
5009 	Assert(rte->rtekind == RTE_FUNCTION);
5010 
5011 	/*
5012 	 * Estimate number of rows the functions will return. The rowcount of the
5013 	 * node is that of the largest function result.
5014 	 */
5015 	rel->tuples = 0;
5016 	foreach(lc, rte->functions)
5017 	{
5018 		RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5019 		double		ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5020 
5021 		if (ntup > rel->tuples)
5022 			rel->tuples = ntup;
5023 	}
5024 
5025 	/* Now estimate number of output rows, etc */
5026 	set_baserel_size_estimates(root, rel);
5027 }
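
/*
 * For example (assuming no planner support function supplies a better
 * estimate): a set-returning function declared with ROWS 500 contributes
 * ntup = 500, while one with no ROWS clause falls back on the default of
 * 1000; with both in the same RTE_FUNCTION entry, rel->tuples ends up as
 * the larger value, 1000.
 */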
5028 
5029 /*
5030  * set_tablefunc_size_estimates
5031  *		Set the size estimates for a base relation that is a table function.
5032  *
5033  * The rel's targetlist and restrictinfo list must have been constructed
5034  * already.
5035  *
5036  * We set the same fields as set_baserel_size_estimates.
5037  */
5038 void
5039 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5040 {
5041 	/* Should only be applied to base relations that are functions */
5042 	Assert(rel->relid > 0);
5043 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5044 
5045 	rel->tuples = 100;
5046 
5047 	/* Now estimate number of output rows, etc */
5048 	set_baserel_size_estimates(root, rel);
5049 }
5050 
5051 /*
5052  * set_values_size_estimates
5053  *		Set the size estimates for a base relation that is a values list.
5054  *
5055  * The rel's targetlist and restrictinfo list must have been constructed
5056  * already.
5057  *
5058  * We set the same fields as set_baserel_size_estimates.
5059  */
5060 void
5061 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5062 {
5063 	RangeTblEntry *rte;
5064 
5065 	/* Should only be applied to base relations that are values lists */
5066 	Assert(rel->relid > 0);
5067 	rte = planner_rt_fetch(rel->relid, root);
5068 	Assert(rte->rtekind == RTE_VALUES);
5069 
5070 	/*
5071 	 * Estimate number of rows the values list will return. We know this
5072 	 * precisely based on the list length (well, barring set-returning
5073 	 * functions in list items, but that's a refinement not catered for
5074 	 * anywhere else either).
5075 	 */
5076 	rel->tuples = list_length(rte->values_lists);
5077 
5078 	/* Now estimate number of output rows, etc */
5079 	set_baserel_size_estimates(root, rel);
5080 }
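
/*
 * For example, VALUES (1), (2), (3) produces list_length(values_lists) = 3,
 * so rel->tuples = 3 before any restriction clauses are applied.
 */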
5081 
5082 /*
5083  * set_cte_size_estimates
5084  *		Set the size estimates for a base relation that is a CTE reference.
5085  *
5086  * The rel's targetlist and restrictinfo list must have been constructed
5087  * already, and we need an estimate of the number of rows returned by the CTE
5088  * (if a regular CTE) or the non-recursive term (if a self-reference).
5089  *
5090  * We set the same fields as set_baserel_size_estimates.
5091  */
5092 void
5093 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5094 {
5095 	RangeTblEntry *rte;
5096 
5097 	/* Should only be applied to base relations that are CTE references */
5098 	Assert(rel->relid > 0);
5099 	rte = planner_rt_fetch(rel->relid, root);
5100 	Assert(rte->rtekind == RTE_CTE);
5101 
5102 	if (rte->self_reference)
5103 	{
5104 		/*
5105 		 * In a self-reference, arbitrarily assume the average worktable size
5106 		 * is about 10 times the nonrecursive term's size.
5107 		 */
5108 		rel->tuples = 10 * cte_rows;
5109 	}
5110 	else
5111 	{
5112 		/* Otherwise just believe the CTE's rowcount estimate */
5113 		rel->tuples = cte_rows;
5114 	}
5115 
5116 	/* Now estimate number of output rows, etc */
5117 	set_baserel_size_estimates(root, rel);
5118 }
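
/*
 * For instance (hypothetical): a recursive CTE whose non-recursive term is
 * estimated at 10 rows gets rel->tuples = 100 for its worktable scans,
 * while an ordinary CTE estimated at 10 rows simply gets rel->tuples = 10.
 */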
5119 
5120 /*
5121  * set_namedtuplestore_size_estimates
5122  *		Set the size estimates for a base relation that is a tuplestore reference.
5123  *
5124  * The rel's targetlist and restrictinfo list must have been constructed
5125  * already.
5126  *
5127  * We set the same fields as set_baserel_size_estimates.
5128  */
5129 void
5130 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5131 {
5132 	RangeTblEntry *rte;
5133 
5134 	/* Should only be applied to base relations that are tuplestore references */
5135 	Assert(rel->relid > 0);
5136 	rte = planner_rt_fetch(rel->relid, root);
5137 	Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5138 
5139 	/*
5140 	 * Use the estimate provided by the code which is generating the named
5141 	 * tuplestore.  In some cases, the actual number might be available; in
5142 	 * others the same plan will be re-used, so a "typical" value might be
5143 	 * estimated and used.
5144 	 */
5145 	rel->tuples = rte->enrtuples;
5146 	if (rel->tuples < 0)
5147 		rel->tuples = 1000;
5148 
5149 	/* Now estimate number of output rows, etc */
5150 	set_baserel_size_estimates(root, rel);
5151 }
5152 
5153 /*
5154  * set_result_size_estimates
5155  *		Set the size estimates for an RTE_RESULT base relation
5156  *
5157  * The rel's targetlist and restrictinfo list must have been constructed
5158  * already.
5159  *
5160  * We set the same fields as set_baserel_size_estimates.
5161  */
5162 void
5163 set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5164 {
5165 	/* Should only be applied to RTE_RESULT base relations */
5166 	Assert(rel->relid > 0);
5167 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5168 
5169 	/* RTE_RESULT always generates a single row, natively */
5170 	rel->tuples = 1;
5171 
5172 	/* Now estimate number of output rows, etc */
5173 	set_baserel_size_estimates(root, rel);
5174 }
5175 
5176 /*
5177  * set_foreign_size_estimates
5178  *		Set the size estimates for a base relation that is a foreign table.
5179  *
5180  * There is not a whole lot that we can do here; the foreign-data wrapper
5181  * is responsible for producing useful estimates.  We can do a decent job
5182  * of estimating baserestrictcost, so we set that, and we also set up width
5183  * using what will be purely datatype-driven estimates from the targetlist.
5184  * There is no way to do anything sane with the rows value, so we just put
5185  * a default estimate and hope that the wrapper can improve on it.  The
5186  * wrapper's GetForeignRelSize function will be called momentarily.
5187  *
5188  * The rel's targetlist and restrictinfo list must have been constructed
5189  * already.
5190  */
5191 void
5192 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5193 {
5194 	/* Should only be applied to base relations */
5195 	Assert(rel->relid > 0);
5196 
5197 	rel->rows = 1000;			/* entirely bogus default estimate */
5198 
5199 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5200 
5201 	set_rel_width(root, rel);
5202 }
5203 
5204 
5205 /*
5206  * set_rel_width
5207  *		Set the estimated output width of a base relation.
5208  *
5209  * The estimated output width is the sum of the per-attribute width estimates
5210  * for the actually-referenced columns, plus any PHVs or other expressions
5211  * that have to be calculated at this relation.  This is the amount of data
5212  * we'd need to pass upwards in case of a sort, hash, etc.
5213  *
5214  * This function also sets reltarget->cost, so it's a bit misnamed now.
5215  *
5216  * NB: this works best on plain relations because it prefers to look at
5217  * real Vars.  For subqueries, set_subquery_size_estimates will already have
5218  * copied up whatever per-column estimates were made within the subquery,
5219  * and for other types of rels there isn't much we can do anyway.  We fall
5220  * back on (fairly stupid) datatype-based width estimates if we can't get
5221  * any better number.
5222  *
5223  * The per-attribute width estimates are cached for possible re-use while
5224  * building join relations or post-scan/join pathtargets.
5225  */
5226 static void
5227 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5228 {
5229 	Oid			reloid = planner_rt_fetch(rel->relid, root)->relid;
5230 	int32		tuple_width = 0;
5231 	bool		have_wholerow_var = false;
5232 	ListCell   *lc;
5233 
5234 	/* Vars are assumed to have cost zero, but other exprs do not */
5235 	rel->reltarget->cost.startup = 0;
5236 	rel->reltarget->cost.per_tuple = 0;
5237 
5238 	foreach(lc, rel->reltarget->exprs)
5239 	{
5240 		Node	   *node = (Node *) lfirst(lc);
5241 
5242 		/*
5243 		 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5244 		 * but there are corner cases involving LATERAL references where that
5245 		 * isn't so.  If the Var has the wrong varno, fall through to the
5246 		 * generic case (it doesn't seem worth the trouble to be any smarter).
5247 		 */
5248 		if (IsA(node, Var) &&
5249 			((Var *) node)->varno == rel->relid)
5250 		{
5251 			Var		   *var = (Var *) node;
5252 			int			ndx;
5253 			int32		item_width;
5254 
5255 			Assert(var->varattno >= rel->min_attr);
5256 			Assert(var->varattno <= rel->max_attr);
5257 
5258 			ndx = var->varattno - rel->min_attr;
5259 
5260 			/*
5261 			 * If it's a whole-row Var, we'll deal with it below after we have
5262 			 * already cached as many attr widths as possible.
5263 			 */
5264 			if (var->varattno == 0)
5265 			{
5266 				have_wholerow_var = true;
5267 				continue;
5268 			}
5269 
5270 			/*
5271 			 * The width may have been cached already (especially if it's a
5272 			 * subquery), so don't duplicate effort.
5273 			 */
5274 			if (rel->attr_widths[ndx] > 0)
5275 			{
5276 				tuple_width += rel->attr_widths[ndx];
5277 				continue;
5278 			}
5279 
5280 			/* Try to get column width from statistics */
5281 			if (reloid != InvalidOid && var->varattno > 0)
5282 			{
5283 				item_width = get_attavgwidth(reloid, var->varattno);
5284 				if (item_width > 0)
5285 				{
5286 					rel->attr_widths[ndx] = item_width;
5287 					tuple_width += item_width;
5288 					continue;
5289 				}
5290 			}
5291 
5292 			/*
5293 			 * Not a plain relation, or can't find statistics for it. Estimate
5294 			 * using just the type info.
5295 			 */
5296 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5297 			Assert(item_width > 0);
5298 			rel->attr_widths[ndx] = item_width;
5299 			tuple_width += item_width;
5300 		}
5301 		else if (IsA(node, PlaceHolderVar))
5302 		{
5303 			/*
5304 			 * We will need to evaluate the PHV's contained expression while
5305 			 * scanning this rel, so be sure to include it in reltarget->cost.
5306 			 */
5307 			PlaceHolderVar *phv = (PlaceHolderVar *) node;
5308 			PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5309 			QualCost	cost;
5310 
5311 			tuple_width += phinfo->ph_width;
5312 			cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5313 			rel->reltarget->cost.startup += cost.startup;
5314 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5315 		}
5316 		else
5317 		{
5318 			/*
5319 			 * We could be looking at an expression pulled up from a subquery,
5320 			 * or a ROW() representing a whole-row child Var, etc.  Do what we
5321 			 * can using the expression type information.
5322 			 */
5323 			int32		item_width;
5324 			QualCost	cost;
5325 
5326 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5327 			Assert(item_width > 0);
5328 			tuple_width += item_width;
5329 			/* Not entirely clear if we need to account for cost, but do so */
5330 			cost_qual_eval_node(&cost, node, root);
5331 			rel->reltarget->cost.startup += cost.startup;
5332 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5333 		}
5334 	}
5335 
5336 	/*
5337 	 * If we have a whole-row reference, estimate its width as the sum of
5338 	 * per-column widths plus heap tuple header overhead.
5339 	 */
5340 	if (have_wholerow_var)
5341 	{
5342 		int32		wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5343 
5344 		if (reloid != InvalidOid)
5345 		{
5346 			/* Real relation, so estimate true tuple width */
5347 			wholerow_width += get_relation_data_width(reloid,
5348 													  rel->attr_widths - rel->min_attr);
5349 		}
5350 		else
5351 		{
5352 			/* Do what we can with info for a phony rel */
5353 			AttrNumber	i;
5354 
5355 			for (i = 1; i <= rel->max_attr; i++)
5356 				wholerow_width += rel->attr_widths[i - rel->min_attr];
5357 		}
5358 
5359 		rel->attr_widths[0 - rel->min_attr] = wholerow_width;
5360 
5361 		/*
5362 		 * Include the whole-row Var as part of the output tuple.  Yes, that
5363 		 * really is what happens at runtime.
5364 		 */
5365 		tuple_width += wholerow_width;
5366 	}
5367 
5368 	Assert(tuple_width >= 0);
5369 	rel->reltarget->width = tuple_width;
5370 }
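
/*
 * A small illustration (columns hypothetical): if the reltarget references
 * an int4 column whose cached or statistical width is 4 and a text column
 * whose pg_statistic average width is 20, the loop above sums them and sets
 * reltarget->width = 24; a whole-row Var would further add the MAXALIGN'd
 * heap tuple header plus all per-column widths.
 */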
5371 
5372 /*
5373  * set_pathtarget_cost_width
5374  *		Set the estimated eval cost and output width of a PathTarget tlist.
5375  *
5376  * As a notational convenience, returns the same PathTarget pointer passed in.
5377  *
5378  * Most, though not quite all, uses of this function occur after we've run
5379  * set_rel_width() for base relations; so we can usually obtain cached width
5380  * estimates for Vars.  If we can't, fall back on datatype-based width
5381  * estimates.  Present early-planning uses of PathTargets don't need accurate
5382  * widths badly enough to justify going to the catalogs for better data.
5383  */
5384 PathTarget *
5385 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5386 {
5387 	int32		tuple_width = 0;
5388 	ListCell   *lc;
5389 
5390 	/* Vars are assumed to have cost zero, but other exprs do not */
5391 	target->cost.startup = 0;
5392 	target->cost.per_tuple = 0;
5393 
5394 	foreach(lc, target->exprs)
5395 	{
5396 		Node	   *node = (Node *) lfirst(lc);
5397 
5398 		if (IsA(node, Var))
5399 		{
5400 			Var		   *var = (Var *) node;
5401 			int32		item_width;
5402 
5403 			/* We should not see any upper-level Vars here */
5404 			Assert(var->varlevelsup == 0);
5405 
5406 			/* Try to get data from RelOptInfo cache */
5407 			if (var->varno < root->simple_rel_array_size)
5408 			{
5409 				RelOptInfo *rel = root->simple_rel_array[var->varno];
5410 
5411 				if (rel != NULL &&
5412 					var->varattno >= rel->min_attr &&
5413 					var->varattno <= rel->max_attr)
5414 				{
5415 					int			ndx = var->varattno - rel->min_attr;
5416 
5417 					if (rel->attr_widths[ndx] > 0)
5418 					{
5419 						tuple_width += rel->attr_widths[ndx];
5420 						continue;
5421 					}
5422 				}
5423 			}
5424 
5425 			/*
5426 			 * No cached data available, so estimate using just the type info.
5427 			 */
5428 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5429 			Assert(item_width > 0);
5430 			tuple_width += item_width;
5431 		}
5432 		else
5433 		{
5434 			/*
5435 			 * Handle general expressions using type info.
5436 			 */
5437 			int32		item_width;
5438 			QualCost	cost;
5439 
5440 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5441 			Assert(item_width > 0);
5442 			tuple_width += item_width;
5443 
5444 			/* Account for cost, too */
5445 			cost_qual_eval_node(&cost, node, root);
5446 			target->cost.startup += cost.startup;
5447 			target->cost.per_tuple += cost.per_tuple;
5448 		}
5449 	}
5450 
5451 	Assert(tuple_width >= 0);
5452 	target->width = tuple_width;
5453 
5454 	return target;
5455 }
5456 
5457 /*
5458  * relation_byte_size
5459  *	  Estimate the storage space in bytes for a given number of tuples
5460  *	  of a given width (size in bytes).
5461  */
5462 static double
5463 relation_byte_size(double tuples, int width)
5464 {
5465 	return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5466 }
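
/*
 * A worked example, assuming a 64-bit build where MAXALIGN rounds up to a
 * multiple of 8 and SizeofHeapTupleHeader is 23 bytes (MAXALIGN'd to 24):
 * 1e6 tuples of width 40 are costed as 1e6 * (40 + 24) = 64,000,000 bytes.
 */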
5467 
5468 /*
5469  * page_size
5470  *	  Returns an estimate of the number of pages covered by a given
5471  *	  number of tuples of a given width (size in bytes).
5472  */
5473 static double
5474 page_size(double tuples, int width)
5475 {
5476 	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5477 }
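
/*
 * Continuing the example above with the default BLCKSZ of 8192:
 * ceil(64,000,000 / 8192) = 7813 pages.
 */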
5478 
5479 /*
5480  * Estimate the fraction of the work that each worker will do given the
5481  * number of workers budgeted for the path.
5482  */
5483 static double
5484 get_parallel_divisor(Path *path)
5485 {
5486 	double		parallel_divisor = path->parallel_workers;
5487 
5488 	/*
5489 	 * Early experience with parallel query suggests that when there is only
5490 	 * one worker, the leader often makes a very substantial contribution to
5491 	 * executing the parallel portion of the plan, but as more workers are
5492 	 * added, it does less and less, because it's busy reading tuples from the
5493 	 * workers and doing whatever non-parallel post-processing is needed.  By
5494 	 * the time we reach 4 workers, the leader no longer makes a meaningful
5495 	 * contribution.  Thus, for now, estimate that the leader spends 30% of
5496 	 * its time servicing each worker, and the remainder executing the
5497 	 * parallel plan.
5498 	 */
5499 	if (parallel_leader_participation)
5500 	{
5501 		double		leader_contribution;
5502 
5503 		leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5504 		if (leader_contribution > 0)
5505 			parallel_divisor += leader_contribution;
5506 	}
5507 
5508 	return parallel_divisor;
5509 }
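
/*
 * Worked examples of the divisor: with parallel_leader_participation on and
 * 2 workers, leader_contribution = 1.0 - 0.3 * 2 = 0.4, so the divisor is
 * 2.4; with 4 or more workers the contribution is no longer positive and
 * the divisor is just the worker count.
 */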
5510 
5511 /*
5512  * compute_bitmap_pages
5513  *
5514  * compute number of pages fetched from heap in bitmap heap scan.
5515  */
5516 double
5517 compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
5518 					 int loop_count, Cost *cost, double *tuple)
5519 {
5520 	Cost		indexTotalCost;
5521 	Selectivity indexSelectivity;
5522 	double		T;
5523 	double		pages_fetched;
5524 	double		tuples_fetched;
5525 	double		heap_pages;
5526 	long		maxentries;
5527 
5528 	/*
5529 	 * Fetch total cost of obtaining the bitmap, as well as its total
5530 	 * selectivity.
5531 	 */
5532 	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
5533 
5534 	/*
5535 	 * Estimate number of main-table pages fetched.
5536 	 */
5537 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
5538 
5539 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
5540 
5541 	/*
5542 	 * For a single scan, the number of heap pages that need to be fetched is
5543 	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
5544 	 * re-reads needed).
5545 	 */
5546 	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
5547 
5548 	/*
5549 	 * Calculate the number of pages fetched from the heap.  Then, based on
5550 	 * the current work_mem setting, estimate maxentries for the bitmap.
5551 	 * (Note that we always do this calculation based on the number of pages
5552 	 * that would be fetched in a single iteration, even if loop_count > 1.
5553 	 * That's correct, because only that number of entries will be stored in
5554 	 * the bitmap at one time.)
5555 	 */
5556 	heap_pages = Min(pages_fetched, baserel->pages);
5557 	maxentries = tbm_calculate_entries(work_mem * 1024L);
5558 
5559 	if (loop_count > 1)
5560 	{
5561 		/*
5562 		 * For repeated bitmap scans, scale up the number of tuples fetched in
5563 		 * the Mackert and Lohman formula by the number of scans, so that we
5564 		 * estimate the number of pages fetched by all the scans. Then
5565 		 * pro-rate for one scan.
5566 		 */
5567 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
5568 											baserel->pages,
5569 											get_indexpath_pages(bitmapqual),
5570 											root);
5571 		pages_fetched /= loop_count;
5572 	}
5573 
5574 	if (pages_fetched >= T)
5575 		pages_fetched = T;
5576 	else
5577 		pages_fetched = ceil(pages_fetched);
5578 
5579 	if (maxentries < heap_pages)
5580 	{
5581 		double		exact_pages;
5582 		double		lossy_pages;
5583 
5584 		/*
5585 		 * Crude approximation of the number of lossy pages.  Because of the
5586 		 * way tbm_lossify() is coded, the number of lossy pages increases
5587 		 * very sharply as soon as we run short of memory; this formula has
5588 		 * that property and seems to perform adequately in testing, but it's
5589 		 * possible we could do better somehow.
5590 		 */
5591 		lossy_pages = Max(0, heap_pages - maxentries / 2);
5592 		exact_pages = heap_pages - lossy_pages;
5593 
5594 		/*
5595 		 * If there are lossy pages then recompute the number of tuples
5596 		 * processed by the bitmap heap node.  We assume here that the chance
5597 		 * of a given tuple coming from an exact page is the same as the
5598 		 * chance that a given page is exact.  This might not be true, but
5599 		 * it's not clear how we can do any better.
5600 		 */
5601 		if (lossy_pages > 0)
5602 			tuples_fetched =
5603 				clamp_row_est(indexSelectivity *
5604 							  (exact_pages / heap_pages) * baserel->tuples +
5605 							  (lossy_pages / heap_pages) * baserel->tuples);
5606 	}
5607 
5608 	if (cost)
5609 		*cost = indexTotalCost;
5610 	if (tuple)
5611 		*tuple = tuples_fetched;
5612 
5613 	return pages_fetched;
5614 }
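
/*
 * A worked example with hypothetical inputs: for a single scan of a table
 * with T = 10000 pages and tuples_fetched = 1000, the single-scan formula
 * gives (2 * 10000 * 1000) / (2 * 10000 + 1000) ~= 952.4 heap pages (953
 * after the final ceil()).  If work_mem only allows maxentries = 400, then
 * lossy_pages ~= 952.4 - 400/2 = 752.4 and exact_pages ~= 200, and
 * tuples_fetched is rescaled by that exact/lossy page mix as above.
 */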
5615