1 /*-------------------------------------------------------------------------
2  *
3  * costsize.c
4  *	  Routines to compute (and set) relation sizes and path costs
5  *
6  * Path costs are measured in arbitrary units established by these basic
7  * parameters:
8  *
9  *	seq_page_cost		Cost of a sequential page fetch
10  *	random_page_cost	Cost of a non-sequential page fetch
11  *	cpu_tuple_cost		Cost of typical CPU time to process a tuple
12  *	cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
13  *	cpu_operator_cost	Cost of CPU time to execute an operator or function
14  *	parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15  *	parallel_setup_cost Cost of setting up shared memory for parallelism
16  *
17  * We expect that the kernel will typically do some amount of read-ahead
18  * optimization; this in conjunction with seek costs means that seq_page_cost
19  * is normally considerably less than random_page_cost.  (However, if the
20  * database is fully cached in RAM, it is reasonable to set them equal.)
21  *
22  * We also use a rough estimate "effective_cache_size" of the number of
23  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
24  * NBuffers for this purpose because that would ignore the effects of
25  * the kernel's disk cache.)
26  *
27  * Obviously, taking constants for these values is an oversimplification,
28  * but it's tough enough to get any useful estimates even at this level of
29  * detail.  Note that all of these parameters are user-settable, in case
30  * the default values are drastically off for a particular platform.
31  *
32  * seq_page_cost and random_page_cost can also be overridden for an individual
33  * tablespace, in case some data is on a fast disk and other data is on a slow
34  * disk.  Per-tablespace overrides never apply to temporary work files such as
35  * an external sort or a materialize node that overflows work_mem.
36  *
37  * We compute two separate costs for each path:
38  *		total_cost: total estimated cost to fetch all tuples
39  *		startup_cost: cost that is expended before first tuple is fetched
40  * In some scenarios, such as when there is a LIMIT or we are implementing
41  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42  * path's result.  A caller can estimate the cost of fetching a partial
43  * result by interpolating between startup_cost and total_cost.  In detail:
44  *		actual_cost = startup_cost +
45  *			(total_cost - startup_cost) * tuples_to_fetch / path->rows;
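 * For example (purely illustrative numbers): with startup_cost = 10,
 * total_cost = 110 and path->rows = 1000, fetching the first 100 tuples is
 * estimated at 10 + (110 - 10) * 100 / 1000 = 20.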
46  * Note that a base relation's rows count (and, by extension, plan_rows for
47  * plan nodes below the LIMIT node) is set without regard to any LIMIT, so
48  * that this equation works properly.  (Note: while path->rows is never zero
49  * for ordinary relations, it is zero for paths for provably-empty relations,
50  * so beware of division-by-zero.)	The LIMIT is applied as a top-level
51  * plan node.
52  *
53  * For largely historical reasons, most of the routines in this module use
54  * the passed result Path only to store their results (rows, startup_cost and
55  * total_cost) into.  All the input data they need is passed as separate
56  * parameters, even though much of it could be extracted from the Path.
57  * An exception is made for the cost_XXXjoin() routines, which expect all
58  * the other fields of the passed XXXPath to be filled in, and similarly
59  * cost_index() assumes the passed IndexPath is valid except for its output
60  * values.
61  *
62  *
63  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
64  * Portions Copyright (c) 1994, Regents of the University of California
65  *
66  * IDENTIFICATION
67  *	  src/backend/optimizer/path/costsize.c
68  *
69  *-------------------------------------------------------------------------
70  */
71 
72 #include "postgres.h"
73 
74 #ifdef _MSC_VER
75 #include <float.h>				/* for _isnan */
76 #endif
77 #include <math.h>
78 
79 #include "access/amapi.h"
80 #include "access/htup_details.h"
81 #include "access/tsmapi.h"
82 #include "executor/executor.h"
83 #include "executor/nodeHash.h"
84 #include "miscadmin.h"
85 #include "nodes/nodeFuncs.h"
86 #include "optimizer/clauses.h"
87 #include "optimizer/cost.h"
88 #include "optimizer/pathnode.h"
89 #include "optimizer/paths.h"
90 #include "optimizer/placeholder.h"
91 #include "optimizer/plancat.h"
92 #include "optimizer/planmain.h"
93 #include "optimizer/restrictinfo.h"
94 #include "parser/parsetree.h"
95 #include "utils/lsyscache.h"
96 #include "utils/selfuncs.h"
97 #include "utils/spccache.h"
98 #include "utils/tuplesort.h"
99 
100 
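/* base-2 logarithm of x: natural log divided by ln(2) = 0.693147... */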
101 #define LOG2(x)  (log(x) / 0.693147180559945)
102 
103 /*
104  * Append and MergeAppend nodes are less expensive than some other operations
105  * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
106  * per-tuple cost as cpu_tuple_cost multiplied by this value.
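 * (Illustrative only: with the default cpu_tuple_cost of 0.01, that works out
 * to 0.005 per tuple passing through an Append or MergeAppend.)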
107  */
108 #define APPEND_CPU_COST_MULTIPLIER 0.5
109 
110 
111 double		seq_page_cost = DEFAULT_SEQ_PAGE_COST;
112 double		random_page_cost = DEFAULT_RANDOM_PAGE_COST;
113 double		cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
114 double		cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
115 double		cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
116 double		parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
117 double		parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
118 
119 int			effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
120 
121 Cost		disable_cost = 1.0e10;
122 
123 int			max_parallel_workers_per_gather = 2;
124 
125 bool		enable_seqscan = true;
126 bool		enable_indexscan = true;
127 bool		enable_indexonlyscan = true;
128 bool		enable_bitmapscan = true;
129 bool		enable_tidscan = true;
130 bool		enable_sort = true;
131 bool		enable_hashagg = true;
132 bool		enable_nestloop = true;
133 bool		enable_material = true;
134 bool		enable_mergejoin = true;
135 bool		enable_hashjoin = true;
136 bool		enable_gathermerge = true;
137 bool		enable_partitionwise_join = false;
138 bool		enable_partitionwise_aggregate = false;
139 bool		enable_parallel_append = true;
140 bool		enable_parallel_hash = true;
141 bool		enable_partition_pruning = true;
142 
143 typedef struct
144 {
145 	PlannerInfo *root;
146 	QualCost	total;
147 } cost_qual_eval_context;
148 
149 static List *extract_nonindex_conditions(List *qual_clauses, List *indexquals);
150 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
151 			   RestrictInfo *rinfo,
152 			   PathKey *pathkey);
153 static void cost_rescan(PlannerInfo *root, Path *path,
154 			Cost *rescan_startup_cost, Cost *rescan_total_cost);
155 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
156 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
157 						  ParamPathInfo *param_info,
158 						  QualCost *qpqual_cost);
159 static bool has_indexed_join_quals(NestPath *joinpath);
160 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
161 				   List *quals);
162 static double calc_joinrel_size_estimate(PlannerInfo *root,
163 						   RelOptInfo *joinrel,
164 						   RelOptInfo *outer_rel,
165 						   RelOptInfo *inner_rel,
166 						   double outer_rows,
167 						   double inner_rows,
168 						   SpecialJoinInfo *sjinfo,
169 						   List *restrictlist);
170 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
171 								 Relids outer_relids,
172 								 Relids inner_relids,
173 								 SpecialJoinInfo *sjinfo,
174 								 List **restrictlist);
175 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
176 					   int parallel_workers);
177 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
178 static double relation_byte_size(double tuples, int width);
179 static double page_size(double tuples, int width);
180 static double get_parallel_divisor(Path *path);
181 
182 
183 /*
184  * clamp_row_est
185  *		Force a row-count estimate to a sane value.
186  */
187 double
188 clamp_row_est(double nrows)
189 {
190 	/*
191 	 * Force estimate to be at least one row, to make explain output look
192 	 * better and to avoid possible divide-by-zero when interpolating costs.
193 	 * Make it an integer, too.
194 	 */
195 	if (nrows <= 1.0)
196 		nrows = 1.0;
197 	else
198 		nrows = rint(nrows);
199 
200 	return nrows;
201 }
202 
203 
204 /*
205  * cost_seqscan
206  *	  Determines and returns the cost of scanning a relation sequentially.
207  *
208  * 'baserel' is the relation to be scanned
209  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
210  */
211 void
212 cost_seqscan(Path *path, PlannerInfo *root,
213 			 RelOptInfo *baserel, ParamPathInfo *param_info)
214 {
215 	Cost		startup_cost = 0;
216 	Cost		cpu_run_cost;
217 	Cost		disk_run_cost;
218 	double		spc_seq_page_cost;
219 	QualCost	qpqual_cost;
220 	Cost		cpu_per_tuple;
221 
222 	/* Should only be applied to base relations */
223 	Assert(baserel->relid > 0);
224 	Assert(baserel->rtekind == RTE_RELATION);
225 
226 	/* Mark the path with the correct row estimate */
227 	if (param_info)
228 		path->rows = param_info->ppi_rows;
229 	else
230 		path->rows = baserel->rows;
231 
232 	if (!enable_seqscan)
233 		startup_cost += disable_cost;
234 
235 	/* fetch estimated page cost for tablespace containing table */
236 	get_tablespace_page_costs(baserel->reltablespace,
237 							  NULL,
238 							  &spc_seq_page_cost);
239 
240 	/*
241 	 * disk costs
242 	 */
243 	disk_run_cost = spc_seq_page_cost * baserel->pages;
244 
245 	/* CPU costs */
246 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
247 
248 	startup_cost += qpqual_cost.startup;
249 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
250 	cpu_run_cost = cpu_per_tuple * baserel->tuples;
251 	/* tlist eval costs are paid per output row, not per tuple scanned */
252 	startup_cost += path->pathtarget->cost.startup;
253 	cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
254 
255 	/* Adjust costing for parallelism, if used. */
256 	if (path->parallel_workers > 0)
257 	{
258 		double		parallel_divisor = get_parallel_divisor(path);
259 
260 		/* The CPU cost is divided among all the workers. */
261 		cpu_run_cost /= parallel_divisor;
262 
263 		/*
264 		 * It may be possible to amortize some of the I/O cost, but probably
265 		 * not very much, because most operating systems already do aggressive
266 		 * prefetching.  For now, we assume that the disk run cost can't be
267 		 * amortized at all.
268 		 */
269 
270 		/*
271 		 * In the case of a parallel plan, the row count needs to represent
272 		 * the number of tuples processed per worker.
273 		 */
274 		path->rows = clamp_row_est(path->rows / parallel_divisor);
275 	}
276 
277 	path->startup_cost = startup_cost;
278 	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
279 }
280 
281 /*
282  * cost_samplescan
283  *	  Determines and returns the cost of scanning a relation using sampling.
284  *
285  * 'baserel' is the relation to be scanned
286  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
287  */
288 void
289 cost_samplescan(Path *path, PlannerInfo *root,
290 				RelOptInfo *baserel, ParamPathInfo *param_info)
291 {
292 	Cost		startup_cost = 0;
293 	Cost		run_cost = 0;
294 	RangeTblEntry *rte;
295 	TableSampleClause *tsc;
296 	TsmRoutine *tsm;
297 	double		spc_seq_page_cost,
298 				spc_random_page_cost,
299 				spc_page_cost;
300 	QualCost	qpqual_cost;
301 	Cost		cpu_per_tuple;
302 
303 	/* Should only be applied to base relations with tablesample clauses */
304 	Assert(baserel->relid > 0);
305 	rte = planner_rt_fetch(baserel->relid, root);
306 	Assert(rte->rtekind == RTE_RELATION);
307 	tsc = rte->tablesample;
308 	Assert(tsc != NULL);
309 	tsm = GetTsmRoutine(tsc->tsmhandler);
310 
311 	/* Mark the path with the correct row estimate */
312 	if (param_info)
313 		path->rows = param_info->ppi_rows;
314 	else
315 		path->rows = baserel->rows;
316 
317 	/* fetch estimated page cost for tablespace containing table */
318 	get_tablespace_page_costs(baserel->reltablespace,
319 							  &spc_random_page_cost,
320 							  &spc_seq_page_cost);
321 
322 	/* if NextSampleBlock is used, assume random access, else sequential */
323 	spc_page_cost = (tsm->NextSampleBlock != NULL) ?
324 		spc_random_page_cost : spc_seq_page_cost;
325 
326 	/*
327 	 * disk costs (recall that baserel->pages has already been set to the
328 	 * number of pages the sampling method will visit)
329 	 */
330 	run_cost += spc_page_cost * baserel->pages;
331 
332 	/*
333 	 * CPU costs (recall that baserel->tuples has already been set to the
334 	 * number of tuples the sampling method will select).  Note that we ignore
335 	 * execution cost of the TABLESAMPLE parameter expressions; they will be
336 	 * evaluated only once per scan, and in most usages they'll likely be
337 	 * simple constants anyway.  We also don't charge anything for the
338 	 * calculations the sampling method might do internally.
339 	 */
340 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
341 
342 	startup_cost += qpqual_cost.startup;
343 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
344 	run_cost += cpu_per_tuple * baserel->tuples;
345 	/* tlist eval costs are paid per output row, not per tuple scanned */
346 	startup_cost += path->pathtarget->cost.startup;
347 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
348 
349 	path->startup_cost = startup_cost;
350 	path->total_cost = startup_cost + run_cost;
351 }
352 
353 /*
354  * cost_gather
355  *	  Determines and returns the cost of gather path.
356  *
357  * 'rel' is the relation to be operated upon
358  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
359  * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
360  * both 'rel' and 'param_info'.  This is useful when the path doesn't exactly
361  * correspond to any particular RelOptInfo.
362  */
363 void
364 cost_gather(GatherPath *path, PlannerInfo *root,
365 			RelOptInfo *rel, ParamPathInfo *param_info,
366 			double *rows)
367 {
368 	Cost		startup_cost = 0;
369 	Cost		run_cost = 0;
370 
371 	/* Mark the path with the correct row estimate */
372 	if (rows)
373 		path->path.rows = *rows;
374 	else if (param_info)
375 		path->path.rows = param_info->ppi_rows;
376 	else
377 		path->path.rows = rel->rows;
378 
379 	startup_cost = path->subpath->startup_cost;
380 
381 	run_cost = path->subpath->total_cost - path->subpath->startup_cost;
382 
383 	/* Parallel setup and communication cost. */
384 	startup_cost += parallel_setup_cost;
385 	run_cost += parallel_tuple_cost * path->path.rows;
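	/*
	 * Illustrative example (not part of the original comments): with the
	 * default parallel_setup_cost = 1000 and parallel_tuple_cost = 0.1, a
	 * Gather returning 100000 rows is charged 1000 at startup plus 10000 at
	 * run time, on top of the subpath's own cost.
	 */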
386 
387 	path->path.startup_cost = startup_cost;
388 	path->path.total_cost = (startup_cost + run_cost);
389 }
390 
391 /*
392  * cost_gather_merge
393  *	  Determines and returns the cost of gather merge path.
394  *
395  * GatherMerge merges several pre-sorted input streams, using a heap that at
396  * any given instant holds the next tuple from each stream. If there are N
397  * streams, we need about N*log2(N) tuple comparisons to construct the heap at
398  * startup, and then for each output tuple, about log2(N) comparisons to
399  * replace the top heap entry with the next tuple from the same stream.
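 * For example (illustrative only): with 3 workers plus the leader, N = 4, so
 * building the heap costs about 4 * log2(4) = 8 comparisons, and each output
 * tuple thereafter costs about log2(4) = 2 comparisons.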
400  */
401 void
402 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
403 				  RelOptInfo *rel, ParamPathInfo *param_info,
404 				  Cost input_startup_cost, Cost input_total_cost,
405 				  double *rows)
406 {
407 	Cost		startup_cost = 0;
408 	Cost		run_cost = 0;
409 	Cost		comparison_cost;
410 	double		N;
411 	double		logN;
412 
413 	/* Mark the path with the correct row estimate */
414 	if (rows)
415 		path->path.rows = *rows;
416 	else if (param_info)
417 		path->path.rows = param_info->ppi_rows;
418 	else
419 		path->path.rows = rel->rows;
420 
421 	if (!enable_gathermerge)
422 		startup_cost += disable_cost;
423 
424 	/*
425 	 * Add one to the number of workers to account for the leader.  This might
426 	 * be overgenerous since the leader will do less work than other workers
427 	 * in typical cases, but we'll go with it for now.
428 	 */
429 	Assert(path->num_workers > 0);
430 	N = (double) path->num_workers + 1;
431 	logN = LOG2(N);
432 
433 	/* Assumed cost per tuple comparison */
434 	comparison_cost = 2.0 * cpu_operator_cost;
435 
436 	/* Heap creation cost */
437 	startup_cost += comparison_cost * N * logN;
438 
439 	/* Per-tuple heap maintenance cost */
440 	run_cost += path->path.rows * comparison_cost * logN;
441 
442 	/* small cost for heap management, like cost_merge_append */
443 	run_cost += cpu_operator_cost * path->path.rows;
444 
445 	/*
446 	 * Parallel setup and communication cost.  Since Gather Merge, unlike
447 	 * Gather, requires us to block until a tuple is available from every
448 	 * worker, we bump the IPC cost up a little bit as compared with Gather.
449 	 * For lack of a better idea, charge an extra 5%.
450 	 */
451 	startup_cost += parallel_setup_cost;
452 	run_cost += parallel_tuple_cost * path->path.rows * 1.05;
453 
454 	path->path.startup_cost = startup_cost + input_startup_cost;
455 	path->path.total_cost = (startup_cost + run_cost + input_total_cost);
456 }
457 
458 /*
459  * cost_index
460  *	  Determines and returns the cost of scanning a relation using an index.
461  *
462  * 'path' describes the indexscan under consideration, and is complete
463  *		except for the fields to be set by this routine
464  * 'loop_count' is the number of repetitions of the indexscan to factor into
465  *		estimates of caching behavior
466  *
467  * In addition to rows, startup_cost and total_cost, cost_index() sets the
468  * path's indextotalcost and indexselectivity fields.  These values will be
469  * needed if the IndexPath is used in a BitmapIndexScan.
470  *
471  * NOTE: path->indexquals must contain only clauses usable as index
472  * restrictions.  Any additional quals evaluated as qpquals may reduce the
473  * number of returned tuples, but they won't reduce the number of tuples
474  * we have to fetch from the table, so they don't reduce the scan cost.
475  */
476 void
477 cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
478 		   bool partial_path)
479 {
480 	IndexOptInfo *index = path->indexinfo;
481 	RelOptInfo *baserel = index->rel;
482 	bool		indexonly = (path->path.pathtype == T_IndexOnlyScan);
483 	amcostestimate_function amcostestimate;
484 	List	   *qpquals;
485 	Cost		startup_cost = 0;
486 	Cost		run_cost = 0;
487 	Cost		cpu_run_cost = 0;
488 	Cost		indexStartupCost;
489 	Cost		indexTotalCost;
490 	Selectivity indexSelectivity;
491 	double		indexCorrelation,
492 				csquared;
493 	double		spc_seq_page_cost,
494 				spc_random_page_cost;
495 	Cost		min_IO_cost,
496 				max_IO_cost;
497 	QualCost	qpqual_cost;
498 	Cost		cpu_per_tuple;
499 	double		tuples_fetched;
500 	double		pages_fetched;
501 	double		rand_heap_pages;
502 	double		index_pages;
503 
504 	/* Should only be applied to base relations */
505 	Assert(IsA(baserel, RelOptInfo) &&
506 		   IsA(index, IndexOptInfo));
507 	Assert(baserel->relid > 0);
508 	Assert(baserel->rtekind == RTE_RELATION);
509 
510 	/*
511 	 * Mark the path with the correct row estimate, and identify which quals
512 	 * will need to be enforced as qpquals.  We need not check any quals that
513 	 * are implied by the index's predicate, so we can use indrestrictinfo not
514 	 * baserestrictinfo as the list of relevant restriction clauses for the
515 	 * rel.
516 	 */
517 	if (path->path.param_info)
518 	{
519 		path->path.rows = path->path.param_info->ppi_rows;
520 		/* qpquals come from the rel's restriction clauses and ppi_clauses */
521 		qpquals = list_concat(
522 							  extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
523 														  path->indexquals),
524 							  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
525 														  path->indexquals));
526 	}
527 	else
528 	{
529 		path->path.rows = baserel->rows;
530 		/* qpquals come from just the rel's restriction clauses */
531 		qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
532 											  path->indexquals);
533 	}
534 
535 	if (!enable_indexscan)
536 		startup_cost += disable_cost;
537 	/* we don't need to check enable_indexonlyscan; indxpath.c does that */
538 
539 	/*
540 	 * Call index-access-method-specific code to estimate the processing cost
541 	 * for scanning the index, as well as the selectivity of the index (ie,
542 	 * the fraction of main-table tuples we will have to retrieve) and its
543 	 * correlation to the main-table tuple order.  We need a cast here because
544 	 * relation.h uses a weak function type to avoid including amapi.h.
545 	 */
546 	amcostestimate = (amcostestimate_function) index->amcostestimate;
547 	amcostestimate(root, path, loop_count,
548 				   &indexStartupCost, &indexTotalCost,
549 				   &indexSelectivity, &indexCorrelation,
550 				   &index_pages);
551 
552 	/*
553 	 * Save amcostestimate's results for possible use in bitmap scan planning.
554 	 * We don't bother to save indexStartupCost or indexCorrelation, because a
555 	 * bitmap scan doesn't care about either.
556 	 */
557 	path->indextotalcost = indexTotalCost;
558 	path->indexselectivity = indexSelectivity;
559 
560 	/* all costs for touching index itself included here */
561 	startup_cost += indexStartupCost;
562 	run_cost += indexTotalCost - indexStartupCost;
563 
564 	/* estimate number of main-table tuples fetched */
565 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
566 
567 	/* fetch estimated page costs for tablespace containing table */
568 	get_tablespace_page_costs(baserel->reltablespace,
569 							  &spc_random_page_cost,
570 							  &spc_seq_page_cost);
571 
572 	/*----------
573 	 * Estimate number of main-table pages fetched, and compute I/O cost.
574 	 *
575 	 * When the index ordering is uncorrelated with the table ordering,
576 	 * we use an approximation proposed by Mackert and Lohman (see
577 	 * index_pages_fetched() for details) to compute the number of pages
578 	 * fetched, and then charge spc_random_page_cost per page fetched.
579 	 *
580 	 * When the index ordering is exactly correlated with the table ordering
581 	 * (just after a CLUSTER, for example), the number of pages fetched should
582 	 * be exactly selectivity * table_size.  What's more, all but the first
583 	 * will be sequential fetches, not the random fetches that occur in the
584 	 * uncorrelated case.  So if the number of pages is more than 1, we
585 	 * ought to charge
586 	 *		spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
587 	 * For partially-correlated indexes, we ought to charge somewhere between
588 	 * these two estimates.  We currently interpolate linearly between the
589 	 * estimates based on the correlation squared (XXX is that appropriate?).
590 	 *
591 	 * If it's an index-only scan, then we will not need to fetch any heap
592 	 * pages for which the visibility map shows all tuples are visible.
593 	 * Hence, reduce the estimated number of heap fetches accordingly.
594 	 * We use the measured fraction of the entire heap that is all-visible,
595 	 * which might not be particularly relevant to the subset of the heap
596 	 * that this query will fetch; but it's not clear how to do better.
597 	 *----------
598 	 */
599 	if (loop_count > 1)
600 	{
601 		/*
602 		 * For repeated indexscans, the appropriate estimate for the
603 		 * uncorrelated case is to scale up the number of tuples fetched in
604 		 * the Mackert and Lohman formula by the number of scans, so that we
605 		 * estimate the number of pages fetched by all the scans; then
606 		 * pro-rate the costs for one scan.  In this case we assume all the
607 		 * fetches are random accesses.
608 		 */
609 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
610 											baserel->pages,
611 											(double) index->pages,
612 											root);
613 
614 		if (indexonly)
615 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
616 
617 		rand_heap_pages = pages_fetched;
618 
619 		max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
620 
621 		/*
622 		 * In the perfectly correlated case, the number of pages touched by
623 		 * each scan is selectivity * table_size, and we can use the Mackert
624 		 * and Lohman formula at the page level to estimate how much work is
625 		 * saved by caching across scans.  We still assume all the fetches are
626 		 * random, though, which is an overestimate that's hard to correct for
627 		 * without double-counting the cache effects.  (But in most cases
628 		 * where such a plan is actually interesting, only one page would get
629 		 * fetched per scan anyway, so it shouldn't matter much.)
630 		 */
631 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
632 
633 		pages_fetched = index_pages_fetched(pages_fetched * loop_count,
634 											baserel->pages,
635 											(double) index->pages,
636 											root);
637 
638 		if (indexonly)
639 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
640 
641 		min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
642 	}
643 	else
644 	{
645 		/*
646 		 * Normal case: apply the Mackert and Lohman formula, and then
647 		 * interpolate between that and the correlation-derived result.
648 		 */
649 		pages_fetched = index_pages_fetched(tuples_fetched,
650 											baserel->pages,
651 											(double) index->pages,
652 											root);
653 
654 		if (indexonly)
655 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
656 
657 		rand_heap_pages = pages_fetched;
658 
659 		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
660 		max_IO_cost = pages_fetched * spc_random_page_cost;
661 
662 		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
663 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
664 
665 		if (indexonly)
666 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
667 
668 		if (pages_fetched > 0)
669 		{
670 			min_IO_cost = spc_random_page_cost;
671 			if (pages_fetched > 1)
672 				min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
673 		}
674 		else
675 			min_IO_cost = 0;
676 	}
677 
678 	if (partial_path)
679 	{
680 		/*
681 		 * For index only scans compute workers based on number of index pages
682 		 * fetched; the number of heap pages we fetch might be so small as to
683 		 * effectively rule out parallelism, which we don't want to do.
684 		 */
685 		if (indexonly)
686 			rand_heap_pages = -1;
687 
688 		/*
689 		 * Estimate the number of parallel workers required to scan the index.
690 		 * Use the number of heap pages computed on the assumption that heap
691 		 * fetches won't be sequential, since for parallel scans the pages are
692 		 * accessed in random order.
693 		 */
694 		path->path.parallel_workers = compute_parallel_worker(baserel,
695 															  rand_heap_pages,
696 															  index_pages,
697 															  max_parallel_workers_per_gather);
698 
699 		/*
700 		 * Fall out if workers can't be assigned for parallel scan, because in
701 		 * such a case this path will be rejected.  So there is no benefit in
702 		 * doing extra computation.
703 		 */
704 		if (path->path.parallel_workers <= 0)
705 			return;
706 
707 		path->path.parallel_aware = true;
708 	}
709 
710 	/*
711 	 * Now interpolate based on estimated index order correlation to get total
712 	 * disk I/O cost for main table accesses.
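	 * For example (illustrative): a correlation of 0.5 gives csquared = 0.25,
	 * so the charge lands a quarter of the way from max_IO_cost toward
	 * min_IO_cost.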
713 	 */
714 	csquared = indexCorrelation * indexCorrelation;
715 
716 	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
717 
718 	/*
719 	 * Estimate CPU costs per tuple.
720 	 *
721 	 * What we want here is cpu_tuple_cost plus the evaluation costs of any
722 	 * qual clauses that we have to evaluate as qpquals.
723 	 */
724 	cost_qual_eval(&qpqual_cost, qpquals, root);
725 
726 	startup_cost += qpqual_cost.startup;
727 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
728 
729 	cpu_run_cost += cpu_per_tuple * tuples_fetched;
730 
731 	/* tlist eval costs are paid per output row, not per tuple scanned */
732 	startup_cost += path->path.pathtarget->cost.startup;
733 	cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
734 
735 	/* Adjust costing for parallelism, if used. */
736 	if (path->path.parallel_workers > 0)
737 	{
738 		double		parallel_divisor = get_parallel_divisor(&path->path);
739 
740 		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
741 
742 		/* The CPU cost is divided among all the workers. */
743 		cpu_run_cost /= parallel_divisor;
744 	}
745 
746 	run_cost += cpu_run_cost;
747 
748 	path->path.startup_cost = startup_cost;
749 	path->path.total_cost = startup_cost + run_cost;
750 }
751 
752 /*
753  * extract_nonindex_conditions
754  *
755  * Given a list of quals to be enforced in an indexscan, extract the ones that
756  * will have to be applied as qpquals (ie, the index machinery won't handle
757  * them).  The actual rules for this appear in create_indexscan_plan() in
758  * createplan.c, but the full rules are fairly expensive and we don't want to
759  * go to that much effort for index paths that don't get selected for the
760  * final plan.  So we approximate it as quals that don't appear directly in
761  * indexquals and also are not redundant children of the same EquivalenceClass
762  * as some indexqual.  This method neglects some infrequently-relevant
763  * considerations, specifically clauses that needn't be checked because they
764  * are implied by an indexqual.  It does not seem worth the cycles to try to
765  * factor that in at this stage, even though createplan.c will take pains to
766  * remove such unnecessary clauses from the qpquals list if this path is
767  * selected for use.
768  */
769 static List *
770 extract_nonindex_conditions(List *qual_clauses, List *indexquals)
771 {
772 	List	   *result = NIL;
773 	ListCell   *lc;
774 
775 	foreach(lc, qual_clauses)
776 	{
777 		RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
778 
779 		if (rinfo->pseudoconstant)
780 			continue;			/* we may drop pseudoconstants here */
781 		if (list_member_ptr(indexquals, rinfo))
782 			continue;			/* simple duplicate */
783 		if (is_redundant_derived_clause(rinfo, indexquals))
784 			continue;			/* derived from same EquivalenceClass */
785 		/* ... skip the predicate proof attempt createplan.c will try ... */
786 		result = lappend(result, rinfo);
787 	}
788 	return result;
789 }
790 
791 /*
792  * index_pages_fetched
793  *	  Estimate the number of pages actually fetched after accounting for
794  *	  cache effects.
795  *
796  * We use an approximation proposed by Mackert and Lohman, "Index Scans
797  * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
798  * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
799  * The Mackert and Lohman approximation is that the number of pages
800  * fetched is
801  *	PF =
802  *		min(2TNs/(2T+Ns), T)			when T <= b
803  *		2TNs/(2T+Ns)					when T > b and Ns <= 2Tb/(2T-b)
804  *		b + (Ns - 2Tb/(2T-b))*(T-b)/T	when T > b and Ns > 2Tb/(2T-b)
805  * where
806  *		T = # pages in table
807  *		N = # tuples in table
808  *		s = selectivity = fraction of table to be scanned
809  *		b = # buffer pages available (we include kernel space here)
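 * For example (purely illustrative numbers): with T = 10000, b = 1000 and
 * Ns = 5000, we have T > b and Ns > 2Tb/(2T-b) ~= 1053, giving
 *	PF = 1000 + (5000 - 1053) * (10000 - 1000) / 10000 ~= 4553 pages.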
810  *
811  * We assume that effective_cache_size is the total number of buffer pages
812  * available for the whole query, and pro-rate that space across all the
813  * tables in the query and the index currently under consideration.  (This
814  * ignores space needed for other indexes used by the query, but since we
815  * don't know which indexes will get used, we can't estimate that very well;
816  * and in any case counting all the tables may well be an overestimate, since
817  * depending on the join plan not all the tables may be scanned concurrently.)
818  *
819  * The product Ns is the number of tuples fetched; we pass in that
820  * product rather than calculating it here.  "pages" is the number of pages
821  * in the object under consideration (either an index or a table).
822  * "index_pages" is the amount to add to the total table space, which was
823  * computed for us by query_planner.
824  *
825  * Caller is expected to have ensured that tuples_fetched is greater than zero
826  * and rounded to integer (see clamp_row_est).  The result will likewise be
827  * greater than zero and integral.
828  */
829 double
830 index_pages_fetched(double tuples_fetched, BlockNumber pages,
831 					double index_pages, PlannerInfo *root)
832 {
833 	double		pages_fetched;
834 	double		total_pages;
835 	double		T,
836 				b;
837 
838 	/* T is # pages in table, but don't allow it to be zero */
839 	T = (pages > 1) ? (double) pages : 1.0;
840 
841 	/* Compute number of pages assumed to be competing for cache space */
842 	total_pages = root->total_table_pages + index_pages;
843 	total_pages = Max(total_pages, 1.0);
844 	Assert(T <= total_pages);
845 
846 	/* b is pro-rated share of effective_cache_size */
847 	b = (double) effective_cache_size * T / total_pages;
848 
849 	/* force it positive and integral */
850 	if (b <= 1.0)
851 		b = 1.0;
852 	else
853 		b = ceil(b);
854 
855 	/* This part is the Mackert and Lohman formula */
856 	if (T <= b)
857 	{
858 		pages_fetched =
859 			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
860 		if (pages_fetched >= T)
861 			pages_fetched = T;
862 		else
863 			pages_fetched = ceil(pages_fetched);
864 	}
865 	else
866 	{
867 		double		lim;
868 
869 		lim = (2.0 * T * b) / (2.0 * T - b);
870 		if (tuples_fetched <= lim)
871 		{
872 			pages_fetched =
873 				(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
874 		}
875 		else
876 		{
877 			pages_fetched =
878 				b + (tuples_fetched - lim) * (T - b) / T;
879 		}
880 		pages_fetched = ceil(pages_fetched);
881 	}
882 	return pages_fetched;
883 }
884 
885 /*
886  * get_indexpath_pages
887  *		Determine the total size of the indexes used in a bitmap index path.
888  *
889  * Note: if the same index is used more than once in a bitmap tree, we will
890  * count it multiple times, which perhaps is the wrong thing ... but it's
891  * not completely clear, and detecting duplicates is difficult, so ignore it
892  * for now.
893  */
894 static double
895 get_indexpath_pages(Path *bitmapqual)
896 {
897 	double		result = 0;
898 	ListCell   *l;
899 
900 	if (IsA(bitmapqual, BitmapAndPath))
901 	{
902 		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
903 
904 		foreach(l, apath->bitmapquals)
905 		{
906 			result += get_indexpath_pages((Path *) lfirst(l));
907 		}
908 	}
909 	else if (IsA(bitmapqual, BitmapOrPath))
910 	{
911 		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
912 
913 		foreach(l, opath->bitmapquals)
914 		{
915 			result += get_indexpath_pages((Path *) lfirst(l));
916 		}
917 	}
918 	else if (IsA(bitmapqual, IndexPath))
919 	{
920 		IndexPath  *ipath = (IndexPath *) bitmapqual;
921 
922 		result = (double) ipath->indexinfo->pages;
923 	}
924 	else
925 		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
926 
927 	return result;
928 }
929 
930 /*
931  * cost_bitmap_heap_scan
932  *	  Determines and returns the cost of scanning a relation using a bitmap
933  *	  index-then-heap plan.
934  *
935  * 'baserel' is the relation to be scanned
936  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
937  * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
938  * 'loop_count' is the number of repetitions of the indexscan to factor into
939  *		estimates of caching behavior
940  *
941  * Note: the component IndexPaths in bitmapqual should have been costed
942  * using the same loop_count.
943  */
944 void
945 cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
946 					  ParamPathInfo *param_info,
947 					  Path *bitmapqual, double loop_count)
948 {
949 	Cost		startup_cost = 0;
950 	Cost		run_cost = 0;
951 	Cost		indexTotalCost;
952 	QualCost	qpqual_cost;
953 	Cost		cpu_per_tuple;
954 	Cost		cost_per_page;
955 	Cost		cpu_run_cost;
956 	double		tuples_fetched;
957 	double		pages_fetched;
958 	double		spc_seq_page_cost,
959 				spc_random_page_cost;
960 	double		T;
961 
962 	/* Should only be applied to base relations */
963 	Assert(IsA(baserel, RelOptInfo));
964 	Assert(baserel->relid > 0);
965 	Assert(baserel->rtekind == RTE_RELATION);
966 
967 	/* Mark the path with the correct row estimate */
968 	if (param_info)
969 		path->rows = param_info->ppi_rows;
970 	else
971 		path->rows = baserel->rows;
972 
973 	if (!enable_bitmapscan)
974 		startup_cost += disable_cost;
975 
976 	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
977 										 loop_count, &indexTotalCost,
978 										 &tuples_fetched);
979 
980 	startup_cost += indexTotalCost;
981 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
982 
983 	/* Fetch estimated page costs for tablespace containing table. */
984 	get_tablespace_page_costs(baserel->reltablespace,
985 							  &spc_random_page_cost,
986 							  &spc_seq_page_cost);
987 
988 	/*
989 	 * For small numbers of pages we should charge spc_random_page_cost
990 	 * apiece, while if nearly all the table's pages are being read, it's more
991 	 * appropriate to charge spc_seq_page_cost apiece.  The effect is
992 	 * nonlinear, too. For lack of a better idea, interpolate like this to
993 	 * determine the cost per page.
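	 * For example (illustrative): if a quarter of the table's pages are
	 * fetched, sqrt(0.25) = 0.5 and the cost per page lands midway between
	 * spc_random_page_cost and spc_seq_page_cost.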
994 	 */
995 	if (pages_fetched >= 2.0)
996 		cost_per_page = spc_random_page_cost -
997 			(spc_random_page_cost - spc_seq_page_cost)
998 			* sqrt(pages_fetched / T);
999 	else
1000 		cost_per_page = spc_random_page_cost;
1001 
1002 	run_cost += pages_fetched * cost_per_page;
1003 
1004 	/*
1005 	 * Estimate CPU costs per tuple.
1006 	 *
1007 	 * Often the indexquals don't need to be rechecked at each tuple ... but
1008 	 * not always, especially not if there are enough tuples involved that the
1009 	 * bitmaps become lossy.  For the moment, just assume they will be
1010 	 * rechecked always.  This means we charge the full freight for all the
1011 	 * scan clauses.
1012 	 */
1013 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1014 
1015 	startup_cost += qpqual_cost.startup;
1016 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1017 	cpu_run_cost = cpu_per_tuple * tuples_fetched;
1018 
1019 	/* Adjust costing for parallelism, if used. */
1020 	if (path->parallel_workers > 0)
1021 	{
1022 		double		parallel_divisor = get_parallel_divisor(path);
1023 
1024 		/* The CPU cost is divided among all the workers. */
1025 		cpu_run_cost /= parallel_divisor;
1026 
1027 		path->rows = clamp_row_est(path->rows / parallel_divisor);
1028 	}
1029 
1030 
1031 	run_cost += cpu_run_cost;
1032 
1033 	/* tlist eval costs are paid per output row, not per tuple scanned */
1034 	startup_cost += path->pathtarget->cost.startup;
1035 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1036 
1037 	path->startup_cost = startup_cost;
1038 	path->total_cost = startup_cost + run_cost;
1039 }
1040 
1041 /*
1042  * cost_bitmap_tree_node
1043  *		Extract cost and selectivity from a bitmap tree node (index/and/or)
1044  */
1045 void
1046 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1047 {
1048 	if (IsA(path, IndexPath))
1049 	{
1050 		*cost = ((IndexPath *) path)->indextotalcost;
1051 		*selec = ((IndexPath *) path)->indexselectivity;
1052 
1053 		/*
1054 		 * Charge a small amount per retrieved tuple to reflect the costs of
1055 		 * manipulating the bitmap.  This is mostly to make sure that a bitmap
1056 		 * scan doesn't look to be the same cost as an indexscan to retrieve a
1057 		 * single tuple.
1058 		 */
1059 		*cost += 0.1 * cpu_operator_cost * path->rows;
1060 	}
1061 	else if (IsA(path, BitmapAndPath))
1062 	{
1063 		*cost = path->total_cost;
1064 		*selec = ((BitmapAndPath *) path)->bitmapselectivity;
1065 	}
1066 	else if (IsA(path, BitmapOrPath))
1067 	{
1068 		*cost = path->total_cost;
1069 		*selec = ((BitmapOrPath *) path)->bitmapselectivity;
1070 	}
1071 	else
1072 	{
1073 		elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1074 		*cost = *selec = 0;		/* keep compiler quiet */
1075 	}
1076 }
1077 
1078 /*
1079  * cost_bitmap_and_node
1080  *		Estimate the cost of a BitmapAnd node
1081  *
1082  * Note that this considers only the costs of index scanning and bitmap
1083  * creation, not the eventual heap access.  In that sense the object isn't
1084  * truly a Path, but it has enough path-like properties (costs in particular)
1085  * to warrant treating it as one.  We don't bother to set the path rows field,
1086  * however.
1087  */
1088 void
1089 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1090 {
1091 	Cost		totalCost;
1092 	Selectivity selec;
1093 	ListCell   *l;
1094 
1095 	/*
1096 	 * We estimate AND selectivity on the assumption that the inputs are
1097 	 * independent.  This is probably often wrong, but we don't have the info
1098 	 * to do better.
1099 	 *
1100 	 * The runtime cost of the BitmapAnd itself is estimated at 100x
1101 	 * cpu_operator_cost for each tbm_intersect needed.  Probably too small,
1102 	 * definitely too simplistic?
1103 	 */
1104 	totalCost = 0.0;
1105 	selec = 1.0;
1106 	foreach(l, path->bitmapquals)
1107 	{
1108 		Path	   *subpath = (Path *) lfirst(l);
1109 		Cost		subCost;
1110 		Selectivity subselec;
1111 
1112 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1113 
1114 		selec *= subselec;
1115 
1116 		totalCost += subCost;
1117 		if (l != list_head(path->bitmapquals))
1118 			totalCost += 100.0 * cpu_operator_cost;
1119 	}
1120 	path->bitmapselectivity = selec;
1121 	path->path.rows = 0;		/* per above, not used */
1122 	path->path.startup_cost = totalCost;
1123 	path->path.total_cost = totalCost;
1124 }
1125 
1126 /*
1127  * cost_bitmap_or_node
1128  *		Estimate the cost of a BitmapOr node
1129  *
1130  * See comments for cost_bitmap_and_node.
1131  */
1132 void
1133 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1134 {
1135 	Cost		totalCost;
1136 	Selectivity selec;
1137 	ListCell   *l;
1138 
1139 	/*
1140 	 * We estimate OR selectivity on the assumption that the inputs are
1141 	 * non-overlapping, since that's often the case in "x IN (list)" type
1142 	 * situations.  Of course, we clamp to 1.0 at the end.
1143 	 *
1144 	 * The runtime cost of the BitmapOr itself is estimated at 100x
1145 	 * cpu_operator_cost for each tbm_union needed.  Probably too small,
1146 	 * definitely too simplistic?  We are aware that the tbm_unions are
1147 	 * optimized out when the inputs are BitmapIndexScans.
1148 	 */
1149 	totalCost = 0.0;
1150 	selec = 0.0;
1151 	foreach(l, path->bitmapquals)
1152 	{
1153 		Path	   *subpath = (Path *) lfirst(l);
1154 		Cost		subCost;
1155 		Selectivity subselec;
1156 
1157 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1158 
1159 		selec += subselec;
1160 
1161 		totalCost += subCost;
1162 		if (l != list_head(path->bitmapquals) &&
1163 			!IsA(subpath, IndexPath))
1164 			totalCost += 100.0 * cpu_operator_cost;
1165 	}
1166 	path->bitmapselectivity = Min(selec, 1.0);
1167 	path->path.rows = 0;		/* per above, not used */
1168 	path->path.startup_cost = totalCost;
1169 	path->path.total_cost = totalCost;
1170 }
1171 
1172 /*
1173  * cost_tidscan
1174  *	  Determines and returns the cost of scanning a relation using TIDs.
1175  *
1176  * 'baserel' is the relation to be scanned
1177  * 'tidquals' is the list of TID-checkable quals
1178  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1179  */
1180 void
1181 cost_tidscan(Path *path, PlannerInfo *root,
1182 			 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1183 {
1184 	Cost		startup_cost = 0;
1185 	Cost		run_cost = 0;
1186 	bool		isCurrentOf = false;
1187 	QualCost	qpqual_cost;
1188 	Cost		cpu_per_tuple;
1189 	QualCost	tid_qual_cost;
1190 	int			ntuples;
1191 	ListCell   *l;
1192 	double		spc_random_page_cost;
1193 
1194 	/* Should only be applied to base relations */
1195 	Assert(baserel->relid > 0);
1196 	Assert(baserel->rtekind == RTE_RELATION);
1197 
1198 	/* Mark the path with the correct row estimate */
1199 	if (param_info)
1200 		path->rows = param_info->ppi_rows;
1201 	else
1202 		path->rows = baserel->rows;
1203 
1204 	/* Count how many tuples we expect to retrieve */
1205 	ntuples = 0;
1206 	foreach(l, tidquals)
1207 	{
1208 		if (IsA(lfirst(l), ScalarArrayOpExpr))
1209 		{
1210 			/* Each element of the array yields 1 tuple */
1211 			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) lfirst(l);
1212 			Node	   *arraynode = (Node *) lsecond(saop->args);
1213 
1214 			ntuples += estimate_array_length(arraynode);
1215 		}
1216 		else if (IsA(lfirst(l), CurrentOfExpr))
1217 		{
1218 			/* CURRENT OF yields 1 tuple */
1219 			isCurrentOf = true;
1220 			ntuples++;
1221 		}
1222 		else
1223 		{
1224 			/* It's just CTID = something, count 1 tuple */
1225 			ntuples++;
1226 		}
1227 	}
1228 
1229 	/*
1230 	 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1231 	 * understands how to do it correctly.  Therefore, honor enable_tidscan
1232 	 * only when CURRENT OF isn't present.  Also note that cost_qual_eval
1233 	 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1234 	 * subtract off here; that's to prevent other plan types such as seqscan
1235 	 * from winning.
1236 	 */
1237 	if (isCurrentOf)
1238 	{
1239 		Assert(baserel->baserestrictcost.startup >= disable_cost);
1240 		startup_cost -= disable_cost;
1241 	}
1242 	else if (!enable_tidscan)
1243 		startup_cost += disable_cost;
1244 
1245 	/*
1246 	 * The TID qual expressions will be computed once, any other baserestrict
1247 	 * quals once per retrieved tuple.
1248 	 */
1249 	cost_qual_eval(&tid_qual_cost, tidquals, root);
1250 
1251 	/* fetch estimated page cost for tablespace containing table */
1252 	get_tablespace_page_costs(baserel->reltablespace,
1253 							  &spc_random_page_cost,
1254 							  NULL);
1255 
1256 	/* disk costs --- assume each tuple on a different page */
1257 	run_cost += spc_random_page_cost * ntuples;
1258 
1259 	/* Add scanning CPU costs */
1260 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1261 
1262 	/* XXX currently we assume TID quals are a subset of qpquals */
1263 	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1264 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1265 		tid_qual_cost.per_tuple;
1266 	run_cost += cpu_per_tuple * ntuples;
1267 
1268 	/* tlist eval costs are paid per output row, not per tuple scanned */
1269 	startup_cost += path->pathtarget->cost.startup;
1270 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1271 
1272 	path->startup_cost = startup_cost;
1273 	path->total_cost = startup_cost + run_cost;
1274 }
1275 
1276 /*
1277  * cost_subqueryscan
1278  *	  Determines and returns the cost of scanning a subquery RTE.
1279  *
1280  * 'baserel' is the relation to be scanned
1281  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1282  */
1283 void
1284 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1285 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1286 {
1287 	Cost		startup_cost;
1288 	Cost		run_cost;
1289 	QualCost	qpqual_cost;
1290 	Cost		cpu_per_tuple;
1291 
1292 	/* Should only be applied to base relations that are subqueries */
1293 	Assert(baserel->relid > 0);
1294 	Assert(baserel->rtekind == RTE_SUBQUERY);
1295 
1296 	/* Mark the path with the correct row estimate */
1297 	if (param_info)
1298 		path->path.rows = param_info->ppi_rows;
1299 	else
1300 		path->path.rows = baserel->rows;
1301 
1302 	/*
1303 	 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1304 	 * any restriction clauses and tlist that will be attached to the
1305 	 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1306 	 * projection overhead.
1307 	 */
1308 	path->path.startup_cost = path->subpath->startup_cost;
1309 	path->path.total_cost = path->subpath->total_cost;
1310 
1311 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1312 
1313 	startup_cost = qpqual_cost.startup;
1314 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1315 	run_cost = cpu_per_tuple * baserel->tuples;
1316 
1317 	/* tlist eval costs are paid per output row, not per tuple scanned */
1318 	startup_cost += path->path.pathtarget->cost.startup;
1319 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1320 
1321 	path->path.startup_cost += startup_cost;
1322 	path->path.total_cost += startup_cost + run_cost;
1323 }
1324 
1325 /*
1326  * cost_functionscan
1327  *	  Determines and returns the cost of scanning a function RTE.
1328  *
1329  * 'baserel' is the relation to be scanned
1330  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1331  */
1332 void
1333 cost_functionscan(Path *path, PlannerInfo *root,
1334 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1335 {
1336 	Cost		startup_cost = 0;
1337 	Cost		run_cost = 0;
1338 	QualCost	qpqual_cost;
1339 	Cost		cpu_per_tuple;
1340 	RangeTblEntry *rte;
1341 	QualCost	exprcost;
1342 
1343 	/* Should only be applied to base relations that are functions */
1344 	Assert(baserel->relid > 0);
1345 	rte = planner_rt_fetch(baserel->relid, root);
1346 	Assert(rte->rtekind == RTE_FUNCTION);
1347 
1348 	/* Mark the path with the correct row estimate */
1349 	if (param_info)
1350 		path->rows = param_info->ppi_rows;
1351 	else
1352 		path->rows = baserel->rows;
1353 
1354 	/*
1355 	 * Estimate costs of executing the function expression(s).
1356 	 *
1357 	 * Currently, nodeFunctionscan.c always executes the functions to
1358 	 * completion before returning any rows, and caches the results in a
1359 	 * tuplestore.  So the function eval cost is all startup cost, and per-row
1360 	 * costs are minimal.
1361 	 *
1362 	 * XXX in principle we ought to charge tuplestore spill costs if the
1363 	 * number of rows is large.  However, given how phony our rowcount
1364 	 * estimates for functions tend to be, there's not a lot of point in that
1365 	 * refinement right now.
1366 	 */
1367 	cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1368 
1369 	startup_cost += exprcost.startup + exprcost.per_tuple;
1370 
1371 	/* Add scanning CPU costs */
1372 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1373 
1374 	startup_cost += qpqual_cost.startup;
1375 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1376 	run_cost += cpu_per_tuple * baserel->tuples;
1377 
1378 	/* tlist eval costs are paid per output row, not per tuple scanned */
1379 	startup_cost += path->pathtarget->cost.startup;
1380 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1381 
1382 	path->startup_cost = startup_cost;
1383 	path->total_cost = startup_cost + run_cost;
1384 }
1385 
1386 /*
1387  * cost_tablefuncscan
1388  *	  Determines and returns the cost of scanning a table function.
1389  *
1390  * 'baserel' is the relation to be scanned
1391  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1392  */
1393 void
1394 cost_tablefuncscan(Path *path, PlannerInfo *root,
1395 				   RelOptInfo *baserel, ParamPathInfo *param_info)
1396 {
1397 	Cost		startup_cost = 0;
1398 	Cost		run_cost = 0;
1399 	QualCost	qpqual_cost;
1400 	Cost		cpu_per_tuple;
1401 	RangeTblEntry *rte;
1402 	QualCost	exprcost;
1403 
1404 	/* Should only be applied to base relations that are functions */
1405 	Assert(baserel->relid > 0);
1406 	rte = planner_rt_fetch(baserel->relid, root);
1407 	Assert(rte->rtekind == RTE_TABLEFUNC);
1408 
1409 	/* Mark the path with the correct row estimate */
1410 	if (param_info)
1411 		path->rows = param_info->ppi_rows;
1412 	else
1413 		path->rows = baserel->rows;
1414 
1415 	/*
1416 	 * Estimate costs of executing the table func expression(s).
1417 	 *
1418 	 * XXX in principle we ought to charge tuplestore spill costs if the
1419 	 * number of rows is large.  However, given how phony our rowcount
1420 	 * estimates for tablefuncs tend to be, there's not a lot of point in that
1421 	 * refinement right now.
1422 	 */
1423 	cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1424 
1425 	startup_cost += exprcost.startup + exprcost.per_tuple;
1426 
1427 	/* Add scanning CPU costs */
1428 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1429 
1430 	startup_cost += qpqual_cost.startup;
1431 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1432 	run_cost += cpu_per_tuple * baserel->tuples;
1433 
1434 	/* tlist eval costs are paid per output row, not per tuple scanned */
1435 	startup_cost += path->pathtarget->cost.startup;
1436 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1437 
1438 	path->startup_cost = startup_cost;
1439 	path->total_cost = startup_cost + run_cost;
1440 }
1441 
1442 /*
1443  * cost_valuesscan
1444  *	  Determines and returns the cost of scanning a VALUES RTE.
1445  *
1446  * 'baserel' is the relation to be scanned
1447  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1448  */
1449 void
1450 cost_valuesscan(Path *path, PlannerInfo *root,
1451 				RelOptInfo *baserel, ParamPathInfo *param_info)
1452 {
1453 	Cost		startup_cost = 0;
1454 	Cost		run_cost = 0;
1455 	QualCost	qpqual_cost;
1456 	Cost		cpu_per_tuple;
1457 
1458 	/* Should only be applied to base relations that are values lists */
1459 	Assert(baserel->relid > 0);
1460 	Assert(baserel->rtekind == RTE_VALUES);
1461 
1462 	/* Mark the path with the correct row estimate */
1463 	if (param_info)
1464 		path->rows = param_info->ppi_rows;
1465 	else
1466 		path->rows = baserel->rows;
1467 
1468 	/*
1469 	 * For now, estimate list evaluation cost at one operator eval per list
1470 	 * (probably pretty bogus, but is it worth being smarter?)
1471 	 */
1472 	cpu_per_tuple = cpu_operator_cost;
1473 
1474 	/* Add scanning CPU costs */
1475 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1476 
1477 	startup_cost += qpqual_cost.startup;
1478 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1479 	run_cost += cpu_per_tuple * baserel->tuples;
1480 
1481 	/* tlist eval costs are paid per output row, not per tuple scanned */
1482 	startup_cost += path->pathtarget->cost.startup;
1483 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1484 
1485 	path->startup_cost = startup_cost;
1486 	path->total_cost = startup_cost + run_cost;
1487 }
1488 
1489 /*
1490  * cost_ctescan
1491  *	  Determines and returns the cost of scanning a CTE RTE.
1492  *
1493  * Note: this is used for both self-reference and regular CTEs; the
1494  * possible cost differences are below the threshold of what we could
1495  * estimate accurately anyway.  Note that the costs of evaluating the
1496  * referenced CTE query are added into the final plan as initplan costs,
1497  * and should NOT be counted here.
1498  */
1499 void
1500 cost_ctescan(Path *path, PlannerInfo *root,
1501 			 RelOptInfo *baserel, ParamPathInfo *param_info)
1502 {
1503 	Cost		startup_cost = 0;
1504 	Cost		run_cost = 0;
1505 	QualCost	qpqual_cost;
1506 	Cost		cpu_per_tuple;
1507 
1508 	/* Should only be applied to base relations that are CTEs */
1509 	Assert(baserel->relid > 0);
1510 	Assert(baserel->rtekind == RTE_CTE);
1511 
1512 	/* Mark the path with the correct row estimate */
1513 	if (param_info)
1514 		path->rows = param_info->ppi_rows;
1515 	else
1516 		path->rows = baserel->rows;
1517 
1518 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1519 	cpu_per_tuple = cpu_tuple_cost;
1520 
1521 	/* Add scanning CPU costs */
1522 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1523 
1524 	startup_cost += qpqual_cost.startup;
1525 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1526 	run_cost += cpu_per_tuple * baserel->tuples;
1527 
1528 	/* tlist eval costs are paid per output row, not per tuple scanned */
1529 	startup_cost += path->pathtarget->cost.startup;
1530 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1531 
1532 	path->startup_cost = startup_cost;
1533 	path->total_cost = startup_cost + run_cost;
1534 }
1535 
1536 /*
1537  * cost_namedtuplestorescan
1538  *	  Determines and returns the cost of scanning a named tuplestore.
1539  */
1540 void
1541 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1542 						 RelOptInfo *baserel, ParamPathInfo *param_info)
1543 {
1544 	Cost		startup_cost = 0;
1545 	Cost		run_cost = 0;
1546 	QualCost	qpqual_cost;
1547 	Cost		cpu_per_tuple;
1548 
1549 	/* Should only be applied to base relations that are Tuplestores */
1550 	Assert(baserel->relid > 0);
1551 	Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1552 
1553 	/* Mark the path with the correct row estimate */
1554 	if (param_info)
1555 		path->rows = param_info->ppi_rows;
1556 	else
1557 		path->rows = baserel->rows;
1558 
1559 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1560 	cpu_per_tuple = cpu_tuple_cost;
1561 
1562 	/* Add scanning CPU costs */
1563 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1564 
1565 	startup_cost += qpqual_cost.startup;
1566 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1567 	run_cost += cpu_per_tuple * baserel->tuples;
1568 
1569 	path->startup_cost = startup_cost;
1570 	path->total_cost = startup_cost + run_cost;
1571 }
1572 
1573 /*
1574  * cost_recursive_union
1575  *	  Determines and returns the cost of performing a recursive union,
1576  *	  and also the estimated output size.
1577  *
1578  * We are given Paths for the nonrecursive and recursive terms.
1579  */
1580 void
1581 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1582 {
1583 	Cost		startup_cost;
1584 	Cost		total_cost;
1585 	double		total_rows;
1586 
1587 	/* We probably have decent estimates for the non-recursive term */
1588 	startup_cost = nrterm->startup_cost;
1589 	total_cost = nrterm->total_cost;
1590 	total_rows = nrterm->rows;
1591 
1592 	/*
1593 	 * We arbitrarily assume that about 10 recursive iterations will be
1594 	 * needed, and that we've managed to get a good fix on the cost and output
1595 	 * size of each one of them.  These are mighty shaky assumptions but it's
1596 	 * hard to see how to do better.
1597 	 */
1598 	total_cost += 10 * rterm->total_cost;
1599 	total_rows += 10 * rterm->rows;
1600 
1601 	/*
1602 	 * Also charge cpu_tuple_cost per row to account for the costs of
1603 	 * manipulating the tuplestores.  (We don't worry about possible
1604 	 * spill-to-disk costs.)
1605 	 */
1606 	total_cost += cpu_tuple_cost * total_rows;
1607 
1608 	runion->startup_cost = startup_cost;
1609 	runion->total_cost = total_cost;
1610 	runion->rows = total_rows;
1611 	runion->pathtarget->width = Max(nrterm->pathtarget->width,
1612 									rterm->pathtarget->width);
1613 }
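
/*
 * Worked example for cost_recursive_union (hypothetical numbers, assuming
 * the default cpu_tuple_cost of 0.01): if the nonrecursive term costs 100
 * and yields 1000 rows, while the recursive term costs 20 and yields 50
 * rows per iteration, we estimate
 *		total_rows = 1000 + 10 * 50 = 1500
 *		total_cost = 100 + 10 * 20 + 0.01 * 1500 = 315
 * where the factor of 10 is the arbitrary iteration-count guess above.
 */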
1614 
1615 /*
1616  * cost_sort
1617  *	  Determines and returns the cost of sorting a relation, including
1618  *	  the cost of reading the input data.
1619  *
1620  * If the total volume of data to sort is less than sort_mem, we will do
1621  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1622  * comparisons for t tuples.
1623  *
1624  * If the total volume exceeds sort_mem, we switch to a tape-style merge
1625  * algorithm.  There will still be about t*log2(t) tuple comparisons in
1626  * total, but we will also need to write and read each tuple once per
1627  * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
1628  * number of initial runs formed and M is the merge order used by tuplesort.c.
1629  * Since the average initial run should be about sort_mem, we have
1630  *		disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
1631  *		cpu = comparison_cost * t * log2(t)
1632  *
1633  * If the sort is bounded (i.e., only the first k result tuples are needed)
1634  * and k tuples can fit into sort_mem, we use a heap method that keeps only
1635  * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1636  *
1637  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1638  * accesses (XXX can't we refine that guess?)
1639  *
1640  * By default, we charge two operator evals per tuple comparison, which should
1641  * be in the right ballpark in most cases.  The caller can tweak this by
1642  * specifying nonzero comparison_cost; typically that's used for any extra
1643  * work that has to be done to prepare the inputs to the comparison operators.
1644  *
1645  * 'pathkeys' is a list of sort keys
1646  * 'input_cost' is the total cost for reading the input data
1647  * 'tuples' is the number of tuples in the relation
1648  * 'width' is the average tuple width in bytes
1649  * 'comparison_cost' is the extra cost per comparison, if any
1650  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1651  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1652  *
1653  * NOTE: some callers currently pass NIL for pathkeys because they
1654  * can't conveniently supply the sort keys.  Since this routine doesn't
1655  * currently do anything with pathkeys anyway, that doesn't matter...
1656  * but if it ever does, it should react gracefully to lack of key data.
1657  * (Actually, the thing we'd most likely be interested in is just the number
1658  * of sort keys, which all callers *could* supply.)
1659  */
1660 void
1661 cost_sort(Path *path, PlannerInfo *root,
1662 		  List *pathkeys, Cost input_cost, double tuples, int width,
1663 		  Cost comparison_cost, int sort_mem,
1664 		  double limit_tuples)
1665 {
1666 	Cost		startup_cost = input_cost;
1667 	Cost		run_cost = 0;
1668 	double		input_bytes = relation_byte_size(tuples, width);
1669 	double		output_bytes;
1670 	double		output_tuples;
1671 	long		sort_mem_bytes = sort_mem * 1024L;
1672 
1673 	if (!enable_sort)
1674 		startup_cost += disable_cost;
1675 
1676 	path->rows = tuples;
1677 
1678 	/*
1679 	 * We want to be sure the cost of a sort is never estimated as zero, even
1680 	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
1681 	 */
1682 	if (tuples < 2.0)
1683 		tuples = 2.0;
1684 
1685 	/* Include the default cost-per-comparison */
1686 	comparison_cost += 2.0 * cpu_operator_cost;
1687 
1688 	/* Do we have a useful LIMIT? */
1689 	if (limit_tuples > 0 && limit_tuples < tuples)
1690 	{
1691 		output_tuples = limit_tuples;
1692 		output_bytes = relation_byte_size(output_tuples, width);
1693 	}
1694 	else
1695 	{
1696 		output_tuples = tuples;
1697 		output_bytes = input_bytes;
1698 	}
1699 
1700 	if (output_bytes > sort_mem_bytes)
1701 	{
1702 		/*
1703 		 * We'll have to use a disk-based sort of all the tuples
1704 		 */
1705 		double		npages = ceil(input_bytes / BLCKSZ);
1706 		double		nruns = input_bytes / sort_mem_bytes;
1707 		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
1708 		double		log_runs;
1709 		double		npageaccesses;
1710 
1711 		/*
1712 		 * CPU costs
1713 		 *
1714 		 * Assume about N log2 N comparisons
1715 		 */
1716 		startup_cost += comparison_cost * tuples * LOG2(tuples);
1717 
1718 		/* Disk costs */
1719 
1720 		/* Compute logM(r) as log(r) / log(M) */
1721 		if (nruns > mergeorder)
1722 			log_runs = ceil(log(nruns) / log(mergeorder));
1723 		else
1724 			log_runs = 1.0;
1725 		npageaccesses = 2.0 * npages * log_runs;
1726 		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
1727 		startup_cost += npageaccesses *
1728 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
1729 	}
1730 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1731 	{
1732 		/*
1733 		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1734 		 * a total number of tuple comparisons of N log2 K; but the constant
1735 		 * factor is a bit higher than for quicksort.  Tweak it so that the
1736 		 * cost curve is continuous at the crossover point.
1737 		 */
1738 		startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples);
1739 	}
1740 	else
1741 	{
1742 		/* We'll use plain quicksort on all the input tuples */
1743 		startup_cost += comparison_cost * tuples * LOG2(tuples);
1744 	}
1745 
1746 	/*
1747 	 * Also charge a small amount (arbitrarily set equal to operator cost) per
1748 	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
1749 	 * doesn't do qual-checking or projection, so it has less overhead than
1750 	 * most plan nodes.  Note it's correct to use tuples not output_tuples
1751 	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1752 	 * counting the LIMIT otherwise.
1753 	 */
1754 	run_cost += cpu_operator_cost * tuples;
1755 
1756 	path->startup_cost = startup_cost;
1757 	path->total_cost = startup_cost + run_cost;
1758 }
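
/*
 * Worked example for cost_sort's external-sort branch (hypothetical
 * numbers, assuming the defaults seq_page_cost = 1.0, random_page_cost =
 * 4.0, cpu_operator_cost = 0.0025): sorting 1,000,000 tuples whose
 * relation_byte_size() works out to about 100 MB with sort_mem = 4 MB gives
 *		npages = ceil(100 MB / 8 kB) = 12800
 *		nruns  = 100 MB / 4 MB = 25
 * If tuplesort_merge_order() reports a merge order of at least 25, a single
 * merge pass suffices (log_runs = 1), so
 *		npageaccesses = 2 * 12800 * 1 = 25600
 *		disk cost = 25600 * (0.75 * 1.0 + 0.25 * 4.0) = 44800
 *		CPU cost  = (2 * 0.0025) * 1e6 * log2(1e6) ~= 99700
 * all charged to startup, plus cpu_operator_cost per extracted tuple
 * (2500) charged as run cost.
 */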
1759 
1760 /*
1761  * append_nonpartial_cost
1762  *	  Estimate the cost of the non-partial paths in a Parallel Append.
1763  *	  The non-partial paths are assumed to be the first "numpaths" paths
1764  *	  from the subpaths list, and to be in order of decreasing cost.
1765  */
1766 static Cost
1767 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1768 {
1769 	Cost	   *costarr;
1770 	int			arrlen;
1771 	ListCell   *l;
1772 	ListCell   *cell;
1773 	int			i;
1774 	int			path_index;
1775 	int			min_index;
1776 	int			max_index;
1777 
1778 	if (numpaths == 0)
1779 		return 0;
1780 
1781 	/*
1782 	 * Array length is the number of workers or the number of relevant paths,
1783 	 * whichever is less.
1784 	 */
1785 	arrlen = Min(parallel_workers, numpaths);
1786 	costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1787 
1788 	/* The first few paths will each be claimed by a different worker. */
1789 	path_index = 0;
1790 	foreach(cell, subpaths)
1791 	{
1792 		Path	   *subpath = (Path *) lfirst(cell);
1793 
1794 		if (path_index == arrlen)
1795 			break;
1796 		costarr[path_index++] = subpath->total_cost;
1797 	}
1798 
1799 	/*
1800 	 * Since subpaths are sorted by decreasing cost, the last one will have
1801 	 * the minimum cost.
1802 	 */
1803 	min_index = arrlen - 1;
1804 
1805 	/*
1806 	 * For each of the remaining subpaths, add its cost to the array element
1807 	 * with minimum cost.
1808 	 */
1809 	for_each_cell(l, cell)
1810 	{
1811 		Path	   *subpath = (Path *) lfirst(l);
1812 		int			i;
1813 
1814 		/* Consider only the non-partial paths */
1815 		if (path_index++ == numpaths)
1816 			break;
1817 
1818 		costarr[min_index] += subpath->total_cost;
1819 
1820 		/* Update the new min cost array index */
1821 		for (min_index = i = 0; i < arrlen; i++)
1822 		{
1823 			if (costarr[i] < costarr[min_index])
1824 				min_index = i;
1825 		}
1826 	}
1827 
1828 	/* Return the highest cost from the array */
1829 	for (max_index = i = 0; i < arrlen; i++)
1830 	{
1831 		if (costarr[i] > costarr[max_index])
1832 			max_index = i;
1833 	}
1834 
1835 	return costarr[max_index];
1836 }
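
/*
 * Worked example for append_nonpartial_cost (hypothetical costs): with
 * parallel_workers = 3 and five non-partial subpaths whose total costs, in
 * decreasing order, are 40, 30, 20, 10 and 5, the first three are claimed
 * by separate workers, giving costarr = {40, 30, 20}.  The remaining paths
 * are then assigned greedily to the currently cheapest slot: 10 goes to the
 * third slot ({40, 30, 30}) and 5 to the second ({40, 35, 30}).  The
 * returned estimate is the largest slot, 40, i.e. the makespan of this
 * greedy schedule.
 */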
1837 
1838 /*
1839  * cost_append
1840  *	  Determines and returns the cost of an Append node.
1841  */
1842 void
1843 cost_append(AppendPath *apath)
1844 {
1845 	ListCell   *l;
1846 
1847 	apath->path.startup_cost = 0;
1848 	apath->path.total_cost = 0;
1849 
1850 	if (apath->subpaths == NIL)
1851 		return;
1852 
1853 	if (!apath->path.parallel_aware)
1854 	{
1855 		Path	   *subpath = (Path *) linitial(apath->subpaths);
1856 
1857 		/*
1858 		 * Startup cost of non-parallel-aware Append is the startup cost of
1859 		 * first subpath.
1860 		 */
1861 		apath->path.startup_cost = subpath->startup_cost;
1862 
1863 		/* Compute rows and costs as sums of subplan rows and costs. */
1864 		foreach(l, apath->subpaths)
1865 		{
1866 			Path	   *subpath = (Path *) lfirst(l);
1867 
1868 			apath->path.rows += subpath->rows;
1869 			apath->path.total_cost += subpath->total_cost;
1870 		}
1871 	}
1872 	else						/* parallel-aware */
1873 	{
1874 		int			i = 0;
1875 		double		parallel_divisor = get_parallel_divisor(&apath->path);
1876 
1877 		/* Calculate startup cost. */
1878 		foreach(l, apath->subpaths)
1879 		{
1880 			Path	   *subpath = (Path *) lfirst(l);
1881 
1882 			/*
1883 			 * Append will start returning tuples when the child node having
1884 			 * the lowest startup cost is done setting up. We consider only the
1885 			 * first few subplans that immediately get a worker assigned.
1886 			 */
1887 			if (i == 0)
1888 				apath->path.startup_cost = subpath->startup_cost;
1889 			else if (i < apath->path.parallel_workers)
1890 				apath->path.startup_cost = Min(apath->path.startup_cost,
1891 											   subpath->startup_cost);
1892 
1893 			/*
1894 			 * Apply parallel divisor to subpaths.  Scale the number of rows
1895 			 * for each partial subpath based on the ratio of the parallel
1896 			 * divisor originally used for the subpath to the one we adopted.
1897 			 * Also add the cost of partial paths to the total cost, but
1898 			 * ignore non-partial paths for now.
1899 			 */
1900 			if (i < apath->first_partial_path)
1901 				apath->path.rows += subpath->rows / parallel_divisor;
1902 			else
1903 			{
1904 				double		subpath_parallel_divisor;
1905 
1906 				subpath_parallel_divisor = get_parallel_divisor(subpath);
1907 				apath->path.rows += subpath->rows * (subpath_parallel_divisor /
1908 													 parallel_divisor);
1909 				apath->path.total_cost += subpath->total_cost;
1910 			}
1911 
1912 			apath->path.rows = clamp_row_est(apath->path.rows);
1913 
1914 			i++;
1915 		}
1916 
1917 		/* Add cost for non-partial subpaths. */
1918 		apath->path.total_cost +=
1919 			append_nonpartial_cost(apath->subpaths,
1920 								   apath->first_partial_path,
1921 								   apath->path.parallel_workers);
1922 	}
1923 
1924 	/*
1925 	 * Although Append does not do any selection or projection, it's not free;
1926 	 * add a small per-tuple overhead.
1927 	 */
1928 	apath->path.total_cost +=
1929 		cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
1930 }
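
/*
 * Example of the parallel-aware row scaling in cost_append (hypothetical
 * numbers, assuming get_parallel_divisor()'s usual leader-contribution
 * rule): an Append planned with 2 workers has parallel_divisor ~= 2.4.  A
 * non-partial child estimated at 1200 rows then contributes 1200 / 2.4 =
 * 500 rows, while a partial child that was itself planned with 1 worker
 * (divisor ~= 1.7) and estimated at 1000 rows contributes
 * 1000 * (1.7 / 2.4) ~= 708 rows.
 */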
1931 
1932 /*
1933  * cost_merge_append
1934  *	  Determines and returns the cost of a MergeAppend node.
1935  *
1936  * MergeAppend merges several pre-sorted input streams, using a heap that
1937  * at any given instant holds the next tuple from each stream.  If there
1938  * are N streams, we need about N*log2(N) tuple comparisons to construct
1939  * the heap at startup, and then for each output tuple, about log2(N)
1940  * comparisons to replace the top entry.
1941  *
1942  * (The effective value of N will drop once some of the input streams are
1943  * exhausted, but it seems unlikely to be worth trying to account for that.)
1944  *
1945  * The heap is never spilled to disk, since we assume N is not very large.
1946  * So this is much simpler than cost_sort.
1947  *
1948  * As in cost_sort, we charge two operator evals per tuple comparison.
1949  *
1950  * 'pathkeys' is a list of sort keys
1951  * 'n_streams' is the number of input streams
1952  * 'input_startup_cost' is the sum of the input streams' startup costs
1953  * 'input_total_cost' is the sum of the input streams' total costs
1954  * 'tuples' is the number of tuples in all the streams
1955  */
1956 void
1957 cost_merge_append(Path *path, PlannerInfo *root,
1958 				  List *pathkeys, int n_streams,
1959 				  Cost input_startup_cost, Cost input_total_cost,
1960 				  double tuples)
1961 {
1962 	Cost		startup_cost = 0;
1963 	Cost		run_cost = 0;
1964 	Cost		comparison_cost;
1965 	double		N;
1966 	double		logN;
1967 
1968 	/*
1969 	 * Avoid log(0)...
1970 	 */
1971 	N = (n_streams < 2) ? 2.0 : (double) n_streams;
1972 	logN = LOG2(N);
1973 
1974 	/* Assumed cost per tuple comparison */
1975 	comparison_cost = 2.0 * cpu_operator_cost;
1976 
1977 	/* Heap creation cost */
1978 	startup_cost += comparison_cost * N * logN;
1979 
1980 	/* Per-tuple heap maintenance cost */
1981 	run_cost += tuples * comparison_cost * logN;
1982 
1983 	/*
1984 	 * Although MergeAppend does not do any selection or projection, it's not
1985 	 * free; add a small per-tuple overhead.
1986 	 */
1987 	run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
1988 
1989 	path->startup_cost = startup_cost + input_startup_cost;
1990 	path->total_cost = startup_cost + run_cost + input_total_cost;
1991 }
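
/*
 * Worked example for cost_merge_append (hypothetical numbers, assuming the
 * defaults cpu_operator_cost = 0.0025 and cpu_tuple_cost = 0.01): merging
 * N = 4 sorted streams holding 100,000 tuples in total gives
 * comparison_cost = 0.005 and logN = 2, so
 *		heap creation      = 0.005 * 4 * 2        = 0.04   (startup)
 *		heap maintenance   = 100000 * 0.005 * 2   = 1000   (run)
 *		per-tuple overhead = 0.01 * 0.5 * 100000  = 500    (run)
 * taking APPEND_CPU_COST_MULTIPLIER at its usual value of 0.5; all of this
 * is added on top of the input streams' own costs.
 */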
1992 
1993 /*
1994  * cost_material
1995  *	  Determines and returns the cost of materializing a relation, including
1996  *	  the cost of reading the input data.
1997  *
1998  * If the total volume of data to materialize exceeds work_mem, we will need
1999  * to write it to disk, so the cost is much higher in that case.
2000  *
2001  * Note that here we are estimating the costs for the first scan of the
2002  * relation, so the materialization is all overhead --- any savings will
2003  * occur only on rescan, which is estimated in cost_rescan.
2004  */
2005 void
2006 cost_material(Path *path,
2007 			  Cost input_startup_cost, Cost input_total_cost,
2008 			  double tuples, int width)
2009 {
2010 	Cost		startup_cost = input_startup_cost;
2011 	Cost		run_cost = input_total_cost - input_startup_cost;
2012 	double		nbytes = relation_byte_size(tuples, width);
2013 	long		work_mem_bytes = work_mem * 1024L;
2014 
2015 	path->rows = tuples;
2016 
2017 	/*
2018 	 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2019 	 * reflect bookkeeping overhead.  (This rate must be more than what
2020 	 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2021 	 * if it is exactly the same then there will be a cost tie between
2022 	 * nestloop with A outer, materialized B inner and nestloop with B outer,
2023 	 * materialized A inner.  The extra cost ensures we'll prefer
2024 	 * materializing the smaller rel.)	Note that this is normally a good deal
2025 	 * less than cpu_tuple_cost, which is OK because a Material plan node
2026 	 * doesn't do qual-checking or projection, so it's got less overhead than
2027 	 * most plan nodes.
2028 	 */
2029 	run_cost += 2 * cpu_operator_cost * tuples;
2030 
2031 	/*
2032 	 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2033 	 * This cost is assumed to be evenly spread through the plan run phase,
2034 	 * which isn't exactly accurate but our cost model doesn't allow for
2035 	 * nonuniform costs within the run phase.
2036 	 */
2037 	if (nbytes > work_mem_bytes)
2038 	{
2039 		double		npages = ceil(nbytes / BLCKSZ);
2040 
2041 		run_cost += seq_page_cost * npages;
2042 	}
2043 
2044 	path->startup_cost = startup_cost;
2045 	path->total_cost = startup_cost + run_cost;
2046 }
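
/*
 * Worked example for cost_material (hypothetical numbers, assuming the
 * defaults cpu_operator_cost = 0.0025 and seq_page_cost = 1.0):
 * materializing 1,000,000 tuples whose relation_byte_size() works out to
 * about 200 MB with work_mem = 64 MB spills to disk, so on top of the
 * input cost we charge
 *		bookkeeping = 2 * 0.0025 * 1e6          = 5000
 *		disk writes = ceil(200 MB / 8 kB) * 1.0 = 25600
 * both spread over the run phase.
 */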
2047 
2048 /*
2049  * cost_agg
2050  *		Determines and returns the cost of performing an Agg plan node,
2051  *		including the cost of its input.
2052  *
2053  * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2054  * we are using a hashed Agg node just to do grouping).
2055  *
2056  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2057  * are for appropriately-sorted input.
2058  */
2059 void
2060 cost_agg(Path *path, PlannerInfo *root,
2061 		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2062 		 int numGroupCols, double numGroups,
2063 		 List *quals,
2064 		 Cost input_startup_cost, Cost input_total_cost,
2065 		 double input_tuples)
2066 {
2067 	double		output_tuples;
2068 	Cost		startup_cost;
2069 	Cost		total_cost;
2070 	AggClauseCosts dummy_aggcosts;
2071 
2072 	/* Use all-zero per-aggregate costs if NULL is passed */
2073 	if (aggcosts == NULL)
2074 	{
2075 		Assert(aggstrategy == AGG_HASHED);
2076 		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2077 		aggcosts = &dummy_aggcosts;
2078 	}
2079 
2080 	/*
2081 	 * The transCost.per_tuple component of aggcosts should be charged once
2082 	 * per input tuple, corresponding to the costs of evaluating the aggregate
2083 	 * transfns and their input expressions (with any startup cost of course
2084 	 * charged but once).  The finalCost component is charged once per output
2085 	 * tuple, corresponding to the costs of evaluating the finalfns.
2086 	 *
2087 	 * If we are grouping, we charge an additional cpu_operator_cost per
2088 	 * grouping column per input tuple for grouping comparisons.
2089 	 *
2090 	 * We will produce a single output tuple if not grouping, and a tuple per
2091 	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
2092 	 *
2093 	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2094 	 * same total CPU cost, but AGG_SORTED has lower startup cost.  If the
2095 	 * input path is already sorted appropriately, AGG_SORTED should be
2096 	 * preferred (since it has no risk of memory overflow).  This will happen
2097 	 * as long as the computed total costs are indeed exactly equal --- but if
2098 	 * there's roundoff error we might do the wrong thing.  So be sure that
2099 	 * the computations below form the same intermediate values in the same
2100 	 * order.
2101 	 */
2102 	if (aggstrategy == AGG_PLAIN)
2103 	{
2104 		startup_cost = input_total_cost;
2105 		startup_cost += aggcosts->transCost.startup;
2106 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2107 		startup_cost += aggcosts->finalCost;
2108 		/* we aren't grouping */
2109 		total_cost = startup_cost + cpu_tuple_cost;
2110 		output_tuples = 1;
2111 	}
2112 	else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2113 	{
2114 		/* Here we are able to deliver output on-the-fly */
2115 		startup_cost = input_startup_cost;
2116 		total_cost = input_total_cost;
2117 		if (aggstrategy == AGG_MIXED && !enable_hashagg)
2118 		{
2119 			startup_cost += disable_cost;
2120 			total_cost += disable_cost;
2121 		}
2122 		/* calcs phrased this way to match HASHED case, see note above */
2123 		total_cost += aggcosts->transCost.startup;
2124 		total_cost += aggcosts->transCost.per_tuple * input_tuples;
2125 		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2126 		total_cost += aggcosts->finalCost * numGroups;
2127 		total_cost += cpu_tuple_cost * numGroups;
2128 		output_tuples = numGroups;
2129 	}
2130 	else
2131 	{
2132 		/* must be AGG_HASHED */
2133 		startup_cost = input_total_cost;
2134 		if (!enable_hashagg)
2135 			startup_cost += disable_cost;
2136 		startup_cost += aggcosts->transCost.startup;
2137 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2138 		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2139 		total_cost = startup_cost;
2140 		total_cost += aggcosts->finalCost * numGroups;
2141 		total_cost += cpu_tuple_cost * numGroups;
2142 		output_tuples = numGroups;
2143 	}
2144 
2145 	/*
2146 	 * If there are quals (HAVING quals), account for their cost and
2147 	 * selectivity.
2148 	 */
2149 	if (quals)
2150 	{
2151 		QualCost	qual_cost;
2152 
2153 		cost_qual_eval(&qual_cost, quals, root);
2154 		startup_cost += qual_cost.startup;
2155 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2156 
2157 		output_tuples = clamp_row_est(output_tuples *
2158 									  clauselist_selectivity(root,
2159 															 quals,
2160 															 0,
2161 															 JOIN_INNER,
2162 															 NULL));
2163 	}
2164 
2165 	path->rows = output_tuples;
2166 	path->startup_cost = startup_cost;
2167 	path->total_cost = total_cost;
2168 }
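
/*
 * To make the AGG_SORTED-versus-AGG_HASHED note in cost_agg concrete
 * (hypothetical symbolic example): with input startup cost S and total
 * cost T, per-tuple transition cost t, final-function cost f, G groups and
 * g grouping columns, both strategies end up with
 *		total = T + transCost.startup + t * input_tuples
 *				+ cpu_operator_cost * g * input_tuples
 *				+ f * G + cpu_tuple_cost * G
 * but AGG_SORTED's startup cost is only S, whereas AGG_HASHED must pay the
 * whole input and transition cost before emitting its first group.
 */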
2169 
2170 /*
2171  * cost_windowagg
2172  *		Determines and returns the cost of performing a WindowAgg plan node,
2173  *		including the cost of its input.
2174  *
2175  * Input is assumed already properly sorted.
2176  */
2177 void
2178 cost_windowagg(Path *path, PlannerInfo *root,
2179 			   List *windowFuncs, int numPartCols, int numOrderCols,
2180 			   Cost input_startup_cost, Cost input_total_cost,
2181 			   double input_tuples)
2182 {
2183 	Cost		startup_cost;
2184 	Cost		total_cost;
2185 	ListCell   *lc;
2186 
2187 	startup_cost = input_startup_cost;
2188 	total_cost = input_total_cost;
2189 
2190 	/*
2191 	 * Window functions are assumed to cost their stated execution cost, plus
2192 	 * the cost of evaluating their input expressions, per tuple.  Since they
2193 	 * may in fact evaluate their inputs at multiple rows during each cycle,
2194 	 * this could be a drastic underestimate; but without a way to know how
2195 	 * many rows the window function will fetch, it's hard to do better.  In
2196 	 * any case, it's a good estimate for all the built-in window functions,
2197 	 * so we'll just do this for now.
2198 	 */
2199 	foreach(lc, windowFuncs)
2200 	{
2201 		WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2202 		Cost		wfunccost;
2203 		QualCost	argcosts;
2204 
2205 		wfunccost = get_func_cost(wfunc->winfnoid) * cpu_operator_cost;
2206 
2207 		/* also add the input expressions' cost to per-input-row costs */
2208 		cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2209 		startup_cost += argcosts.startup;
2210 		wfunccost += argcosts.per_tuple;
2211 
2212 		/*
2213 		 * Add the filter's cost to per-input-row costs.  XXX We should reduce
2214 		 * input expression costs according to filter selectivity.
2215 		 */
2216 		cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2217 		startup_cost += argcosts.startup;
2218 		wfunccost += argcosts.per_tuple;
2219 
2220 		total_cost += wfunccost * input_tuples;
2221 	}
2222 
2223 	/*
2224 	 * We also charge cpu_operator_cost per grouping column per tuple for
2225 	 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2226 	 * overhead.
2227 	 *
2228 	 * XXX this neglects costs of spooling the data to disk when it overflows
2229 	 * work_mem.  Sooner or later that should get accounted for.
2230 	 */
2231 	total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2232 	total_cost += cpu_tuple_cost * input_tuples;
2233 
2234 	path->rows = input_tuples;
2235 	path->startup_cost = startup_cost;
2236 	path->total_cost = total_cost;
2237 }
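
/*
 * Worked example for cost_windowagg (hypothetical numbers, assuming the
 * defaults cpu_operator_cost = 0.0025 and cpu_tuple_cost = 0.01): a single
 * window function with pg_proc.procost = 1, no argument or FILTER
 * expressions, one PARTITION BY column and one ORDER BY column over
 * 100,000 input rows adds
 *		100000 * (0.0025 + 2 * 0.0025 + 0.01) = 1750
 * to the input's total cost.
 */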
2238 
2239 /*
2240  * cost_group
2241  *		Determines and returns the cost of performing a Group plan node,
2242  *		including the cost of its input.
2243  *
2244  * Note: caller must ensure that input costs are for appropriately-sorted
2245  * input.
2246  */
2247 void
2248 cost_group(Path *path, PlannerInfo *root,
2249 		   int numGroupCols, double numGroups,
2250 		   List *quals,
2251 		   Cost input_startup_cost, Cost input_total_cost,
2252 		   double input_tuples)
2253 {
2254 	double		output_tuples;
2255 	Cost		startup_cost;
2256 	Cost		total_cost;
2257 
2258 	output_tuples = numGroups;
2259 	startup_cost = input_startup_cost;
2260 	total_cost = input_total_cost;
2261 
2262 	/*
2263 	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2264 	 * all columns get compared for most of the tuples.
2265 	 */
2266 	total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2267 
2268 	/*
2269 	 * If there are quals (HAVING quals), account for their cost and
2270 	 * selectivity.
2271 	 */
2272 	if (quals)
2273 	{
2274 		QualCost	qual_cost;
2275 
2276 		cost_qual_eval(&qual_cost, quals, root);
2277 		startup_cost += qual_cost.startup;
2278 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2279 
2280 		output_tuples = clamp_row_est(output_tuples *
2281 									  clauselist_selectivity(root,
2282 															 quals,
2283 															 0,
2284 															 JOIN_INNER,
2285 															 NULL));
2286 	}
2287 
2288 	path->rows = output_tuples;
2289 	path->startup_cost = startup_cost;
2290 	path->total_cost = total_cost;
2291 }
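
/*
 * Worked example for cost_group (hypothetical numbers, assuming the default
 * cpu_operator_cost = 0.0025): grouping 1,000,000 pre-sorted input rows on
 * 3 columns adds
 *		0.0025 * 1000000 * 3 = 7500
 * to the input's total cost, before any HAVING-qual adjustments.
 */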
2292 
2293 /*
2294  * initial_cost_nestloop
2295  *	  Preliminary estimate of the cost of a nestloop join path.
2296  *
2297  * This must quickly produce lower-bound estimates of the path's startup and
2298  * total costs.  If we are unable to eliminate the proposed path from
2299  * consideration using the lower bounds, final_cost_nestloop will be called
2300  * to obtain the final estimates.
2301  *
2302  * The exact division of labor between this function and final_cost_nestloop
2303  * is private to them, and represents a tradeoff between speed of the initial
2304  * estimate and getting a tight lower bound.  We choose to not examine the
2305  * join quals here, since that's by far the most expensive part of the
2306  * calculations.  The end result is that CPU-cost considerations must be
2307  * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2308  * incorporation of the inner path's run cost.
2309  *
2310  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2311  *		other data to be used by final_cost_nestloop
2312  * 'jointype' is the type of join to be performed
2313  * 'outer_path' is the outer input to the join
2314  * 'inner_path' is the inner input to the join
2315  * 'extra' contains miscellaneous information about the join
2316  */
2317 void
2318 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2319 					  JoinType jointype,
2320 					  Path *outer_path, Path *inner_path,
2321 					  JoinPathExtraData *extra)
2322 {
2323 	Cost		startup_cost = 0;
2324 	Cost		run_cost = 0;
2325 	double		outer_path_rows = outer_path->rows;
2326 	Cost		inner_rescan_start_cost;
2327 	Cost		inner_rescan_total_cost;
2328 	Cost		inner_run_cost;
2329 	Cost		inner_rescan_run_cost;
2330 
2331 	/* estimate costs to rescan the inner relation */
2332 	cost_rescan(root, inner_path,
2333 				&inner_rescan_start_cost,
2334 				&inner_rescan_total_cost);
2335 
2336 	/* cost of source data */
2337 
2338 	/*
2339 	 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2340 	 * before we can start returning tuples, so the join's startup cost is
2341 	 * their sum.  We'll also pay the inner path's rescan startup cost
2342 	 * multiple times.
2343 	 */
2344 	startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2345 	run_cost += outer_path->total_cost - outer_path->startup_cost;
2346 	if (outer_path_rows > 1)
2347 		run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2348 
2349 	inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2350 	inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2351 
2352 	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2353 		extra->inner_unique)
2354 	{
2355 		/*
2356 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2357 		 * executor will stop after the first match.
2358 		 *
2359 		 * Getting decent estimates requires inspection of the join quals,
2360 		 * which we choose to postpone to final_cost_nestloop.
2361 		 */
2362 
2363 		/* Save private data for final_cost_nestloop */
2364 		workspace->inner_run_cost = inner_run_cost;
2365 		workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2366 	}
2367 	else
2368 	{
2369 		/* Normal case; we'll scan whole input rel for each outer row */
2370 		run_cost += inner_run_cost;
2371 		if (outer_path_rows > 1)
2372 			run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
2373 	}
2374 
2375 	/* CPU costs left for later */
2376 
2377 	/* Public result fields */
2378 	workspace->startup_cost = startup_cost;
2379 	workspace->total_cost = startup_cost + run_cost;
2380 	/* Save private data for final_cost_nestloop */
2381 	workspace->run_cost = run_cost;
2382 }
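
/*
 * Summarizing initial_cost_nestloop's normal (non-SEMI/ANTI,
 * non-unique-inner) case in shorthand:
 *		startup = outer.startup + inner.startup
 *		run     = outer.run + (outer_rows - 1) * inner_rescan.startup
 *				+ inner.run + (outer_rows - 1) * inner_rescan.run
 * i.e. one full inner scan plus (outer_rows - 1) rescans, with all
 * qual-evaluation CPU costs deferred to final_cost_nestloop.
 */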
2383 
2384 /*
2385  * final_cost_nestloop
2386  *	  Final estimate of the cost and result size of a nestloop join path.
2387  *
2388  * 'path' is already filled in except for the rows and cost fields
2389  * 'workspace' is the result from initial_cost_nestloop
2390  * 'extra' contains miscellaneous information about the join
2391  */
2392 void
2393 final_cost_nestloop(PlannerInfo *root, NestPath *path,
2394 					JoinCostWorkspace *workspace,
2395 					JoinPathExtraData *extra)
2396 {
2397 	Path	   *outer_path = path->outerjoinpath;
2398 	Path	   *inner_path = path->innerjoinpath;
2399 	double		outer_path_rows = outer_path->rows;
2400 	double		inner_path_rows = inner_path->rows;
2401 	Cost		startup_cost = workspace->startup_cost;
2402 	Cost		run_cost = workspace->run_cost;
2403 	Cost		cpu_per_tuple;
2404 	QualCost	restrict_qual_cost;
2405 	double		ntuples;
2406 
2407 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2408 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2409 		outer_path_rows = 1;
2410 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2411 		inner_path_rows = 1;
2412 
2413 	/* Mark the path with the correct row estimate */
2414 	if (path->path.param_info)
2415 		path->path.rows = path->path.param_info->ppi_rows;
2416 	else
2417 		path->path.rows = path->path.parent->rows;
2418 
2419 	/* For partial paths, scale row estimate. */
2420 	if (path->path.parallel_workers > 0)
2421 	{
2422 		double		parallel_divisor = get_parallel_divisor(&path->path);
2423 
2424 		path->path.rows =
2425 			clamp_row_est(path->path.rows / parallel_divisor);
2426 	}
2427 
2428 	/*
2429 	 * We could include disable_cost in the preliminary estimate, but that
2430 	 * would amount to optimizing for the case where the join method is
2431 	 * disabled, which doesn't seem like the way to bet.
2432 	 */
2433 	if (!enable_nestloop)
2434 		startup_cost += disable_cost;
2435 
2436 	/* cost of inner-relation source data (we already dealt with outer rel) */
2437 
2438 	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
2439 		extra->inner_unique)
2440 	{
2441 		/*
2442 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2443 		 * executor will stop after the first match.
2444 		 */
2445 		Cost		inner_run_cost = workspace->inner_run_cost;
2446 		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
2447 		double		outer_matched_rows;
2448 		double		outer_unmatched_rows;
2449 		Selectivity inner_scan_frac;
2450 
2451 		/*
2452 		 * For an outer-rel row that has at least one match, we can expect the
2453 		 * inner scan to stop after a fraction 1/(match_count+1) of the inner
2454 		 * rows, if the matches are evenly distributed.  Since they probably
2455 		 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
2456 		 * that fraction.  (If we used a larger fuzz factor, we'd have to
2457 		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
2458 		 * least 1, no such clamp is needed now.)
2459 		 */
2460 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
2461 		outer_unmatched_rows = outer_path_rows - outer_matched_rows;
2462 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
2463 
2464 		/*
2465 		 * Compute number of tuples processed (not number emitted!).  First,
2466 		 * account for successfully-matched outer rows.
2467 		 */
2468 		ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
2469 
2470 		/*
2471 		 * Now we need to estimate the actual costs of scanning the inner
2472 		 * relation, which may be quite a bit less than N times inner_run_cost
2473 		 * due to early scan stops.  We consider two cases.  If the inner path
2474 		 * is an indexscan using all the joinquals as indexquals, then an
2475 		 * unmatched outer row results in an indexscan returning no rows,
2476 		 * which is probably quite cheap.  Otherwise, the executor will have
2477 		 * to scan the whole inner rel for an unmatched row; not so cheap.
2478 		 */
2479 		if (has_indexed_join_quals(path))
2480 		{
2481 			/*
2482 			 * Successfully-matched outer rows will only require scanning
2483 			 * inner_scan_frac of the inner relation.  In this case, we don't
2484 			 * need to charge the full inner_run_cost even when that's more
2485 			 * than inner_rescan_run_cost, because we can assume that none of
2486 			 * the inner scans ever scan the whole inner relation.  So it's
2487 			 * okay to assume that all the inner scan executions can be
2488 			 * fractions of the full cost, even if materialization is reducing
2489 			 * the rescan cost.  At this writing, it's impossible to get here
2490 			 * for a materialized inner scan, so inner_run_cost and
2491 			 * inner_rescan_run_cost will be the same anyway; but just in
2492 			 * case, use inner_run_cost for the first matched tuple and
2493 			 * inner_rescan_run_cost for additional ones.
2494 			 */
2495 			run_cost += inner_run_cost * inner_scan_frac;
2496 			if (outer_matched_rows > 1)
2497 				run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
2498 
2499 			/*
2500 			 * Add the cost of inner-scan executions for unmatched outer rows.
2501 			 * We estimate this as the same cost as returning the first tuple
2502 			 * of a nonempty scan.  We consider that these are all rescans,
2503 			 * since we used inner_run_cost once already.
2504 			 */
2505 			run_cost += outer_unmatched_rows *
2506 				inner_rescan_run_cost / inner_path_rows;
2507 
2508 			/*
2509 			 * We won't be evaluating any quals at all for unmatched rows, so
2510 			 * don't add them to ntuples.
2511 			 */
2512 		}
2513 		else
2514 		{
2515 			/*
2516 			 * Here, a complicating factor is that rescans may be cheaper than
2517 			 * first scans.  If we never scan all the way to the end of the
2518 			 * inner rel, it might be (depending on the plan type) that we'd
2519 			 * never pay the whole inner first-scan run cost.  However it is
2520 			 * difficult to estimate whether that will happen (and it could
2521 			 * not happen if there are any unmatched outer rows!), so be
2522 			 * conservative and always charge the whole first-scan cost once.
2523 			 * We consider this charge to correspond to the first unmatched
2524 			 * outer row, unless there isn't one in our estimate, in which
2525 			 * case blame it on the first matched row.
2526 			 */
2527 
2528 			/* First, count all unmatched join tuples as being processed */
2529 			ntuples += outer_unmatched_rows * inner_path_rows;
2530 
2531 			/* Now add the forced full scan, and decrement appropriate count */
2532 			run_cost += inner_run_cost;
2533 			if (outer_unmatched_rows >= 1)
2534 				outer_unmatched_rows -= 1;
2535 			else
2536 				outer_matched_rows -= 1;
2537 
2538 			/* Add inner run cost for additional outer tuples having matches */
2539 			if (outer_matched_rows > 0)
2540 				run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
2541 
2542 			/* Add inner run cost for additional unmatched outer tuples */
2543 			if (outer_unmatched_rows > 0)
2544 				run_cost += outer_unmatched_rows * inner_rescan_run_cost;
2545 		}
2546 	}
2547 	else
2548 	{
2549 		/* Normal-case source costs were included in preliminary estimate */
2550 
2551 		/* Compute number of tuples processed (not number emitted!) */
2552 		ntuples = outer_path_rows * inner_path_rows;
2553 	}
2554 
2555 	/* CPU costs */
2556 	cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
2557 	startup_cost += restrict_qual_cost.startup;
2558 	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
2559 	run_cost += cpu_per_tuple * ntuples;
2560 
2561 	/* tlist eval costs are paid per output row, not per tuple scanned */
2562 	startup_cost += path->path.pathtarget->cost.startup;
2563 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
2564 
2565 	path->path.startup_cost = startup_cost;
2566 	path->path.total_cost = startup_cost + run_cost;
2567 }
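
/*
 * Worked example for final_cost_nestloop's SEMI/ANTI early-stop logic
 * (hypothetical numbers): with outer_match_frac = 0.25, match_count = 4,
 * 1000 outer rows and 200 inner rows,
 *		outer_matched_rows   = rint(1000 * 0.25) = 250
 *		outer_unmatched_rows = 750
 *		inner_scan_frac      = 2.0 / (4 + 1)     = 0.4
 * so each matched outer row is assumed to read about 200 * 0.4 = 80 inner
 * tuples before stopping, giving ntuples = 250 * 80 = 20000 for the
 * qual-evaluation charge (plus the unmatched-row handling described above).
 */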
2568 
2569 /*
2570  * initial_cost_mergejoin
2571  *	  Preliminary estimate of the cost of a mergejoin path.
2572  *
2573  * This must quickly produce lower-bound estimates of the path's startup and
2574  * total costs.  If we are unable to eliminate the proposed path from
2575  * consideration using the lower bounds, final_cost_mergejoin will be called
2576  * to obtain the final estimates.
2577  *
2578  * The exact division of labor between this function and final_cost_mergejoin
2579  * is private to them, and represents a tradeoff between speed of the initial
2580  * estimate and getting a tight lower bound.  We choose to not examine the
2581  * join quals here, except for obtaining the scan selectivity estimate which
2582  * is really essential (but fortunately, use of caching keeps the cost of
2583  * getting that down to something reasonable).
2584  * We also assume that cost_sort is cheap enough to use here.
2585  *
2586  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2587  *		other data to be used by final_cost_mergejoin
2588  * 'jointype' is the type of join to be performed
2589  * 'mergeclauses' is the list of joinclauses to be used as merge clauses
2590  * 'outer_path' is the outer input to the join
2591  * 'inner_path' is the inner input to the join
2592  * 'outersortkeys' is the list of sort keys for the outer path
2593  * 'innersortkeys' is the list of sort keys for the inner path
2594  * 'extra' contains miscellaneous information about the join
2595  *
2596  * Note: outersortkeys and innersortkeys should be NIL if no explicit
2597  * sort is needed because the respective source path is already ordered.
2598  */
2599 void
2600 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
2601 					   JoinType jointype,
2602 					   List *mergeclauses,
2603 					   Path *outer_path, Path *inner_path,
2604 					   List *outersortkeys, List *innersortkeys,
2605 					   JoinPathExtraData *extra)
2606 {
2607 	Cost		startup_cost = 0;
2608 	Cost		run_cost = 0;
2609 	double		outer_path_rows = outer_path->rows;
2610 	double		inner_path_rows = inner_path->rows;
2611 	Cost		inner_run_cost;
2612 	double		outer_rows,
2613 				inner_rows,
2614 				outer_skip_rows,
2615 				inner_skip_rows;
2616 	Selectivity outerstartsel,
2617 				outerendsel,
2618 				innerstartsel,
2619 				innerendsel;
2620 	Path		sort_path;		/* dummy for result of cost_sort */
2621 
2622 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2623 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2624 		outer_path_rows = 1;
2625 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2626 		inner_path_rows = 1;
2627 
2628 	/*
2629 	 * A merge join will stop as soon as it exhausts either input stream
2630 	 * (unless it's an outer join, in which case the outer side has to be
2631 	 * scanned all the way anyway).  Estimate fraction of the left and right
2632 	 * inputs that will actually need to be scanned.  Likewise, we can
2633 	 * estimate the number of rows that will be skipped before the first join
2634 	 * pair is found, which should be factored into startup cost. We use only
2635 	 * the first (most significant) merge clause for this purpose. Since
2636 	 * mergejoinscansel() is a fairly expensive computation, we cache the
2637 	 * results in the merge clause RestrictInfo.
2638 	 */
2639 	if (mergeclauses && jointype != JOIN_FULL)
2640 	{
2641 		RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
2642 		List	   *opathkeys;
2643 		List	   *ipathkeys;
2644 		PathKey    *opathkey;
2645 		PathKey    *ipathkey;
2646 		MergeScanSelCache *cache;
2647 
2648 		/* Get the input pathkeys to determine the sort-order details */
2649 		opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
2650 		ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
2651 		Assert(opathkeys);
2652 		Assert(ipathkeys);
2653 		opathkey = (PathKey *) linitial(opathkeys);
2654 		ipathkey = (PathKey *) linitial(ipathkeys);
2655 		/* debugging check */
2656 		if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
2657 			opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
2658 			opathkey->pk_strategy != ipathkey->pk_strategy ||
2659 			opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
2660 			elog(ERROR, "left and right pathkeys do not match in mergejoin");
2661 
2662 		/* Get the selectivity with caching */
2663 		cache = cached_scansel(root, firstclause, opathkey);
2664 
2665 		if (bms_is_subset(firstclause->left_relids,
2666 						  outer_path->parent->relids))
2667 		{
2668 			/* left side of clause is outer */
2669 			outerstartsel = cache->leftstartsel;
2670 			outerendsel = cache->leftendsel;
2671 			innerstartsel = cache->rightstartsel;
2672 			innerendsel = cache->rightendsel;
2673 		}
2674 		else
2675 		{
2676 			/* left side of clause is inner */
2677 			outerstartsel = cache->rightstartsel;
2678 			outerendsel = cache->rightendsel;
2679 			innerstartsel = cache->leftstartsel;
2680 			innerendsel = cache->leftendsel;
2681 		}
2682 		if (jointype == JOIN_LEFT ||
2683 			jointype == JOIN_ANTI)
2684 		{
2685 			outerstartsel = 0.0;
2686 			outerendsel = 1.0;
2687 		}
2688 		else if (jointype == JOIN_RIGHT)
2689 		{
2690 			innerstartsel = 0.0;
2691 			innerendsel = 1.0;
2692 		}
2693 	}
2694 	else
2695 	{
2696 		/* cope with clauseless or full mergejoin */
2697 		outerstartsel = innerstartsel = 0.0;
2698 		outerendsel = innerendsel = 1.0;
2699 	}
2700 
2701 	/*
2702 	 * Convert selectivities to row counts.  We force outer_rows and
2703 	 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
2704 	 */
2705 	outer_skip_rows = rint(outer_path_rows * outerstartsel);
2706 	inner_skip_rows = rint(inner_path_rows * innerstartsel);
2707 	outer_rows = clamp_row_est(outer_path_rows * outerendsel);
2708 	inner_rows = clamp_row_est(inner_path_rows * innerendsel);
2709 
2710 	/* skip rows can become NaN when path rows has become infinite */
2711 	Assert(outer_skip_rows <= outer_rows || isnan(outer_skip_rows));
2712 	Assert(inner_skip_rows <= inner_rows || isnan(inner_skip_rows));
2713 
2714 	/*
2715 	 * Readjust scan selectivities to account for above rounding.  This is
2716 	 * normally an insignificant effect, but when there are only a few rows in
2717 	 * the inputs, failing to do this makes for a large percentage error.
2718 	 */
2719 	outerstartsel = outer_skip_rows / outer_path_rows;
2720 	innerstartsel = inner_skip_rows / inner_path_rows;
2721 	outerendsel = outer_rows / outer_path_rows;
2722 	innerendsel = inner_rows / inner_path_rows;
2723 
2724 	/* start sel can become NaN when path rows has become infinite */
2725 	Assert(outerstartsel <= outerendsel || isnan(outerstartsel));
2726 	Assert(innerstartsel <= innerendsel || isnan(innerstartsel));
2727 
2728 	/* cost of source data */
2729 
2730 	if (outersortkeys)			/* do we need to sort outer? */
2731 	{
2732 		cost_sort(&sort_path,
2733 				  root,
2734 				  outersortkeys,
2735 				  outer_path->total_cost,
2736 				  outer_path_rows,
2737 				  outer_path->pathtarget->width,
2738 				  0.0,
2739 				  work_mem,
2740 				  -1.0);
2741 		startup_cost += sort_path.startup_cost;
2742 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2743 			* outerstartsel;
2744 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
2745 			* (outerendsel - outerstartsel);
2746 	}
2747 	else
2748 	{
2749 		startup_cost += outer_path->startup_cost;
2750 		startup_cost += (outer_path->total_cost - outer_path->startup_cost)
2751 			* outerstartsel;
2752 		run_cost += (outer_path->total_cost - outer_path->startup_cost)
2753 			* (outerendsel - outerstartsel);
2754 	}
2755 
2756 	if (innersortkeys)			/* do we need to sort inner? */
2757 	{
2758 		cost_sort(&sort_path,
2759 				  root,
2760 				  innersortkeys,
2761 				  inner_path->total_cost,
2762 				  inner_path_rows,
2763 				  inner_path->pathtarget->width,
2764 				  0.0,
2765 				  work_mem,
2766 				  -1.0);
2767 		startup_cost += sort_path.startup_cost;
2768 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
2769 			* innerstartsel;
2770 		inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
2771 			* (innerendsel - innerstartsel);
2772 	}
2773 	else
2774 	{
2775 		startup_cost += inner_path->startup_cost;
2776 		startup_cost += (inner_path->total_cost - inner_path->startup_cost)
2777 			* innerstartsel;
2778 		inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
2779 			* (innerendsel - innerstartsel);
2780 	}
2781 
2782 	/*
2783 	 * We can't yet determine whether rescanning occurs, or whether
2784 	 * materialization of the inner input should be done.  The minimum
2785 	 * possible inner input cost, regardless of rescan and materialization
2786 	 * considerations, is inner_run_cost.  We include that in
2787 	 * workspace->total_cost, but not yet in run_cost.
2788 	 */
2789 
2790 	/* CPU costs left for later */
2791 
2792 	/* Public result fields */
2793 	workspace->startup_cost = startup_cost;
2794 	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
2795 	/* Save private data for final_cost_mergejoin */
2796 	workspace->run_cost = run_cost;
2797 	workspace->inner_run_cost = inner_run_cost;
2798 	workspace->outer_rows = outer_rows;
2799 	workspace->inner_rows = inner_rows;
2800 	workspace->outer_skip_rows = outer_skip_rows;
2801 	workspace->inner_skip_rows = inner_skip_rows;
2802 }
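
/*
 * Worked example for initial_cost_mergejoin's selectivity-to-rowcount
 * conversion (hypothetical numbers): if the cached mergejoinscansel()
 * results give outerstartsel = 0.1 and outerendsel = 0.8 for a 10,000-row
 * outer input, then outer_skip_rows = 1000 and outer_rows = 8000; the outer
 * path's run cost is charged only for the (0.8 - 0.1) fraction actually
 * scanned, while the first 10% is folded into startup cost as rows skipped
 * before the first join pair can be produced.
 */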
2803 
2804 /*
2805  * final_cost_mergejoin
2806  *	  Final estimate of the cost and result size of a mergejoin path.
2807  *
2808  * Unlike other costsize functions, this routine makes two actual decisions:
2809  * whether the executor will need to do mark/restore, and whether we should
2810  * materialize the inner path.  It would be logically cleaner to build
2811  * separate paths testing these alternatives, but that would require repeating
2812  * most of the cost calculations, which are not all that cheap.  Since the
2813  * choice will not affect output pathkeys or startup cost, only total cost,
2814  * there is no possibility of wanting to keep more than one path.  So it seems
2815  * best to make the decisions here and record them in the path's
2816  * skip_mark_restore and materialize_inner fields.
2817  *
2818  * Mark/restore overhead is usually required, but can be skipped if we know
2819  * that the executor need find only one match per outer tuple, and that the
2820  * mergeclauses are sufficient to identify a match.
2821  *
2822  * We materialize the inner path if we need mark/restore and either the inner
2823  * path can't support mark/restore, or it's cheaper to use an interposed
2824  * Material node to handle mark/restore.
2825  *
2826  * 'path' is already filled in except for the rows and cost fields and
2827  *		skip_mark_restore and materialize_inner
2828  * 'workspace' is the result from initial_cost_mergejoin
2829  * 'extra' contains miscellaneous information about the join
2830  */
2831 void
2832 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
2833 					 JoinCostWorkspace *workspace,
2834 					 JoinPathExtraData *extra)
2835 {
2836 	Path	   *outer_path = path->jpath.outerjoinpath;
2837 	Path	   *inner_path = path->jpath.innerjoinpath;
2838 	double		inner_path_rows = inner_path->rows;
2839 	List	   *mergeclauses = path->path_mergeclauses;
2840 	List	   *innersortkeys = path->innersortkeys;
2841 	Cost		startup_cost = workspace->startup_cost;
2842 	Cost		run_cost = workspace->run_cost;
2843 	Cost		inner_run_cost = workspace->inner_run_cost;
2844 	double		outer_rows = workspace->outer_rows;
2845 	double		inner_rows = workspace->inner_rows;
2846 	double		outer_skip_rows = workspace->outer_skip_rows;
2847 	double		inner_skip_rows = workspace->inner_skip_rows;
2848 	Cost		cpu_per_tuple,
2849 				bare_inner_cost,
2850 				mat_inner_cost;
2851 	QualCost	merge_qual_cost;
2852 	QualCost	qp_qual_cost;
2853 	double		mergejointuples,
2854 				rescannedtuples;
2855 	double		rescanratio;
2856 
2857 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2858 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2859 		inner_path_rows = 1;
2860 
2861 	/* Mark the path with the correct row estimate */
2862 	if (path->jpath.path.param_info)
2863 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
2864 	else
2865 		path->jpath.path.rows = path->jpath.path.parent->rows;
2866 
2867 	/* For partial paths, scale row estimate. */
2868 	if (path->jpath.path.parallel_workers > 0)
2869 	{
2870 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
2871 
2872 		path->jpath.path.rows =
2873 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
2874 	}
2875 
2876 	/*
2877 	 * We could include disable_cost in the preliminary estimate, but that
2878 	 * would amount to optimizing for the case where the join method is
2879 	 * disabled, which doesn't seem like the way to bet.
2880 	 */
2881 	if (!enable_mergejoin)
2882 		startup_cost += disable_cost;
2883 
2884 	/*
2885 	 * Compute cost of the mergequals and qpquals (other restriction clauses)
2886 	 * separately.
2887 	 */
2888 	cost_qual_eval(&merge_qual_cost, mergeclauses, root);
2889 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
2890 	qp_qual_cost.startup -= merge_qual_cost.startup;
2891 	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
2892 
2893 	/*
2894 	 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2895 	 * executor will stop scanning for matches after the first match.  When
2896 	 * all the joinclauses are merge clauses, this means we don't ever need to
2897 	 * back up the merge, and so we can skip mark/restore overhead.
2898 	 */
2899 	if ((path->jpath.jointype == JOIN_SEMI ||
2900 		 path->jpath.jointype == JOIN_ANTI ||
2901 		 extra->inner_unique) &&
2902 		(list_length(path->jpath.joinrestrictinfo) ==
2903 		 list_length(path->path_mergeclauses)))
2904 		path->skip_mark_restore = true;
2905 	else
2906 		path->skip_mark_restore = false;
2907 
2908 	/*
2909 	 * Get approx # tuples passing the mergequals.  We use approx_tuple_count
2910 	 * here because we need an estimate done with JOIN_INNER semantics.
2911 	 */
2912 	mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
2913 
2914 	/*
2915 	 * When there are equal merge keys in the outer relation, the mergejoin
2916 	 * must rescan any matching tuples in the inner relation. This means
2917 	 * re-fetching inner tuples; we have to estimate how often that happens.
2918 	 *
2919 	 * For regular inner and outer joins, the number of re-fetches can be
2920 	 * estimated approximately as size of merge join output minus size of
2921 	 * inner relation. Assume that the distinct key values are 1, 2, ..., and
2922 	 * denote the number of values of each key in the outer relation as m1,
2923 	 * m2, ...; in the inner relation, n1, n2, ...  Then we have
2924 	 *
2925 	 * size of join = m1 * n1 + m2 * n2 + ...
2926 	 *
2927 	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
2928 	 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
2929 	 * relation
2930 	 *
2931 	 * This equation works correctly for outer tuples having no inner match
2932 	 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
2933 	 * are effectively subtracting those from the number of rescanned tuples,
2934 	 * when we should not.  Can we do better without expensive selectivity
2935 	 * computations?
2936 	 *
2937 	 * The whole issue is moot if we are working from a unique-ified outer
2938 	 * input, or if we know we don't need to mark/restore at all.
2939 	 */
2940 	if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
2941 		rescannedtuples = 0;
2942 	else
2943 	{
2944 		rescannedtuples = mergejointuples - inner_path_rows;
2945 		/* Must clamp because of possible underestimate */
2946 		if (rescannedtuples < 0)
2947 			rescannedtuples = 0;
2948 	}
2949 
2950 	/*
2951 	 * We'll inflate various costs this much to account for rescanning.  Note
2952 	 * that this is to be multiplied by something involving inner_rows, or
2953 	 * another number related to the portion of the inner rel we'll scan.
2954 	 */
2955 	rescanratio = 1.0 + (rescannedtuples / inner_rows);
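
	/*
	 * Illustrative example (made-up numbers): if the outer rel has two rows
	 * with key A and one with key B, while the inner has three A's and four
	 * B's, then size of join = 2*3 + 1*4 = 10 and size of inner = 7, so
	 * rescannedtuples = 10 - 7 = 3 (the three A's re-fetched for the second
	 * outer A).  With inner_rows = 7, rescanratio = 1 + 3/7.
	 */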
2956 
2957 	/*
2958 	 * Decide whether we want to materialize the inner input to shield it from
2959 	 * mark/restore and performing re-fetches.  Our cost model for regular
2960 	 * re-fetches is that a re-fetch costs the same as an original fetch,
2961 	 * which is probably an overestimate; but on the other hand we ignore the
2962 	 * bookkeeping costs of mark/restore.  Not clear if it's worth developing
2963 	 * a more refined model.  So we just need to inflate the inner run cost by
2964 	 * rescanratio.
2965 	 */
2966 	bare_inner_cost = inner_run_cost * rescanratio;
2967 
2968 	/*
2969 	 * When we interpose a Material node the re-fetch cost is assumed to be
2970 	 * just cpu_operator_cost per tuple, independently of the underlying
2971 	 * plan's cost; and we charge an extra cpu_operator_cost per original
2972 	 * fetch as well.  Note that we're assuming the materialize node will
2973 	 * never spill to disk, since it only has to remember tuples back to the
2974 	 * last mark.  (If there are a huge number of duplicates, our other cost
2975 	 * factors will make the path so expensive that it probably won't get
2976 	 * chosen anyway.)	So we don't use cost_rescan here.
2977 	 *
2978 	 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
2979 	 * of the generated Material node.
2980 	 */
2981 	mat_inner_cost = inner_run_cost +
2982 		cpu_operator_cost * inner_rows * rescanratio;
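
	/*
	 * For instance (made-up numbers): with inner_run_cost = 1000,
	 * rescanratio = 1.5 and inner_rows = 10000, bare_inner_cost is 1500,
	 * while at the default cpu_operator_cost (0.0025) mat_inner_cost is
	 * 1000 + 0.0025 * 10000 * 1.5 = 1037.5, so materializing would look
	 * cheaper in the comparison below.
	 */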
2983 
2984 	/*
2985 	 * If we don't need mark/restore at all, we don't need materialization.
2986 	 */
2987 	if (path->skip_mark_restore)
2988 		path->materialize_inner = false;
2989 
2990 	/*
2991 	 * Prefer materializing if it looks cheaper, unless the user has asked to
2992 	 * suppress materialization.
2993 	 */
2994 	else if (enable_material && mat_inner_cost < bare_inner_cost)
2995 		path->materialize_inner = true;
2996 
2997 	/*
2998 	 * Even if materializing doesn't look cheaper, we *must* do it if the
2999 	 * inner path is to be used directly (without sorting) and it doesn't
3000 	 * support mark/restore.
3001 	 *
3002 	 * Since the inner side must be ordered, and only Sorts and IndexScans can
3003 	 * create order to begin with, and they both support mark/restore, you
3004 	 * might think there's no problem --- but you'd be wrong.  Nestloop and
3005 	 * merge joins can *preserve* the order of their inputs, so they can be
3006 	 * selected as the input of a mergejoin, and they don't support
3007 	 * mark/restore at present.
3008 	 *
3009 	 * We don't test the value of enable_material here, because
3010 	 * materialization is required for correctness in this case, and turning
3011 	 * it off does not entitle us to deliver an invalid plan.
3012 	 */
3013 	else if (innersortkeys == NIL &&
3014 			 !ExecSupportsMarkRestore(inner_path))
3015 		path->materialize_inner = true;
3016 
3017 	/*
3018 	 * Also, force materializing if the inner path is to be sorted and the
3019 	 * sort is expected to spill to disk.  This is because the final merge
3020 	 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3021 	 * We don't try to adjust the cost estimates for this consideration,
3022 	 * though.
3023 	 *
3024 	 * Since materialization is a performance optimization in this case,
3025 	 * rather than necessary for correctness, we skip it if enable_material is
3026 	 * off.
3027 	 */
3028 	else if (enable_material && innersortkeys != NIL &&
3029 			 relation_byte_size(inner_path_rows,
3030 								inner_path->pathtarget->width) >
3031 			 (work_mem * 1024L))
3032 		path->materialize_inner = true;
3033 	else
3034 		path->materialize_inner = false;
3035 
3036 	/* Charge the right incremental cost for the chosen case */
3037 	if (path->materialize_inner)
3038 		run_cost += mat_inner_cost;
3039 	else
3040 		run_cost += bare_inner_cost;
3041 
3042 	/* CPU costs */
3043 
3044 	/*
3045 	 * The number of tuple comparisons needed is approximately number of outer
3046 	 * rows plus number of inner rows plus number of rescanned tuples (can we
3047 	 * refine this?).  At each one, we need to evaluate the mergejoin quals.
3048 	 */
3049 	startup_cost += merge_qual_cost.startup;
3050 	startup_cost += merge_qual_cost.per_tuple *
3051 		(outer_skip_rows + inner_skip_rows * rescanratio);
3052 	run_cost += merge_qual_cost.per_tuple *
3053 		((outer_rows - outer_skip_rows) +
3054 		 (inner_rows - inner_skip_rows) * rescanratio);
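
	/*
	 * That is, the startup charge covers the rows expected to be skipped
	 * before the first join pair is found (the *_skip_rows figures set up by
	 * initial_cost_mergejoin), and the run charge covers the remainder of
	 * both inputs.
	 */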
3055 
3056 	/*
3057 	 * For each tuple that gets through the mergejoin proper, we charge
3058 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3059 	 * clauses that are to be applied at the join.  (This is pessimistic since
3060 	 * not all of the quals may get evaluated at each tuple.)
3061 	 *
3062 	 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3063 	 * evaluations here, but it's probably not worth the trouble.
3064 	 */
3065 	startup_cost += qp_qual_cost.startup;
3066 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3067 	run_cost += cpu_per_tuple * mergejointuples;
3068 
3069 	/* tlist eval costs are paid per output row, not per tuple scanned */
3070 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3071 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3072 
3073 	path->jpath.path.startup_cost = startup_cost;
3074 	path->jpath.path.total_cost = startup_cost + run_cost;
3075 }
3076 
3077 /*
3078  * run mergejoinscansel() with caching
3079  */
3080 static MergeScanSelCache *
3081 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3082 {
3083 	MergeScanSelCache *cache;
3084 	ListCell   *lc;
3085 	Selectivity leftstartsel,
3086 				leftendsel,
3087 				rightstartsel,
3088 				rightendsel;
3089 	MemoryContext oldcontext;
3090 
3091 	/* Do we have this result already? */
3092 	foreach(lc, rinfo->scansel_cache)
3093 	{
3094 		cache = (MergeScanSelCache *) lfirst(lc);
3095 		if (cache->opfamily == pathkey->pk_opfamily &&
3096 			cache->collation == pathkey->pk_eclass->ec_collation &&
3097 			cache->strategy == pathkey->pk_strategy &&
3098 			cache->nulls_first == pathkey->pk_nulls_first)
3099 			return cache;
3100 	}
3101 
3102 	/* Nope, do the computation */
3103 	mergejoinscansel(root,
3104 					 (Node *) rinfo->clause,
3105 					 pathkey->pk_opfamily,
3106 					 pathkey->pk_strategy,
3107 					 pathkey->pk_nulls_first,
3108 					 &leftstartsel,
3109 					 &leftendsel,
3110 					 &rightstartsel,
3111 					 &rightendsel);
3112 
3113 	/* Cache the result in suitably long-lived workspace */
3114 	oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3115 
3116 	cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3117 	cache->opfamily = pathkey->pk_opfamily;
3118 	cache->collation = pathkey->pk_eclass->ec_collation;
3119 	cache->strategy = pathkey->pk_strategy;
3120 	cache->nulls_first = pathkey->pk_nulls_first;
3121 	cache->leftstartsel = leftstartsel;
3122 	cache->leftendsel = leftendsel;
3123 	cache->rightstartsel = rightstartsel;
3124 	cache->rightendsel = rightendsel;
3125 
3126 	rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3127 
3128 	MemoryContextSwitchTo(oldcontext);
3129 
3130 	return cache;
3131 }
3132 
3133 /*
3134  * initial_cost_hashjoin
3135  *	  Preliminary estimate of the cost of a hashjoin path.
3136  *
3137  * This must quickly produce lower-bound estimates of the path's startup and
3138  * total costs.  If we are unable to eliminate the proposed path from
3139  * consideration using the lower bounds, final_cost_hashjoin will be called
3140  * to obtain the final estimates.
3141  *
3142  * The exact division of labor between this function and final_cost_hashjoin
3143  * is private to them, and represents a tradeoff between speed of the initial
3144  * estimate and getting a tight lower bound.  We choose to not examine the
3145  * join quals here (other than by counting the number of hash clauses),
3146  * so we can't do much with CPU costs.  We do assume that
3147  * ExecChooseHashTableSize is cheap enough to use here.
3148  *
3149  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3150  *		other data to be used by final_cost_hashjoin
3151  * 'jointype' is the type of join to be performed
3152  * 'hashclauses' is the list of joinclauses to be used as hash clauses
3153  * 'outer_path' is the outer input to the join
3154  * 'inner_path' is the inner input to the join
3155  * 'extra' contains miscellaneous information about the join
3156  * 'parallel_hash' indicates that inner_path is partial and that a shared
3157  *		hash table will be built in parallel
3158  */
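/*
 * (In practice the join-path construction code in joinpath.c, e.g.
 * try_hashjoin_path, compares the workspace totals against existing paths
 * via add_path_precheck and only proceeds to final_cost_hashjoin if the
 * proposed path survives that check.)
 */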
3159 void
3160 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3161 					  JoinType jointype,
3162 					  List *hashclauses,
3163 					  Path *outer_path, Path *inner_path,
3164 					  JoinPathExtraData *extra,
3165 					  bool parallel_hash)
3166 {
3167 	Cost		startup_cost = 0;
3168 	Cost		run_cost = 0;
3169 	double		outer_path_rows = outer_path->rows;
3170 	double		inner_path_rows = inner_path->rows;
3171 	double		inner_path_rows_total = inner_path_rows;
3172 	int			num_hashclauses = list_length(hashclauses);
3173 	int			numbuckets;
3174 	int			numbatches;
3175 	int			num_skew_mcvs;
3176 	size_t		space_allowed;	/* unused */
3177 
3178 	/* cost of source data */
3179 	startup_cost += outer_path->startup_cost;
3180 	run_cost += outer_path->total_cost - outer_path->startup_cost;
3181 	startup_cost += inner_path->total_cost;
3182 
3183 	/*
3184 	 * Cost of computing hash function: must do it once per input tuple. We
3185 	 * charge one cpu_operator_cost for each column's hash function.  Also,
3186 	 * tack on one cpu_tuple_cost per inner row, to model the costs of
3187 	 * inserting the row into the hashtable.
3188 	 *
3189 	 * XXX when a hashclause is more complex than a single operator, we really
3190 	 * should charge the extra eval costs of the left or right side, as
3191 	 * appropriate, here.  This seems more work than it's worth at the moment.
3192 	 */
3193 	startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3194 		* inner_path_rows;
3195 	run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
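
	/*
	 * For example, at the default cpu_operator_cost (0.0025) and
	 * cpu_tuple_cost (0.01), a single hash clause over 100000 inner rows
	 * adds (0.0025 + 0.01) * 100000 = 1250 to startup_cost, and 100000
	 * outer rows add 0.0025 * 100000 = 250 to run_cost.
	 */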
3196 
3197 	/*
3198 	 * If this is a parallel hash build, then the value we have for
3199 	 * inner_rows_total currently refers only to the rows returned by each
3200 	 * participant.  For shared hash table size estimation, we need the total
3201 	 * number, so we need to undo the division.
3202 	 */
3203 	if (parallel_hash)
3204 		inner_path_rows_total *= get_parallel_divisor(inner_path);
3205 
3206 	/*
3207 	 * Get hash table size that executor would use for inner relation.
3208 	 *
3209 	 * XXX for the moment, always assume that skew optimization will be
3210 	 * performed.  As long as SKEW_WORK_MEM_PERCENT is small, it's not worth
3211 	 * trying to determine that for sure.
3212 	 *
3213 	 * XXX at some point it might be interesting to try to account for skew
3214 	 * optimization in the cost estimate, but for now, we don't.
3215 	 */
3216 	ExecChooseHashTableSize(inner_path_rows_total,
3217 							inner_path->pathtarget->width,
3218 							true,	/* useskew */
3219 							parallel_hash,	/* try_combined_work_mem */
3220 							outer_path->parallel_workers,
3221 							&space_allowed,
3222 							&numbuckets,
3223 							&numbatches,
3224 							&num_skew_mcvs);
3225 
3226 	/*
3227 	 * If inner relation is too big then we will need to "batch" the join,
3228 	 * which implies writing and reading most of the tuples to disk an extra
3229 	 * time.  Charge seq_page_cost per page, since the I/O should be nice and
3230 	 * sequential.  Writing the inner rel counts as startup cost, all the rest
3231 	 * as run cost.
3232 	 */
3233 	if (numbatches > 1)
3234 	{
3235 		double		outerpages = page_size(outer_path_rows,
3236 										   outer_path->pathtarget->width);
3237 		double		innerpages = page_size(inner_path_rows,
3238 										   inner_path->pathtarget->width);
3239 
3240 		startup_cost += seq_page_cost * innerpages;
3241 		run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3242 	}
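
	/*
	 * That is, the inner rel is written out once while the hash table is
	 * built (startup), then its batch files are read back once (innerpages),
	 * while the outer rel's batch files are both written and read during
	 * execution (2 * outerpages), giving the run charge above.
	 */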
3243 
3244 	/* CPU costs left for later */
3245 
3246 	/* Public result fields */
3247 	workspace->startup_cost = startup_cost;
3248 	workspace->total_cost = startup_cost + run_cost;
3249 	/* Save private data for final_cost_hashjoin */
3250 	workspace->run_cost = run_cost;
3251 	workspace->numbuckets = numbuckets;
3252 	workspace->numbatches = numbatches;
3253 	workspace->inner_rows_total = inner_path_rows_total;
3254 }
3255 
3256 /*
3257  * final_cost_hashjoin
3258  *	  Final estimate of the cost and result size of a hashjoin path.
3259  *
3260  * Note: the numbatches estimate is also saved into 'path' for use later
3261  *
3262  * 'path' is already filled in except for the rows and cost fields and
3263  *		num_batches
3264  * 'workspace' is the result from initial_cost_hashjoin
3265  * 'extra' contains miscellaneous information about the join
3266  */
3267 void
3268 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3269 					JoinCostWorkspace *workspace,
3270 					JoinPathExtraData *extra)
3271 {
3272 	Path	   *outer_path = path->jpath.outerjoinpath;
3273 	Path	   *inner_path = path->jpath.innerjoinpath;
3274 	double		outer_path_rows = outer_path->rows;
3275 	double		inner_path_rows = inner_path->rows;
3276 	double		inner_path_rows_total = workspace->inner_rows_total;
3277 	List	   *hashclauses = path->path_hashclauses;
3278 	Cost		startup_cost = workspace->startup_cost;
3279 	Cost		run_cost = workspace->run_cost;
3280 	int			numbuckets = workspace->numbuckets;
3281 	int			numbatches = workspace->numbatches;
3282 	Cost		cpu_per_tuple;
3283 	QualCost	hash_qual_cost;
3284 	QualCost	qp_qual_cost;
3285 	double		hashjointuples;
3286 	double		virtualbuckets;
3287 	Selectivity innerbucketsize;
3288 	Selectivity innermcvfreq;
3289 	ListCell   *hcl;
3290 
3291 	/* Mark the path with the correct row estimate */
3292 	if (path->jpath.path.param_info)
3293 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3294 	else
3295 		path->jpath.path.rows = path->jpath.path.parent->rows;
3296 
3297 	/* For partial paths, scale row estimate. */
3298 	if (path->jpath.path.parallel_workers > 0)
3299 	{
3300 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
3301 
3302 		path->jpath.path.rows =
3303 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
3304 	}
3305 
3306 	/*
3307 	 * We could include disable_cost in the preliminary estimate, but that
3308 	 * would amount to optimizing for the case where the join method is
3309 	 * disabled, which doesn't seem like the way to bet.
3310 	 */
3311 	if (!enable_hashjoin)
3312 		startup_cost += disable_cost;
3313 
3314 	/* mark the path with estimated # of batches */
3315 	path->num_batches = numbatches;
3316 
3317 	/* store the total number of tuples (sum of partial row estimates) */
3318 	path->inner_rows_total = inner_path_rows_total;
3319 
3320 	/* and compute the number of "virtual" buckets in the whole join */
3321 	virtualbuckets = (double) numbuckets * (double) numbatches;
3322 
3323 	/*
3324 	 * Determine bucketsize fraction and MCV frequency for the inner relation.
3325 	 * We use the smallest bucketsize or MCV frequency estimated for any
3326 	 * individual hashclause; this is undoubtedly conservative.
3327 	 *
3328 	 * BUT: if inner relation has been unique-ified, we can assume it's good
3329 	 * for hashing.  This is important both because it's the right answer, and
3330 	 * because we avoid contaminating the cache with a value that's wrong for
3331 	 * non-unique-ified paths.
3332 	 */
3333 	if (IsA(inner_path, UniquePath))
3334 	{
3335 		innerbucketsize = 1.0 / virtualbuckets;
3336 		innermcvfreq = 0.0;
3337 	}
3338 	else
3339 	{
3340 		innerbucketsize = 1.0;
3341 		innermcvfreq = 1.0;
3342 		foreach(hcl, hashclauses)
3343 		{
3344 			RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3345 			Selectivity thisbucketsize;
3346 			Selectivity thismcvfreq;
3347 
3348 			/*
3349 			 * First we have to figure out which side of the hashjoin clause
3350 			 * is the inner side.
3351 			 *
3352 			 * Since we tend to visit the same clauses over and over when
3353 			 * planning a large query, we cache the bucket stats estimates in
3354 			 * the RestrictInfo node to avoid repeated lookups of statistics.
3355 			 */
3356 			if (bms_is_subset(restrictinfo->right_relids,
3357 							  inner_path->parent->relids))
3358 			{
3359 				/* righthand side is inner */
3360 				thisbucketsize = restrictinfo->right_bucketsize;
3361 				if (thisbucketsize < 0)
3362 				{
3363 					/* not cached yet */
3364 					estimate_hash_bucket_stats(root,
3365 											   get_rightop(restrictinfo->clause),
3366 											   virtualbuckets,
3367 											   &restrictinfo->right_mcvfreq,
3368 											   &restrictinfo->right_bucketsize);
3369 					thisbucketsize = restrictinfo->right_bucketsize;
3370 				}
3371 				thismcvfreq = restrictinfo->right_mcvfreq;
3372 			}
3373 			else
3374 			{
3375 				Assert(bms_is_subset(restrictinfo->left_relids,
3376 									 inner_path->parent->relids));
3377 				/* lefthand side is inner */
3378 				thisbucketsize = restrictinfo->left_bucketsize;
3379 				if (thisbucketsize < 0)
3380 				{
3381 					/* not cached yet */
3382 					estimate_hash_bucket_stats(root,
3383 											   get_leftop(restrictinfo->clause),
3384 											   virtualbuckets,
3385 											   &restrictinfo->left_mcvfreq,
3386 											   &restrictinfo->left_bucketsize);
3387 					thisbucketsize = restrictinfo->left_bucketsize;
3388 				}
3389 				thismcvfreq = restrictinfo->left_mcvfreq;
3390 			}
3391 
3392 			if (innerbucketsize > thisbucketsize)
3393 				innerbucketsize = thisbucketsize;
3394 			if (innermcvfreq > thismcvfreq)
3395 				innermcvfreq = thismcvfreq;
3396 		}
3397 	}
3398 
3399 	/*
3400 	 * If the bucket holding the inner MCV would exceed work_mem, we don't
3401 	 * want to hash unless there is really no other alternative, so apply
3402 	 * disable_cost.  (The executor normally copes with excessive memory usage
3403 	 * by splitting batches, but obviously it cannot separate equal values
3404 	 * that way, so it will be unable to drive the batch size below work_mem
3405 	 * when this is true.)
3406 	 */
3407 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3408 						   inner_path->pathtarget->width) >
3409 		(work_mem * 1024L))
3410 		startup_cost += disable_cost;
3411 
3412 	/*
3413 	 * Compute cost of the hashquals and qpquals (other restriction clauses)
3414 	 * separately.
3415 	 */
3416 	cost_qual_eval(&hash_qual_cost, hashclauses, root);
3417 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3418 	qp_qual_cost.startup -= hash_qual_cost.startup;
3419 	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
3420 
3421 	/* CPU costs */
3422 
3423 	if (path->jpath.jointype == JOIN_SEMI ||
3424 		path->jpath.jointype == JOIN_ANTI ||
3425 		extra->inner_unique)
3426 	{
3427 		double		outer_matched_rows;
3428 		Selectivity inner_scan_frac;
3429 
3430 		/*
3431 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3432 		 * executor will stop after the first match.
3433 		 *
3434 		 * For an outer-rel row that has at least one match, we can expect the
3435 		 * bucket scan to stop after a fraction 1/(match_count+1) of the
3436 		 * bucket's rows, if the matches are evenly distributed.  Since they
3437 		 * probably aren't quite evenly distributed, we apply a fuzz factor of
3438 		 * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
3439 		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
3440 		 * at least 1, no such clamp is needed now.)
3441 		 */
3442 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3443 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
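
		/*
		 * For example, if match_count is 3, inner_scan_frac is 2.0 / 4.0 =
		 * 0.5, i.e. we expect to look at about half of each matched bucket.
		 */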
3444 
3445 		startup_cost += hash_qual_cost.startup;
3446 		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
3447 			clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
3448 
3449 		/*
3450 		 * For unmatched outer-rel rows, the picture is quite a lot different.
3451 		 * In the first place, there is no reason to assume that these rows
3452 		 * preferentially hit heavily-populated buckets; instead assume they
3453 		 * are uncorrelated with the inner distribution and so they see an
3454 		 * average bucket size of inner_path_rows / virtualbuckets.  In the
3455 		 * second place, it seems likely that they will have few if any exact
3456 		 * hash-code matches and so very few of the tuples in the bucket will
3457 		 * actually require eval of the hash quals.  We don't have any good
3458 		 * way to estimate how many will, but for the moment assume that the
3459 		 * effective cost per bucket entry is one-tenth what it is for
3460 		 * matchable tuples.
3461 		 */
3462 		run_cost += hash_qual_cost.per_tuple *
3463 			(outer_path_rows - outer_matched_rows) *
3464 			clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
3465 
3466 		/* Get # of tuples that will pass the basic join */
3467 		if (path->jpath.jointype == JOIN_ANTI)
3468 			hashjointuples = outer_path_rows - outer_matched_rows;
3469 		else
3470 			hashjointuples = outer_matched_rows;
3471 	}
3472 	else
3473 	{
3474 		/*
3475 		 * The number of tuple comparisons needed is the number of outer
3476 		 * tuples times the typical number of tuples in a hash bucket, which
3477 		 * is the inner relation size times its bucketsize fraction.  At each
3478 		 * one, we need to evaluate the hashjoin quals.  But actually,
3479 		 * charging the full qual eval cost at each tuple is pessimistic,
3480 		 * since we don't evaluate the quals unless the hash values match
3481 		 * exactly.  For lack of a better idea, halve the cost estimate to
3482 		 * allow for that.
3483 		 */
3484 		startup_cost += hash_qual_cost.startup;
3485 		run_cost += hash_qual_cost.per_tuple * outer_path_rows *
3486 			clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
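
		/*
		 * For example (made-up numbers), with 1000 outer rows, 100000 inner
		 * rows and innerbucketsize = 0.001, the hashquals are charged for
		 * about 1000 * 100 * 0.5 = 50000 comparisons.
		 */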
3487 
3488 		/*
3489 		 * Get approx # tuples passing the hashquals.  We use
3490 		 * approx_tuple_count here because we need an estimate done with
3491 		 * JOIN_INNER semantics.
3492 		 */
3493 		hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
3494 	}
3495 
3496 	/*
3497 	 * For each tuple that gets through the hashjoin proper, we charge
3498 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3499 	 * clauses that are to be applied at the join.  (This is pessimistic since
3500 	 * not all of the quals may get evaluated at each tuple.)
3501 	 */
3502 	startup_cost += qp_qual_cost.startup;
3503 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3504 	run_cost += cpu_per_tuple * hashjointuples;
3505 
3506 	/* tlist eval costs are paid per output row, not per tuple scanned */
3507 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3508 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3509 
3510 	path->jpath.path.startup_cost = startup_cost;
3511 	path->jpath.path.total_cost = startup_cost + run_cost;
3512 }
3513 
3514 
3515 /*
3516  * cost_subplan
3517  *		Figure the costs for a SubPlan (or initplan).
3518  *
3519  * Note: we could dig the subplan's Plan out of the root list, but in practice
3520  * all callers have it handy already, so we make them pass it.
3521  */
3522 void
3523 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3524 {
3525 	QualCost	sp_cost;
3526 
3527 	/* Figure any cost for evaluating the testexpr */
3528 	cost_qual_eval(&sp_cost,
3529 				   make_ands_implicit((Expr *) subplan->testexpr),
3530 				   root);
3531 
3532 	if (subplan->useHashTable)
3533 	{
3534 		/*
3535 		 * If we are using a hash table for the subquery outputs, then the
3536 		 * cost of evaluating the query is a one-time cost.  We charge one
3537 		 * cpu_operator_cost per tuple for the work of loading the hashtable,
3538 		 * too.
3539 		 */
3540 		sp_cost.startup += plan->total_cost +
3541 			cpu_operator_cost * plan->plan_rows;
3542 
3543 		/*
3544 		 * The per-tuple costs include the cost of evaluating the lefthand
3545 		 * expressions, plus the cost of probing the hashtable.  We already
3546 		 * accounted for the lefthand expressions as part of the testexpr, and
3547 		 * will also have counted one cpu_operator_cost for each comparison
3548 		 * operator.  That is probably too low for the probing cost, but it's
3549 		 * hard to make a better estimate, so live with it for now.
3550 		 */
3551 	}
3552 	else
3553 	{
3554 		/*
3555 		 * Otherwise we will be rescanning the subplan output on each
3556 		 * evaluation.  We need to estimate how much of the output we will
3557 		 * actually need to scan.  NOTE: this logic should agree with the
3558 		 * tuple_fraction estimates used by make_subplan() in
3559 		 * plan/subselect.c.
3560 		 */
3561 		Cost		plan_run_cost = plan->total_cost - plan->startup_cost;
3562 
3563 		if (subplan->subLinkType == EXISTS_SUBLINK)
3564 		{
3565 			/* we only need to fetch 1 tuple; clamp to avoid zero divide */
3566 			sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3567 		}
3568 		else if (subplan->subLinkType == ALL_SUBLINK ||
3569 				 subplan->subLinkType == ANY_SUBLINK)
3570 		{
3571 			/* assume we need 50% of the tuples */
3572 			sp_cost.per_tuple += 0.50 * plan_run_cost;
3573 			/* also charge a cpu_operator_cost per row examined */
3574 			sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3575 		}
3576 		else
3577 		{
3578 			/* assume we need all tuples */
3579 			sp_cost.per_tuple += plan_run_cost;
3580 		}
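
		/*
		 * Illustration (made-up numbers): with plan_run_cost = 100 and
		 * plan_rows = 1000, an EXISTS sublink is charged 0.1 per call, while
		 * an ANY sublink is charged 50 plus 0.5 * 1000 * cpu_operator_cost,
		 * i.e. 51.25 per call at the default setting.
		 */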
3581 
3582 		/*
3583 		 * Also account for subplan's startup cost. If the subplan is
3584 		 * uncorrelated or undirect correlated, AND its topmost node is one
3585 		 * that materializes its output, assume that we'll only need to pay
3586 		 * its startup cost once; otherwise assume we pay the startup cost
3587 		 * every time.
3588 		 */
3589 		if (subplan->parParam == NIL &&
3590 			ExecMaterializesOutput(nodeTag(plan)))
3591 			sp_cost.startup += plan->startup_cost;
3592 		else
3593 			sp_cost.per_tuple += plan->startup_cost;
3594 	}
3595 
3596 	subplan->startup_cost = sp_cost.startup;
3597 	subplan->per_call_cost = sp_cost.per_tuple;
3598 }
3599 
3600 
3601 /*
3602  * cost_rescan
3603  *		Given a finished Path, estimate the costs of rescanning it after
3604  *		having done so the first time.  For some Path types a rescan is
3605  *		cheaper than an original scan (if no parameters change), and this
3606  *		function embodies knowledge about that.  The default is to return
3607  *		the same costs stored in the Path.  (Note that the cost estimates
3608  *		actually stored in Paths are always for first scans.)
3609  *
3610  * This function is not currently intended to model effects such as rescans
3611  * being cheaper due to disk block caching; what we are concerned with is
3612  * plan types wherein the executor caches results explicitly, or doesn't
3613  * redo startup calculations, etc.
3614  */
3615 static void
3616 cost_rescan(PlannerInfo *root, Path *path,
3617 			Cost *rescan_startup_cost,	/* output parameters */
3618 			Cost *rescan_total_cost)
3619 {
3620 	switch (path->pathtype)
3621 	{
3622 		case T_FunctionScan:
3623 
3624 			/*
3625 			 * Currently, nodeFunctionscan.c always executes the function to
3626 			 * completion before returning any rows, and caches the results in
3627 			 * a tuplestore.  So the function eval cost is all startup cost
3628 			 * and isn't paid over again on rescans. However, all run costs
3629 			 * will be paid over again.
3630 			 */
3631 			*rescan_startup_cost = 0;
3632 			*rescan_total_cost = path->total_cost - path->startup_cost;
3633 			break;
3634 		case T_HashJoin:
3635 
3636 			/*
3637 			 * If it's a single-batch join, we don't need to rebuild the hash
3638 			 * table during a rescan.
3639 			 */
3640 			if (((HashPath *) path)->num_batches == 1)
3641 			{
3642 				/* Startup cost is exactly the cost of hash table building */
3643 				*rescan_startup_cost = 0;
3644 				*rescan_total_cost = path->total_cost - path->startup_cost;
3645 			}
3646 			else
3647 			{
3648 				/* Otherwise, no special treatment */
3649 				*rescan_startup_cost = path->startup_cost;
3650 				*rescan_total_cost = path->total_cost;
3651 			}
3652 			break;
3653 		case T_CteScan:
3654 		case T_WorkTableScan:
3655 			{
3656 				/*
3657 				 * These plan types materialize their final result in a
3658 				 * tuplestore or tuplesort object.  So the rescan cost is only
3659 				 * cpu_tuple_cost per tuple, unless the result is large enough
3660 				 * to spill to disk.
3661 				 */
3662 				Cost		run_cost = cpu_tuple_cost * path->rows;
3663 				double		nbytes = relation_byte_size(path->rows,
3664 														path->pathtarget->width);
3665 				long		work_mem_bytes = work_mem * 1024L;
3666 
3667 				if (nbytes > work_mem_bytes)
3668 				{
3669 					/* It will spill, so account for re-read cost */
3670 					double		npages = ceil(nbytes / BLCKSZ);
3671 
3672 					run_cost += seq_page_cost * npages;
3673 				}
3674 				*rescan_startup_cost = 0;
3675 				*rescan_total_cost = run_cost;
3676 			}
3677 			break;
3678 		case T_Material:
3679 		case T_Sort:
3680 			{
3681 				/*
3682 				 * These plan types not only materialize their results, but do
3683 				 * not implement qual filtering or projection.  So they are
3684 				 * even cheaper to rescan than the ones above.  We charge only
3685 				 * cpu_operator_cost per tuple.  (Note: keep that in sync with
3686 				 * the run_cost charge in cost_sort, and also see comments in
3687 				 * cost_material before you change it.)
3688 				 */
3689 				Cost		run_cost = cpu_operator_cost * path->rows;
3690 				double		nbytes = relation_byte_size(path->rows,
3691 														path->pathtarget->width);
3692 				long		work_mem_bytes = work_mem * 1024L;
3693 
3694 				if (nbytes > work_mem_bytes)
3695 				{
3696 					/* It will spill, so account for re-read cost */
3697 					double		npages = ceil(nbytes / BLCKSZ);
3698 
3699 					run_cost += seq_page_cost * npages;
3700 				}
3701 				*rescan_startup_cost = 0;
3702 				*rescan_total_cost = run_cost;
3703 			}
3704 			break;
3705 		default:
3706 			*rescan_startup_cost = path->startup_cost;
3707 			*rescan_total_cost = path->total_cost;
3708 			break;
3709 	}
3710 }
3711 
3712 
3713 /*
3714  * cost_qual_eval
3715  *		Estimate the CPU costs of evaluating a WHERE clause.
3716  *		The input can be either an implicitly-ANDed list of boolean
3717  *		expressions, or a list of RestrictInfo nodes.  (The latter is
3718  *		preferred since it allows caching of the results.)
3719  *		The result includes both a one-time (startup) component,
3720  *		and a per-evaluation component.
3721  */
3722 void
3723 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
3724 {
3725 	cost_qual_eval_context context;
3726 	ListCell   *l;
3727 
3728 	context.root = root;
3729 	context.total.startup = 0;
3730 	context.total.per_tuple = 0;
3731 
3732 	/* We don't charge any cost for the implicit ANDing at top level ... */
3733 
3734 	foreach(l, quals)
3735 	{
3736 		Node	   *qual = (Node *) lfirst(l);
3737 
3738 		cost_qual_eval_walker(qual, &context);
3739 	}
3740 
3741 	*cost = context.total;
3742 }
3743 
3744 /*
3745  * cost_qual_eval_node
3746  *		As above, for a single RestrictInfo or expression.
3747  */
3748 void
3749 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
3750 {
3751 	cost_qual_eval_context context;
3752 
3753 	context.root = root;
3754 	context.total.startup = 0;
3755 	context.total.per_tuple = 0;
3756 
3757 	cost_qual_eval_walker(qual, &context);
3758 
3759 	*cost = context.total;
3760 }
3761 
3762 static bool
3763 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
3764 {
3765 	if (node == NULL)
3766 		return false;
3767 
3768 	/*
3769 	 * RestrictInfo nodes contain an eval_cost field reserved for this
3770 	 * routine's use, so that it's not necessary to evaluate the qual clause's
3771 	 * cost more than once.  If the clause's cost hasn't been computed yet,
3772 	 * the field's startup value will contain -1.
3773 	 */
3774 	if (IsA(node, RestrictInfo))
3775 	{
3776 		RestrictInfo *rinfo = (RestrictInfo *) node;
3777 
3778 		if (rinfo->eval_cost.startup < 0)
3779 		{
3780 			cost_qual_eval_context locContext;
3781 
3782 			locContext.root = context->root;
3783 			locContext.total.startup = 0;
3784 			locContext.total.per_tuple = 0;
3785 
3786 			/*
3787 			 * For an OR clause, recurse into the marked-up tree so that we
3788 			 * set the eval_cost for contained RestrictInfos too.
3789 			 */
3790 			if (rinfo->orclause)
3791 				cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
3792 			else
3793 				cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
3794 
3795 			/*
3796 			 * If the RestrictInfo is marked pseudoconstant, it will be tested
3797 			 * only once, so treat its cost as all startup cost.
3798 			 */
3799 			if (rinfo->pseudoconstant)
3800 			{
3801 				/* count one execution during startup */
3802 				locContext.total.startup += locContext.total.per_tuple;
3803 				locContext.total.per_tuple = 0;
3804 			}
3805 			rinfo->eval_cost = locContext.total;
3806 		}
3807 		context->total.startup += rinfo->eval_cost.startup;
3808 		context->total.per_tuple += rinfo->eval_cost.per_tuple;
3809 		/* do NOT recurse into children */
3810 		return false;
3811 	}
3812 
3813 	/*
3814 	 * For each operator or function node in the given tree, we charge the
3815 	 * estimated execution cost given by pg_proc.procost (remember to multiply
3816 	 * this by cpu_operator_cost).
3817 	 *
3818 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
3819 	 * OR, NOT). Simplistic, but a lot better than no model at all.
3820 	 *
3821 	 * Should we try to account for the possibility of short-circuit
3822 	 * evaluation of AND/OR?  Probably *not*, because that would make the
3823 	 * results depend on the clause ordering, and we are not in any position
3824 	 * to expect that the current ordering of the clauses is the one that's
3825 	 * going to end up being used.  The above per-RestrictInfo caching would
3826 	 * not mix well with trying to re-order clauses anyway.
3827 	 *
3828 	 * Another issue that is entirely ignored here is that if a set-returning
3829 	 * function is below top level in the tree, the functions/operators above
3830 	 * it will need to be evaluated multiple times.  In practical use, such
3831 	 * cases arise so seldom as to not be worth the added complexity needed;
3832 	 * moreover, since our rowcount estimates for functions tend to be pretty
3833 	 * phony, the results would also be pretty phony.
3834 	 */
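	/*
	 * For example, most built-in functions have pg_proc.procost = 1 and so
	 * add one cpu_operator_cost (0.0025 by default) per evaluation, while a
	 * function declared with COST 100 adds 0.25.
	 */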
3835 	if (IsA(node, FuncExpr))
3836 	{
3837 		context->total.per_tuple +=
3838 			get_func_cost(((FuncExpr *) node)->funcid) * cpu_operator_cost;
3839 	}
3840 	else if (IsA(node, OpExpr) ||
3841 			 IsA(node, DistinctExpr) ||
3842 			 IsA(node, NullIfExpr))
3843 	{
3844 		/* rely on struct equivalence to treat these all alike */
3845 		set_opfuncid((OpExpr *) node);
3846 		context->total.per_tuple +=
3847 			get_func_cost(((OpExpr *) node)->opfuncid) * cpu_operator_cost;
3848 	}
3849 	else if (IsA(node, ScalarArrayOpExpr))
3850 	{
3851 		/*
3852 		 * Estimate that the operator will be applied to about half of the
3853 		 * array elements before the answer is determined.
3854 		 */
3855 		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
3856 		Node	   *arraynode = (Node *) lsecond(saop->args);
3857 
3858 		set_sa_opfuncid(saop);
3859 		context->total.per_tuple += get_func_cost(saop->opfuncid) *
3860 			cpu_operator_cost * estimate_array_length(arraynode) * 0.5;
3861 	}
3862 	else if (IsA(node, Aggref) ||
3863 			 IsA(node, WindowFunc))
3864 	{
3865 		/*
3866 		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
3867 		 * ie, zero execution cost in the current model, because they behave
3868 		 * essentially like Vars at execution.  We disregard the costs of
3869 		 * their input expressions for the same reason.  The actual execution
3870 		 * costs of the aggregate/window functions and their arguments have to
3871 		 * be factored into plan-node-specific costing of the Agg or WindowAgg
3872 		 * plan node.
3873 		 */
3874 		return false;			/* don't recurse into children */
3875 	}
3876 	else if (IsA(node, CoerceViaIO))
3877 	{
3878 		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
3879 		Oid			iofunc;
3880 		Oid			typioparam;
3881 		bool		typisvarlena;
3882 
3883 		/* check the result type's input function */
3884 		getTypeInputInfo(iocoerce->resulttype,
3885 						 &iofunc, &typioparam);
3886 		context->total.per_tuple += get_func_cost(iofunc) * cpu_operator_cost;
3887 		/* check the input type's output function */
3888 		getTypeOutputInfo(exprType((Node *) iocoerce->arg),
3889 						  &iofunc, &typisvarlena);
3890 		context->total.per_tuple += get_func_cost(iofunc) * cpu_operator_cost;
3891 	}
3892 	else if (IsA(node, ArrayCoerceExpr))
3893 	{
3894 		ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
3895 		QualCost	perelemcost;
3896 
3897 		cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
3898 							context->root);
3899 		context->total.startup += perelemcost.startup;
3900 		if (perelemcost.per_tuple > 0)
3901 			context->total.per_tuple += perelemcost.per_tuple *
3902 				estimate_array_length((Node *) acoerce->arg);
3903 	}
3904 	else if (IsA(node, RowCompareExpr))
3905 	{
3906 		/* Conservatively assume we will check all the columns */
3907 		RowCompareExpr *rcexpr = (RowCompareExpr *) node;
3908 		ListCell   *lc;
3909 
3910 		foreach(lc, rcexpr->opnos)
3911 		{
3912 			Oid			opid = lfirst_oid(lc);
3913 
3914 			context->total.per_tuple += get_func_cost(get_opcode(opid)) *
3915 				cpu_operator_cost;
3916 		}
3917 	}
3918 	else if (IsA(node, MinMaxExpr) ||
3919 			 IsA(node, SQLValueFunction) ||
3920 			 IsA(node, XmlExpr) ||
3921 			 IsA(node, CoerceToDomain) ||
3922 			 IsA(node, NextValueExpr))
3923 	{
3924 		/* Treat all these as having cost 1 */
3925 		context->total.per_tuple += cpu_operator_cost;
3926 	}
3927 	else if (IsA(node, CurrentOfExpr))
3928 	{
3929 		/* Report high cost to prevent selection of anything but TID scan */
3930 		context->total.startup += disable_cost;
3931 	}
3932 	else if (IsA(node, SubLink))
3933 	{
3934 		/* This routine should not be applied to un-planned expressions */
3935 		elog(ERROR, "cannot handle unplanned sub-select");
3936 	}
3937 	else if (IsA(node, SubPlan))
3938 	{
3939 		/*
3940 		 * A subplan node in an expression typically indicates that the
3941 		 * subplan will be executed on each evaluation, so charge accordingly.
3942 		 * (Sub-selects that can be executed as InitPlans have already been
3943 		 * removed from the expression.)
3944 		 */
3945 		SubPlan    *subplan = (SubPlan *) node;
3946 
3947 		context->total.startup += subplan->startup_cost;
3948 		context->total.per_tuple += subplan->per_call_cost;
3949 
3950 		/*
3951 		 * We don't want to recurse into the testexpr, because it was already
3952 		 * counted in the SubPlan node's costs.  So we're done.
3953 		 */
3954 		return false;
3955 	}
3956 	else if (IsA(node, AlternativeSubPlan))
3957 	{
3958 		/*
3959 		 * Arbitrarily use the first alternative plan for costing.  (We should
3960 		 * certainly only include one alternative, and we don't yet have
3961 		 * enough information to know which one the executor is most likely to
3962 		 * use.)
3963 		 */
3964 		AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
3965 
3966 		return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
3967 									 context);
3968 	}
3969 	else if (IsA(node, PlaceHolderVar))
3970 	{
3971 		/*
3972 		 * A PlaceHolderVar should be given cost zero when considering general
3973 		 * expression evaluation costs.  The expense of doing the contained
3974 		 * expression is charged as part of the tlist eval costs of the scan
3975 		 * or join where the PHV is first computed (see set_rel_width and
3976 		 * add_placeholders_to_joinrel).  If we charged it again here, we'd be
3977 		 * double-counting the cost for each level of plan that the PHV
3978 		 * bubbles up through.  Hence, return without recursing into the
3979 		 * phexpr.
3980 		 */
3981 		return false;
3982 	}
3983 
3984 	/* recurse into children */
3985 	return expression_tree_walker(node, cost_qual_eval_walker,
3986 								  (void *) context);
3987 }
3988 
3989 /*
3990  * get_restriction_qual_cost
3991  *	  Compute evaluation costs of a baserel's restriction quals, plus any
3992  *	  movable join quals that have been pushed down to the scan.
3993  *	  Results are returned into *qpqual_cost.
3994  *
3995  * This is a convenience subroutine that works for seqscans and other cases
3996  * where all the given quals will be evaluated the hard way.  It's not useful
3997  * for cost_index(), for example, where the index machinery takes care of
3998  * some of the quals.  We assume baserestrictcost was previously set by
3999  * set_baserel_size_estimates().
4000  */
4001 static void
4002 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4003 						  ParamPathInfo *param_info,
4004 						  QualCost *qpqual_cost)
4005 {
4006 	if (param_info)
4007 	{
4008 		/* Include costs of pushed-down clauses */
4009 		cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4010 
4011 		qpqual_cost->startup += baserel->baserestrictcost.startup;
4012 		qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4013 	}
4014 	else
4015 		*qpqual_cost = baserel->baserestrictcost;
4016 }
4017 
4018 
4019 /*
4020  * compute_semi_anti_join_factors
4021  *	  Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4022  *	  can be expected to scan.
4023  *
4024  * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4025  * inner rows as soon as it finds a match to the current outer row.
4026  * The same happens if we have detected the inner rel is unique.
4027  * We should therefore adjust some of the cost components for this effect.
4028  * This function computes some estimates needed for these adjustments.
4029  * These estimates will be the same regardless of the particular paths used
4030  * for the outer and inner relation, so we compute these once and then pass
4031  * them to all the join cost estimation functions.
4032  *
4033  * Input parameters:
4034  *	joinrel: join relation under consideration
4035  *	outerrel: outer relation under consideration
4036  *	innerrel: inner relation under consideration
4037  *	jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4038  *	sjinfo: SpecialJoinInfo relevant to this join
4039  *	restrictlist: join quals
4040  * Output parameters:
4041  *	*semifactors is filled in (see relation.h for field definitions)
4042  */
4043 void
4044 compute_semi_anti_join_factors(PlannerInfo *root,
4045 							   RelOptInfo *joinrel,
4046 							   RelOptInfo *outerrel,
4047 							   RelOptInfo *innerrel,
4048 							   JoinType jointype,
4049 							   SpecialJoinInfo *sjinfo,
4050 							   List *restrictlist,
4051 							   SemiAntiJoinFactors *semifactors)
4052 {
4053 	Selectivity jselec;
4054 	Selectivity nselec;
4055 	Selectivity avgmatch;
4056 	SpecialJoinInfo norm_sjinfo;
4057 	List	   *joinquals;
4058 	ListCell   *l;
4059 
4060 	/*
4061 	 * In an ANTI join, we must ignore clauses that are "pushed down", since
4062 	 * those won't affect the match logic.  In a SEMI join, we do not
4063 	 * distinguish joinquals from "pushed down" quals, so just use the whole
4064 	 * restrictinfo list.  For other outer join types, we should consider only
4065 	 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4066 	 */
4067 	if (IS_OUTER_JOIN(jointype))
4068 	{
4069 		joinquals = NIL;
4070 		foreach(l, restrictlist)
4071 		{
4072 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4073 
4074 			if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4075 				joinquals = lappend(joinquals, rinfo);
4076 		}
4077 	}
4078 	else
4079 		joinquals = restrictlist;
4080 
4081 	/*
4082 	 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4083 	 */
4084 	jselec = clauselist_selectivity(root,
4085 									joinquals,
4086 									0,
4087 									(jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4088 									sjinfo);
4089 
4090 	/*
4091 	 * Also get the normal inner-join selectivity of the join clauses.
4092 	 */
4093 	norm_sjinfo.type = T_SpecialJoinInfo;
4094 	norm_sjinfo.min_lefthand = outerrel->relids;
4095 	norm_sjinfo.min_righthand = innerrel->relids;
4096 	norm_sjinfo.syn_lefthand = outerrel->relids;
4097 	norm_sjinfo.syn_righthand = innerrel->relids;
4098 	norm_sjinfo.jointype = JOIN_INNER;
4099 	/* we don't bother trying to make the remaining fields valid */
4100 	norm_sjinfo.lhs_strict = false;
4101 	norm_sjinfo.delay_upper_joins = false;
4102 	norm_sjinfo.semi_can_btree = false;
4103 	norm_sjinfo.semi_can_hash = false;
4104 	norm_sjinfo.semi_operators = NIL;
4105 	norm_sjinfo.semi_rhs_exprs = NIL;
4106 
4107 	nselec = clauselist_selectivity(root,
4108 									joinquals,
4109 									0,
4110 									JOIN_INNER,
4111 									&norm_sjinfo);
4112 
4113 	/* Avoid leaking a lot of ListCells */
4114 	if (IS_OUTER_JOIN(jointype))
4115 		list_free(joinquals);
4116 
4117 	/*
4118 	 * jselec can be interpreted as the fraction of outer-rel rows that have
4119 	 * any matches (this is true for both SEMI and ANTI cases).  And nselec is
4120 	 * the fraction of the Cartesian product that matches.  So, the average
4121 	 * number of matches for each outer-rel row that has at least one match is
4122 	 * nselec * inner_rows / jselec.
4123 	 *
4124 	 * Note: it is correct to use the inner rel's "rows" count here, even
4125 	 * though we might later be considering a parameterized inner path with
4126 	 * fewer rows.  This is because we have included all the join clauses in
4127 	 * the selectivity estimate.
4128 	 */
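	/*
	 * For example (made-up numbers): if jselec = 0.1, nselec = 0.001 and the
	 * inner rel has 1000 rows, then avgmatch = 0.001 * 1000 / 0.1 = 10
	 * matches per matched outer row.
	 */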
4129 	if (jselec > 0)				/* protect against zero divide */
4130 	{
4131 		avgmatch = nselec * innerrel->rows / jselec;
4132 		/* Clamp to sane range */
4133 		avgmatch = Max(1.0, avgmatch);
4134 	}
4135 	else
4136 		avgmatch = 1.0;
4137 
4138 	semifactors->outer_match_frac = jselec;
4139 	semifactors->match_count = avgmatch;
4140 }
4141 
4142 /*
4143  * has_indexed_join_quals
4144  *	  Check whether all the joinquals of a nestloop join are used as
4145  *	  inner index quals.
4146  *
4147  * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4148  * indexscan) that uses all the joinquals as indexquals, we can assume that an
4149  * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4150  * expensive.
4151  */
4152 static bool
4153 has_indexed_join_quals(NestPath *joinpath)
4154 {
4155 	Relids		joinrelids = joinpath->path.parent->relids;
4156 	Path	   *innerpath = joinpath->innerjoinpath;
4157 	List	   *indexclauses;
4158 	bool		found_one;
4159 	ListCell   *lc;
4160 
4161 	/* If join still has quals to evaluate, it's not fast */
4162 	if (joinpath->joinrestrictinfo != NIL)
4163 		return false;
4164 	/* Nor if the inner path isn't parameterized at all */
4165 	if (innerpath->param_info == NULL)
4166 		return false;
4167 
4168 	/* Find the indexclauses list for the inner scan */
4169 	switch (innerpath->pathtype)
4170 	{
4171 		case T_IndexScan:
4172 		case T_IndexOnlyScan:
4173 			indexclauses = ((IndexPath *) innerpath)->indexclauses;
4174 			break;
4175 		case T_BitmapHeapScan:
4176 			{
4177 				/* Accept only a simple bitmap scan, not AND/OR cases */
4178 				Path	   *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4179 
4180 				if (IsA(bmqual, IndexPath))
4181 					indexclauses = ((IndexPath *) bmqual)->indexclauses;
4182 				else
4183 					return false;
4184 				break;
4185 			}
4186 		default:
4187 
4188 			/*
4189 			 * If it's not a simple indexscan, it probably doesn't run quickly
4190 			 * for zero rows out, even if it's a parameterized path using all
4191 			 * the joinquals.
4192 			 */
4193 			return false;
4194 	}
4195 
4196 	/*
4197 	 * Examine the inner path's param clauses.  Any that are from the outer
4198 	 * path must be found in the indexclauses list, either exactly or in an
4199 	 * equivalent form generated by equivclass.c.  Also, we must find at least
4200 	 * one such clause, else it's a clauseless join which isn't fast.
4201 	 */
4202 	found_one = false;
4203 	foreach(lc, innerpath->param_info->ppi_clauses)
4204 	{
4205 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4206 
4207 		if (join_clause_is_movable_into(rinfo,
4208 										innerpath->parent->relids,
4209 										joinrelids))
4210 		{
4211 			if (!(list_member_ptr(indexclauses, rinfo) ||
4212 				  is_redundant_derived_clause(rinfo, indexclauses)))
4213 				return false;
4214 			found_one = true;
4215 		}
4216 	}
4217 	return found_one;
4218 }
4219 
4220 
4221 /*
4222  * approx_tuple_count
4223  *		Quick-and-dirty estimation of the number of join rows passing
4224  *		a set of qual conditions.
4225  *
4226  * The quals can be either an implicitly-ANDed list of boolean expressions,
4227  * or a list of RestrictInfo nodes (typically the latter).
4228  *
4229  * We intentionally compute the selectivity under JOIN_INNER rules, even
4230  * if it's some type of outer join.  This is appropriate because we are
4231  * trying to figure out how many tuples pass the initial merge or hash
4232  * join step.
4233  *
4234  * This is quick-and-dirty because we bypass clauselist_selectivity, and
4235  * simply multiply the independent clause selectivities together.  Now
4236  * clauselist_selectivity often can't do any better than that anyhow, but
4237  * for some situations (such as range constraints) it is smarter.  However,
4238  * we can't effectively cache the results of clauselist_selectivity, whereas
4239  * the individual clause selectivities can be and are cached.
4240  *
4241  * Since we are only using the results to estimate how many potential
4242  * output tuples are generated and passed through qpqual checking, it
4243  * seems OK to live with the approximation.
4244  */
4245 static double
4246 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4247 {
4248 	double		tuples;
4249 	double		outer_tuples = path->outerjoinpath->rows;
4250 	double		inner_tuples = path->innerjoinpath->rows;
4251 	SpecialJoinInfo sjinfo;
4252 	Selectivity selec = 1.0;
4253 	ListCell   *l;
4254 
4255 	/*
4256 	 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4257 	 */
4258 	sjinfo.type = T_SpecialJoinInfo;
4259 	sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4260 	sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4261 	sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4262 	sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4263 	sjinfo.jointype = JOIN_INNER;
4264 	/* we don't bother trying to make the remaining fields valid */
4265 	sjinfo.lhs_strict = false;
4266 	sjinfo.delay_upper_joins = false;
4267 	sjinfo.semi_can_btree = false;
4268 	sjinfo.semi_can_hash = false;
4269 	sjinfo.semi_operators = NIL;
4270 	sjinfo.semi_rhs_exprs = NIL;
4271 
4272 	/* Get the approximate selectivity */
4273 	foreach(l, quals)
4274 	{
4275 		Node	   *qual = (Node *) lfirst(l);
4276 
4277 		/* Note that clause_selectivity will be able to cache its result */
4278 		selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4279 	}
4280 
4281 	/* Apply it to the input relation sizes */
4282 	tuples = selec * outer_tuples * inner_tuples;
4283 
4284 	return clamp_row_est(tuples);
4285 }
4286 
4287 
4288 /*
4289  * set_baserel_size_estimates
4290  *		Set the size estimates for the given base relation.
4291  *
4292  * The rel's targetlist and restrictinfo list must have been constructed
4293  * already, and rel->tuples must be set.
4294  *
4295  * We set the following fields of the rel node:
4296  *	rows: the estimated number of output tuples (after applying
4297  *		  restriction clauses).
4298  *	width: the estimated average output tuple width in bytes.
4299  *	baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4300  */
4301 void
4302 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4303 {
4304 	double		nrows;
4305 
4306 	/* Should only be applied to base relations */
4307 	Assert(rel->relid > 0);
4308 
4309 	nrows = rel->tuples *
4310 		clauselist_selectivity(root,
4311 							   rel->baserestrictinfo,
4312 							   0,
4313 							   JOIN_INNER,
4314 							   NULL);
4315 
4316 	rel->rows = clamp_row_est(nrows);
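
	/*
	 * For instance, a relation with 1000000 tuples whose restriction clauses
	 * have a combined selectivity of 0.005 ends up with rows = 5000 here.
	 */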
4317 
4318 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4319 
4320 	set_rel_width(root, rel);
4321 }
4322 
4323 /*
4324  * get_parameterized_baserel_size
4325  *		Make a size estimate for a parameterized scan of a base relation.
4326  *
4327  * 'param_clauses' lists the additional join clauses to be used.
4328  *
4329  * set_baserel_size_estimates must have been applied already.
4330  */
4331 double
4332 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4333 							   List *param_clauses)
4334 {
4335 	List	   *allclauses;
4336 	double		nrows;
4337 
4338 	/*
4339 	 * Estimate the number of rows returned by the parameterized scan, knowing
4340 	 * that it will apply all the extra join clauses as well as the rel's own
4341 	 * restriction clauses.  Note that we force the clauses to be treated as
4342 	 * non-join clauses during selectivity estimation.
4343 	 */
4344 	allclauses = list_concat(list_copy(param_clauses),
4345 							 rel->baserestrictinfo);
4346 	nrows = rel->tuples *
4347 		clauselist_selectivity(root,
4348 							   allclauses,
4349 							   rel->relid,	/* do not use 0! */
4350 							   JOIN_INNER,
4351 							   NULL);
4352 	nrows = clamp_row_est(nrows);
4353 	/* For safety, make sure result is not more than the base estimate */
4354 	if (nrows > rel->rows)
4355 		nrows = rel->rows;
4356 	return nrows;
4357 }
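/*
 * Example (made-up numbers, clauses assumed independent): rel->tuples =
 * 100000 with baserestrict selectivity 0.1 gives rel->rows = 10000; adding
 * a parameterizing join clause of selectivity 0.01 gives
 * 100000 * 0.1 * 0.01 = 100 rows, comfortably under the rel->rows ceiling
 * enforced above.
 */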
4358 
4359 /*
4360  * set_joinrel_size_estimates
4361  *		Set the size estimates for the given join relation.
4362  *
4363  * The rel's targetlist must have been constructed already, and a
4364  * restriction clause list that matches the given component rels must
4365  * be provided.
4366  *
4367  * Since there is more than one way to make a joinrel for more than two
4368  * base relations, the results we get here could depend on which component
4369  * rel pair is provided.  In theory we should get the same answers no matter
4370  * which pair is provided; in practice, since the selectivity estimation
4371  * routines don't handle all cases equally well, we might not.  But there's
4372  * not much to be done about it.  (Would it make sense to repeat the
4373  * calculations for each pair of input rels that's encountered, and somehow
4374  * average the results?  Probably way more trouble than it's worth, and
4375  * anyway we must keep the rowcount estimate the same for all paths for the
4376  * joinrel.)
4377  *
4378  * We set only the rows field here.  The reltarget field was already set by
4379  * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4380  */
4381 void
4382 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4383 						   RelOptInfo *outer_rel,
4384 						   RelOptInfo *inner_rel,
4385 						   SpecialJoinInfo *sjinfo,
4386 						   List *restrictlist)
4387 {
4388 	rel->rows = calc_joinrel_size_estimate(root,
4389 										   rel,
4390 										   outer_rel,
4391 										   inner_rel,
4392 										   outer_rel->rows,
4393 										   inner_rel->rows,
4394 										   sjinfo,
4395 										   restrictlist);
4396 }
4397 
4398 /*
4399  * get_parameterized_joinrel_size
4400  *		Make a size estimate for a parameterized scan of a join relation.
4401  *
4402  * 'rel' is the joinrel under consideration.
4403  * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4404  *		produce the relations being joined.
4405  * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4406  * 'restrict_clauses' lists the join clauses that need to be applied at the
4407  * join node (including any movable clauses that were moved down to this join,
4408  * and not including any movable clauses that were pushed down into the
4409  * child paths).
4410  *
4411  * set_joinrel_size_estimates must have been applied already.
4412  */
4413 double
4414 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4415 							   Path *outer_path,
4416 							   Path *inner_path,
4417 							   SpecialJoinInfo *sjinfo,
4418 							   List *restrict_clauses)
4419 {
4420 	double		nrows;
4421 
4422 	/*
4423 	 * Estimate the number of rows returned by the parameterized join as the
4424 	 * sizes of the input paths times the selectivity of the clauses that have
4425 	 * ended up at this join node.
4426 	 *
4427 	 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4428 	 * on the pair of input paths provided, though ideally we'd get the same
4429 	 * estimate for any pair with the same parameterization.
4430 	 */
4431 	nrows = calc_joinrel_size_estimate(root,
4432 									   rel,
4433 									   outer_path->parent,
4434 									   inner_path->parent,
4435 									   outer_path->rows,
4436 									   inner_path->rows,
4437 									   sjinfo,
4438 									   restrict_clauses);
4439 	/* For safety, make sure result is not more than the base estimate */
4440 	if (nrows > rel->rows)
4441 		nrows = rel->rows;
4442 	return nrows;
4443 }
4444 
4445 /*
4446  * calc_joinrel_size_estimate
4447  *		Workhorse for set_joinrel_size_estimates and
4448  *		get_parameterized_joinrel_size.
4449  *
4450  * outer_rel/inner_rel are the relations being joined, but they should be
4451  * assumed to have sizes outer_rows/inner_rows; those numbers might be less
4452  * than what rel->rows says, when we are considering parameterized paths.
4453  */
4454 static double
4455 calc_joinrel_size_estimate(PlannerInfo *root,
4456 						   RelOptInfo *joinrel,
4457 						   RelOptInfo *outer_rel,
4458 						   RelOptInfo *inner_rel,
4459 						   double outer_rows,
4460 						   double inner_rows,
4461 						   SpecialJoinInfo *sjinfo,
4462 						   List *restrictlist_in)
4463 {
4464 	/* This apparently-useless variable dodges a compiler bug in VS2013: */
4465 	List	   *restrictlist = restrictlist_in;
4466 	JoinType	jointype = sjinfo->jointype;
4467 	Selectivity fkselec;
4468 	Selectivity jselec;
4469 	Selectivity pselec;
4470 	double		nrows;
4471 
4472 	/*
4473 	 * Compute joinclause selectivity.  Note that we are only considering
4474 	 * clauses that become restriction clauses at this join level; we are not
4475 	 * double-counting them because they were not considered in estimating the
4476 	 * sizes of the component rels.
4477 	 *
4478 	 * First, see whether any of the joinclauses can be matched to known FK
4479 	 * constraints.  If so, drop those clauses from the restrictlist, and
4480 	 * instead estimate their selectivity using FK semantics.  (We do this
4481 	 * without regard to whether said clauses are local or "pushed down".
4482 	 * Probably, an FK-matching clause could never be seen as pushed down at
4483 	 * an outer join, since it would be strict and hence would be grounds for
4484 	 * join strength reduction.)  fkselec gets the net selectivity for
4485 	 * FK-matching clauses, or 1.0 if there are none.
4486 	 */
4487 	fkselec = get_foreign_key_join_selectivity(root,
4488 											   outer_rel->relids,
4489 											   inner_rel->relids,
4490 											   sjinfo,
4491 											   &restrictlist);
4492 
4493 	/*
4494 	 * For an outer join, we have to distinguish the selectivity of the join's
4495 	 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
4496 	 * down".  For inner joins we just count them all as joinclauses.
4497 	 */
4498 	if (IS_OUTER_JOIN(jointype))
4499 	{
4500 		List	   *joinquals = NIL;
4501 		List	   *pushedquals = NIL;
4502 		ListCell   *l;
4503 
4504 		/* Grovel through the clauses to separate into two lists */
4505 		foreach(l, restrictlist)
4506 		{
4507 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4508 
4509 			if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4510 				pushedquals = lappend(pushedquals, rinfo);
4511 			else
4512 				joinquals = lappend(joinquals, rinfo);
4513 		}
4514 
4515 		/* Get the separate selectivities */
4516 		jselec = clauselist_selectivity(root,
4517 										joinquals,
4518 										0,
4519 										jointype,
4520 										sjinfo);
4521 		pselec = clauselist_selectivity(root,
4522 										pushedquals,
4523 										0,
4524 										jointype,
4525 										sjinfo);
4526 
4527 		/* Avoid leaking a lot of ListCells */
4528 		list_free(joinquals);
4529 		list_free(pushedquals);
4530 	}
4531 	else
4532 	{
4533 		jselec = clauselist_selectivity(root,
4534 										restrictlist,
4535 										0,
4536 										jointype,
4537 										sjinfo);
4538 		pselec = 0.0;			/* not used, keep compiler quiet */
4539 	}
4540 
4541 	/*
4542 	 * Basically, we multiply size of Cartesian product by selectivity.
4543 	 *
4544 	 * If we are doing an outer join, take that into account: the joinqual
4545 	 * selectivity has to be clamped using the knowledge that the output must
4546 	 * be at least as large as the non-nullable input.  However, any
4547 	 * pushed-down quals are applied after the outer join, so their
4548 	 * selectivity applies fully.
4549 	 *
4550 	 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
4551 	 * of LHS rows that have matches, and we apply that straightforwardly.
4552 	 */
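	/*
	 * Worked example (hypothetical numbers): for a LEFT join with
	 * outer_rows = 1000, inner_rows = 200, fkselec = 1.0 and jselec = 0.001,
	 * the raw product is 1000 * 200 * 0.001 = 200, which is clamped up to
	 * the 1000 outer rows; a pushed-down qual with pselec = 0.5 then
	 * reduces that to 500.
	 */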
4553 	switch (jointype)
4554 	{
4555 		case JOIN_INNER:
4556 			nrows = outer_rows * inner_rows * fkselec * jselec;
4557 			/* pselec not used */
4558 			break;
4559 		case JOIN_LEFT:
4560 			nrows = outer_rows * inner_rows * fkselec * jselec;
4561 			if (nrows < outer_rows)
4562 				nrows = outer_rows;
4563 			nrows *= pselec;
4564 			break;
4565 		case JOIN_FULL:
4566 			nrows = outer_rows * inner_rows * fkselec * jselec;
4567 			if (nrows < outer_rows)
4568 				nrows = outer_rows;
4569 			if (nrows < inner_rows)
4570 				nrows = inner_rows;
4571 			nrows *= pselec;
4572 			break;
4573 		case JOIN_SEMI:
4574 			nrows = outer_rows * fkselec * jselec;
4575 			/* pselec not used */
4576 			break;
4577 		case JOIN_ANTI:
4578 			nrows = outer_rows * (1.0 - fkselec * jselec);
4579 			nrows *= pselec;
4580 			break;
4581 		default:
4582 			/* other values not expected here */
4583 			elog(ERROR, "unrecognized join type: %d", (int) jointype);
4584 			nrows = 0;			/* keep compiler quiet */
4585 			break;
4586 	}
4587 
4588 	return clamp_row_est(nrows);
4589 }
4590 
4591 /*
4592  * get_foreign_key_join_selectivity
4593  *		Estimate join selectivity for foreign-key-related clauses.
4594  *
4595  * Remove any clauses that can be matched to FK constraints from *restrictlist,
4596  * and return a substitute estimate of their selectivity.  1.0 is returned
4597  * when there are no such clauses.
4598  *
4599  * The reason for treating such clauses specially is that we can get better
4600  * estimates this way than by relying on clauselist_selectivity(), especially
4601  * for multi-column FKs where that function's assumption that the clauses are
4602  * independent falls down badly.  But even with single-column FKs, we may be
4603  * able to get a better answer when the pg_statistic stats are missing or out
4604  * of date.
4605  */
4606 static Selectivity
4607 get_foreign_key_join_selectivity(PlannerInfo *root,
4608 								 Relids outer_relids,
4609 								 Relids inner_relids,
4610 								 SpecialJoinInfo *sjinfo,
4611 								 List **restrictlist)
4612 {
4613 	Selectivity fkselec = 1.0;
4614 	JoinType	jointype = sjinfo->jointype;
4615 	List	   *worklist = *restrictlist;
4616 	ListCell   *lc;
4617 
4618 	/* Consider each FK constraint that is known to match the query */
4619 	foreach(lc, root->fkey_list)
4620 	{
4621 		ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
4622 		bool		ref_is_outer;
4623 		List	   *removedlist;
4624 		ListCell   *cell;
4625 		ListCell   *prev;
4626 		ListCell   *next;
4627 
4628 		/*
4629 		 * This FK is not relevant unless it connects a baserel on one side of
4630 		 * this join to a baserel on the other side.
4631 		 */
4632 		if (bms_is_member(fkinfo->con_relid, outer_relids) &&
4633 			bms_is_member(fkinfo->ref_relid, inner_relids))
4634 			ref_is_outer = false;
4635 		else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
4636 				 bms_is_member(fkinfo->con_relid, inner_relids))
4637 			ref_is_outer = true;
4638 		else
4639 			continue;
4640 
4641 		/*
4642 		 * If we're dealing with a semi/anti join, and the FK's referenced
4643 		 * relation is on the outside, then knowledge of the FK doesn't help
4644 		 * us figure out what we need to know (which is the fraction of outer
4645 		 * rows that have matches).  On the other hand, if the referenced rel
4646 		 * is on the inside, then all outer rows must have matches in the
4647 		 * referenced table (ignoring nulls).  But any restriction or join
4648 		 * clauses that filter that table will reduce the fraction of matches.
4649 		 * We can account for restriction clauses, but it's too hard to guess
4650 		 * how many table rows would get through a join that's inside the RHS.
4651 		 * Hence, if either case applies, punt and ignore the FK.
4652 		 */
4653 		if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
4654 			(ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
4655 			continue;
4656 
4657 		/*
4658 		 * Modify the restrictlist by removing clauses that match the FK (and
4659 		 * putting them into removedlist instead).  It seems unsafe to modify
4660 		 * the originally-passed List structure, so we make a shallow copy the
4661 		 * first time through.
4662 		 */
4663 		if (worklist == *restrictlist)
4664 			worklist = list_copy(worklist);
4665 
4666 		removedlist = NIL;
4667 		prev = NULL;
4668 		for (cell = list_head(worklist); cell; cell = next)
4669 		{
4670 			RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
4671 			bool		remove_it = false;
4672 			int			i;
4673 
4674 			next = lnext(cell);
4675 			/* Drop this clause if it matches any column of the FK */
4676 			for (i = 0; i < fkinfo->nkeys; i++)
4677 			{
4678 				if (rinfo->parent_ec)
4679 				{
4680 					/*
4681 					 * EC-derived clauses can only match by EC.  It is okay to
4682 					 * consider any clause derived from the same EC as
4683 					 * matching the FK: even if equivclass.c chose to generate
4684 					 * a clause equating some other pair of Vars, it could
4685 					 * have generated one equating the FK's Vars.  So for
4686 					 * purposes of estimation, we can act as though it did so.
4687 					 *
4688 					 * Note: checking parent_ec is a bit of a cheat because
4689 					 * there are EC-derived clauses that don't have parent_ec
4690 					 * set; but such clauses must compare expressions that
4691 					 * aren't just Vars, so they cannot match the FK anyway.
4692 					 */
4693 					if (fkinfo->eclass[i] == rinfo->parent_ec)
4694 					{
4695 						remove_it = true;
4696 						break;
4697 					}
4698 				}
4699 				else
4700 				{
4701 					/*
4702 					 * Otherwise, see if rinfo was previously matched to FK as
4703 					 * a "loose" clause.
4704 					 */
4705 					if (list_member_ptr(fkinfo->rinfos[i], rinfo))
4706 					{
4707 						remove_it = true;
4708 						break;
4709 					}
4710 				}
4711 			}
4712 			if (remove_it)
4713 			{
4714 				worklist = list_delete_cell(worklist, cell, prev);
4715 				removedlist = lappend(removedlist, rinfo);
4716 			}
4717 			else
4718 				prev = cell;
4719 		}
4720 
4721 		/*
4722 		 * If we failed to remove all the matching clauses we expected to
4723 		 * find, chicken out and ignore this FK; applying its selectivity
4724 		 * might result in double-counting.  Put any clauses we did manage to
4725 		 * remove back into the worklist.
4726 		 *
4727 		 * Since the matching clauses are known not outerjoin-delayed, they
4728 		 * should certainly have appeared in the initial joinclause list.  If
4729 		 * we didn't find them, they must have been matched to, and removed
4730 		 * by, some other FK in a previous iteration of this loop.  (A likely
4731 		 * case is that two FKs are matched to the same EC; there will be only
4732 		 * one EC-derived clause in the initial list, so the first FK will
4733 		 * consume it.)  Applying both FKs' selectivity independently risks
4734 		 * underestimating the join size; in particular, this would undo one
4735 		 * of the main things that ECs were invented for, namely to avoid
4736 		 * double-counting the selectivity of redundant equality conditions.
4737 		 * Later we might think of a reasonable way to combine the estimates,
4738 		 * but for now, just punt, since this is a fairly uncommon situation.
4739 		 */
4740 		if (list_length(removedlist) !=
4741 			(fkinfo->nmatched_ec + fkinfo->nmatched_ri))
4742 		{
4743 			worklist = list_concat(worklist, removedlist);
4744 			continue;
4745 		}
4746 
4747 		/*
4748 		 * Finally we get to the payoff: estimate selectivity using the
4749 		 * knowledge that each referencing row will match exactly one row in
4750 		 * the referenced table.
4751 		 *
4752 		 * XXX that's not true in the presence of nulls in the referencing
4753 		 * column(s), so in principle we should derate the estimate for those.
4754 		 * However (1) if there are any strict restriction clauses for the
4755 		 * referencing column(s) elsewhere in the query, derating here would
4756 		 * be double-counting the null fraction, and (2) it's not very clear
4757 		 * how to combine null fractions for multiple referencing columns. So
4758 		 * we do nothing for now about correcting for nulls.
4759 		 *
4760 		 * XXX another point here is that if either side of an FK constraint
4761 		 * is an inheritance parent, we estimate as though the constraint
4762 		 * covers all its children as well.  This is not an unreasonable
4763 		 * assumption for a referencing table, ie the user probably applied
4764 		 * identical constraints to all child tables (though perhaps we ought
4765 		 * to check that).  But it's not possible to have done that for a
4766 		 * referenced table.  Fortunately, precisely because that doesn't
4767 		 * work, it is uncommon in practice to have an FK referencing a parent
4768 		 * table.  So, at least for now, disregard inheritance here.
4769 		 */
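		/*
		 * Illustration (hypothetical numbers): an inner join to a referenced
		 * table of 1,000,000 raw tuples contributes fkselec = 1/1000000 for
		 * the matched clause(s); for a semi-join whose inner side is that
		 * referenced table with restriction clauses passing 5% of it,
		 * fkselec becomes 0.05 instead.
		 */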
4770 		if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
4771 		{
4772 			/*
4773 			 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
4774 			 * referenced table is exactly the inside of the join.  The join
4775 			 * selectivity is defined as the fraction of LHS rows that have
4776 			 * matches.  The FK implies that every LHS row has a match *in the
4777 			 * referenced table*; but any restriction clauses on it will
4778 			 * reduce the number of matches.  Hence we take the join
4779 			 * selectivity as equal to the selectivity of the table's
4780 			 * restriction clauses, which is rows / tuples; but we must guard
4781 			 * against tuples == 0.
4782 			 */
4783 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4784 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
4785 
4786 			fkselec *= ref_rel->rows / ref_tuples;
4787 		}
4788 		else
4789 		{
4790 			/*
4791 			 * Otherwise, selectivity is exactly 1/referenced-table-size; but
4792 			 * guard against tuples == 0.  Note we should use the raw table
4793 			 * tuple count, not any estimate of its filtered or joined size.
4794 			 */
4795 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
4796 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
4797 
4798 			fkselec *= 1.0 / ref_tuples;
4799 		}
4800 	}
4801 
4802 	*restrictlist = worklist;
4803 	return fkselec;
4804 }
4805 
4806 /*
4807  * set_subquery_size_estimates
4808  *		Set the size estimates for a base relation that is a subquery.
4809  *
4810  * The rel's targetlist and restrictinfo list must have been constructed
4811  * already, and the Paths for the subquery must have been completed.
4812  * We look at the subquery's PlannerInfo to extract data.
4813  *
4814  * We set the same fields as set_baserel_size_estimates.
4815  */
4816 void
4817 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4818 {
4819 	PlannerInfo *subroot = rel->subroot;
4820 	RelOptInfo *sub_final_rel;
4821 	ListCell   *lc;
4822 
4823 	/* Should only be applied to base relations that are subqueries */
4824 	Assert(rel->relid > 0);
4825 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
4826 
4827 	/*
4828 	 * Copy raw number of output rows from subquery.  All of its paths should
4829 	 * have the same output rowcount, so just look at cheapest-total.
4830 	 */
4831 	sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
4832 	rel->tuples = sub_final_rel->cheapest_total_path->rows;
4833 
4834 	/*
4835 	 * Compute per-output-column width estimates by examining the subquery's
4836 	 * targetlist.  For any output that is a plain Var, get the width estimate
4837 	 * that was made while planning the subquery.  Otherwise, we leave it to
4838 	 * set_rel_width to fill in a datatype-based default estimate.
4839 	 */
4840 	foreach(lc, subroot->parse->targetList)
4841 	{
4842 		TargetEntry *te = lfirst_node(TargetEntry, lc);
4843 		Node	   *texpr = (Node *) te->expr;
4844 		int32		item_width = 0;
4845 
4846 		/* junk columns aren't visible to upper query */
4847 		if (te->resjunk)
4848 			continue;
4849 
4850 		/*
4851 		 * The subquery could be an expansion of a view that's had columns
4852 		 * added to it since the current query was parsed, so that there are
4853 		 * non-junk tlist columns in it that don't correspond to any column
4854 		 * visible at our query level.  Ignore such columns.
4855 		 */
4856 		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
4857 			continue;
4858 
4859 		/*
4860 		 * XXX This currently doesn't work for subqueries containing set
4861 		 * operations, because the Vars in their tlists are bogus references
4862 		 * to the first leaf subquery, which wouldn't give the right answer
4863 		 * even if we could still get to its PlannerInfo.
4864 		 *
4865 		 * Also, the subquery could be an appendrel for which all branches are
4866 		 * known empty due to constraint exclusion, in which case
4867 		 * set_append_rel_pathlist will have left the attr_widths set to zero.
4868 		 *
4869 		 * In either case, we just leave the width estimate zero until
4870 		 * set_rel_width fixes it.
4871 		 */
4872 		if (IsA(texpr, Var) &&
4873 			subroot->parse->setOperations == NULL)
4874 		{
4875 			Var		   *var = (Var *) texpr;
4876 			RelOptInfo *subrel = find_base_rel(subroot, var->varno);
4877 
4878 			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
4879 		}
4880 		rel->attr_widths[te->resno - rel->min_attr] = item_width;
4881 	}
4882 
4883 	/* Now estimate number of output rows, etc */
4884 	set_baserel_size_estimates(root, rel);
4885 }
4886 
4887 /*
4888  * set_function_size_estimates
4889  *		Set the size estimates for a base relation that is a function call.
4890  *
4891  * The rel's targetlist and restrictinfo list must have been constructed
4892  * already.
4893  *
4894  * We set the same fields as set_baserel_size_estimates.
4895  */
4896 void
4897 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4898 {
4899 	RangeTblEntry *rte;
4900 	ListCell   *lc;
4901 
4902 	/* Should only be applied to base relations that are functions */
4903 	Assert(rel->relid > 0);
4904 	rte = planner_rt_fetch(rel->relid, root);
4905 	Assert(rte->rtekind == RTE_FUNCTION);
4906 
4907 	/*
4908 	 * Estimate number of rows the functions will return. The rowcount of the
4909 	 * node is that of the largest function result.
4910 	 */
4911 	rel->tuples = 0;
4912 	foreach(lc, rte->functions)
4913 	{
4914 		RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
4915 		double		ntup = expression_returns_set_rows(rtfunc->funcexpr);
4916 
4917 		if (ntup > rel->tuples)
4918 			rel->tuples = ntup;
4919 	}
4920 
4921 	/* Now estimate number of output rows, etc */
4922 	set_baserel_size_estimates(root, rel);
4923 }
4924 
4925 /*
4926  * set_tablefunc_size_estimates
4927  *		Set the size estimates for a base relation that is a tablefunc reference.
4928  *
4929  * The rel's targetlist and restrictinfo list must have been constructed
4930  * already.
4931  *
4932  * We set the same fields as set_baserel_size_estimates.
4933  */
4934 void
4935 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4936 {
4937 	/* Should only be applied to base relations that are tablefunc references */
4938 	Assert(rel->relid > 0);
4939 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
4940 
4941 	rel->tuples = 100;
4942 
4943 	/* Now estimate number of output rows, etc */
4944 	set_baserel_size_estimates(root, rel);
4945 }
4946 
4947 /*
4948  * set_values_size_estimates
4949  *		Set the size estimates for a base relation that is a values list.
4950  *
4951  * The rel's targetlist and restrictinfo list must have been constructed
4952  * already.
4953  *
4954  * We set the same fields as set_baserel_size_estimates.
4955  */
4956 void
4957 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4958 {
4959 	RangeTblEntry *rte;
4960 
4961 	/* Should only be applied to base relations that are values lists */
4962 	Assert(rel->relid > 0);
4963 	rte = planner_rt_fetch(rel->relid, root);
4964 	Assert(rte->rtekind == RTE_VALUES);
4965 
4966 	/*
4967 	 * Estimate number of rows the values list will return. We know this
4968 	 * precisely based on the list length (well, barring set-returning
4969 	 * functions in list items, but that's a refinement not catered for
4970 	 * anywhere else either).
4971 	 */
4972 	rel->tuples = list_length(rte->values_lists);
4973 
4974 	/* Now estimate number of output rows, etc */
4975 	set_baserel_size_estimates(root, rel);
4976 }
4977 
4978 /*
4979  * set_cte_size_estimates
4980  *		Set the size estimates for a base relation that is a CTE reference.
4981  *
4982  * The rel's targetlist and restrictinfo list must have been constructed
4983  * already, and we need an estimate of the number of rows returned by the CTE
4984  * (if a regular CTE) or the non-recursive term (if a self-reference).
4985  *
4986  * We set the same fields as set_baserel_size_estimates.
4987  */
4988 void
4989 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
4990 {
4991 	RangeTblEntry *rte;
4992 
4993 	/* Should only be applied to base relations that are CTE references */
4994 	Assert(rel->relid > 0);
4995 	rte = planner_rt_fetch(rel->relid, root);
4996 	Assert(rte->rtekind == RTE_CTE);
4997 
4998 	if (rte->self_reference)
4999 	{
5000 		/*
5001 		 * In a self-reference, arbitrarily assume the average worktable size
5002 		 * is about 10 times the nonrecursive term's size.
5003 		 */
5004 		rel->tuples = 10 * cte_rows;
5005 	}
5006 	else
5007 	{
5008 		/* Otherwise just believe the CTE's rowcount estimate */
5009 		rel->tuples = cte_rows;
5010 	}
5011 
5012 	/* Now estimate number of output rows, etc */
5013 	set_baserel_size_estimates(root, rel);
5014 }
5015 
5016 /*
5017  * set_namedtuplestore_size_estimates
5018  *		Set the size estimates for a base relation that is a tuplestore reference.
5019  *
5020  * The rel's targetlist and restrictinfo list must have been constructed
5021  * already.
5022  *
5023  * We set the same fields as set_baserel_size_estimates.
5024  */
5025 void
5026 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5027 {
5028 	RangeTblEntry *rte;
5029 
5030 	/* Should only be applied to base relations that are tuplestore references */
5031 	Assert(rel->relid > 0);
5032 	rte = planner_rt_fetch(rel->relid, root);
5033 	Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5034 
5035 	/*
5036 	 * Use the estimate provided by the code which is generating the named
5037 	 * tuplestore.  In some cases, the actual number might be available; in
5038 	 * others the same plan will be re-used, so a "typical" value might be
5039 	 * estimated and used.
5040 	 */
5041 	rel->tuples = rte->enrtuples;
5042 	if (rel->tuples < 0)
5043 		rel->tuples = 1000;
5044 
5045 	/* Now estimate number of output rows, etc */
5046 	set_baserel_size_estimates(root, rel);
5047 }
5048 
5049 /*
5050  * set_foreign_size_estimates
5051  *		Set the size estimates for a base relation that is a foreign table.
5052  *
5053  * There is not a whole lot that we can do here; the foreign-data wrapper
5054  * is responsible for producing useful estimates.  We can do a decent job
5055  * of estimating baserestrictcost, so we set that, and we also set up width
5056  * using what will be purely datatype-driven estimates from the targetlist.
5057  * There is no way to do anything sane with the rows value, so we just put
5058  * a default estimate and hope that the wrapper can improve on it.  The
5059  * wrapper's GetForeignRelSize function will be called momentarily.
5060  *
5061  * The rel's targetlist and restrictinfo list must have been constructed
5062  * already.
5063  */
5064 void
5065 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5066 {
5067 	/* Should only be applied to base relations */
5068 	Assert(rel->relid > 0);
5069 
5070 	rel->rows = 1000;			/* entirely bogus default estimate */
5071 
5072 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5073 
5074 	set_rel_width(root, rel);
5075 }
5076 
5077 
5078 /*
5079  * set_rel_width
5080  *		Set the estimated output width of a base relation.
5081  *
5082  * The estimated output width is the sum of the per-attribute width estimates
5083  * for the actually-referenced columns, plus any PHVs or other expressions
5084  * that have to be calculated at this relation.  This is the amount of data
5085  * we'd need to pass upwards in case of a sort, hash, etc.
5086  *
5087  * This function also sets reltarget->cost, so it's a bit misnamed now.
5088  *
5089  * NB: this works best on plain relations because it prefers to look at
5090  * real Vars.  For subqueries, set_subquery_size_estimates will already have
5091  * copied up whatever per-column estimates were made within the subquery,
5092  * and for other types of rels there isn't much we can do anyway.  We fall
5093  * back on (fairly stupid) datatype-based width estimates if we can't get
5094  * any better number.
5095  *
5096  * The per-attribute width estimates are cached for possible re-use while
5097  * building join relations or post-scan/join pathtargets.
5098  */
5099 static void
5100 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5101 {
5102 	Oid			reloid = planner_rt_fetch(rel->relid, root)->relid;
5103 	int32		tuple_width = 0;
5104 	bool		have_wholerow_var = false;
5105 	ListCell   *lc;
5106 
5107 	/* Vars are assumed to have cost zero, but other exprs do not */
5108 	rel->reltarget->cost.startup = 0;
5109 	rel->reltarget->cost.per_tuple = 0;
5110 
5111 	foreach(lc, rel->reltarget->exprs)
5112 	{
5113 		Node	   *node = (Node *) lfirst(lc);
5114 
5115 		/*
5116 		 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5117 		 * but there are corner cases involving LATERAL references where that
5118 		 * isn't so.  If the Var has the wrong varno, fall through to the
5119 		 * generic case (it doesn't seem worth the trouble to be any smarter).
5120 		 */
5121 		if (IsA(node, Var) &&
5122 			((Var *) node)->varno == rel->relid)
5123 		{
5124 			Var		   *var = (Var *) node;
5125 			int			ndx;
5126 			int32		item_width;
5127 
5128 			Assert(var->varattno >= rel->min_attr);
5129 			Assert(var->varattno <= rel->max_attr);
5130 
5131 			ndx = var->varattno - rel->min_attr;
5132 
5133 			/*
5134 			 * If it's a whole-row Var, we'll deal with it below after we have
5135 			 * already cached as many attr widths as possible.
5136 			 */
5137 			if (var->varattno == 0)
5138 			{
5139 				have_wholerow_var = true;
5140 				continue;
5141 			}
5142 
5143 			/*
5144 			 * The width may have been cached already (especially if it's a
5145 			 * subquery), so don't duplicate effort.
5146 			 */
5147 			if (rel->attr_widths[ndx] > 0)
5148 			{
5149 				tuple_width += rel->attr_widths[ndx];
5150 				continue;
5151 			}
5152 
5153 			/* Try to get column width from statistics */
5154 			if (reloid != InvalidOid && var->varattno > 0)
5155 			{
5156 				item_width = get_attavgwidth(reloid, var->varattno);
5157 				if (item_width > 0)
5158 				{
5159 					rel->attr_widths[ndx] = item_width;
5160 					tuple_width += item_width;
5161 					continue;
5162 				}
5163 			}
5164 
5165 			/*
5166 			 * Not a plain relation, or can't find statistics for it. Estimate
5167 			 * using just the type info.
5168 			 */
5169 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5170 			Assert(item_width > 0);
5171 			rel->attr_widths[ndx] = item_width;
5172 			tuple_width += item_width;
5173 		}
5174 		else if (IsA(node, PlaceHolderVar))
5175 		{
5176 			/*
5177 			 * We will need to evaluate the PHV's contained expression while
5178 			 * scanning this rel, so be sure to include it in reltarget->cost.
5179 			 */
5180 			PlaceHolderVar *phv = (PlaceHolderVar *) node;
5181 			PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5182 			QualCost	cost;
5183 
5184 			tuple_width += phinfo->ph_width;
5185 			cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5186 			rel->reltarget->cost.startup += cost.startup;
5187 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5188 		}
5189 		else
5190 		{
5191 			/*
5192 			 * We could be looking at an expression pulled up from a subquery,
5193 			 * or a ROW() representing a whole-row child Var, etc.  Do what we
5194 			 * can using the expression type information.
5195 			 */
5196 			int32		item_width;
5197 			QualCost	cost;
5198 
5199 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5200 			Assert(item_width > 0);
5201 			tuple_width += item_width;
5202 			/* Not entirely clear if we need to account for cost, but do so */
5203 			cost_qual_eval_node(&cost, node, root);
5204 			rel->reltarget->cost.startup += cost.startup;
5205 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5206 		}
5207 	}
5208 
5209 	/*
5210 	 * If we have a whole-row reference, estimate its width as the sum of
5211 	 * per-column widths plus heap tuple header overhead.
5212 	 */
5213 	if (have_wholerow_var)
5214 	{
5215 		int32		wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5216 
5217 		if (reloid != InvalidOid)
5218 		{
5219 			/* Real relation, so estimate true tuple width */
5220 			wholerow_width += get_relation_data_width(reloid,
5221 													  rel->attr_widths - rel->min_attr);
5222 		}
5223 		else
5224 		{
5225 			/* Do what we can with info for a phony rel */
5226 			AttrNumber	i;
5227 
5228 			for (i = 1; i <= rel->max_attr; i++)
5229 				wholerow_width += rel->attr_widths[i - rel->min_attr];
5230 		}
5231 
5232 		rel->attr_widths[0 - rel->min_attr] = wholerow_width;
5233 
5234 		/*
5235 		 * Include the whole-row Var as part of the output tuple.  Yes, that
5236 		 * really is what happens at runtime.
5237 		 */
5238 		tuple_width += wholerow_width;
5239 	}
5240 
5241 	Assert(tuple_width >= 0);
5242 	rel->reltarget->width = tuple_width;
5243 }
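/*
 * For instance (hypothetical rel): referenced columns with cached widths of
 * 4, 8 and 20 bytes plus a PlaceHolderVar of width 16 give
 * reltarget->width = 48, with the PHV's expression evaluation cost folded
 * into reltarget->cost.
 */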
5244 
5245 /*
5246  * set_pathtarget_cost_width
5247  *		Set the estimated eval cost and output width of a PathTarget tlist.
5248  *
5249  * As a notational convenience, returns the same PathTarget pointer passed in.
5250  *
5251  * Most, though not quite all, uses of this function occur after we've run
5252  * set_rel_width() for base relations; so we can usually obtain cached width
5253  * estimates for Vars.  If we can't, fall back on datatype-based width
5254  * estimates.  Present early-planning uses of PathTargets don't need accurate
5255  * widths badly enough to justify going to the catalogs for better data.
5256  */
5257 PathTarget *
5258 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5259 {
5260 	int32		tuple_width = 0;
5261 	ListCell   *lc;
5262 
5263 	/* Vars are assumed to have cost zero, but other exprs do not */
5264 	target->cost.startup = 0;
5265 	target->cost.per_tuple = 0;
5266 
5267 	foreach(lc, target->exprs)
5268 	{
5269 		Node	   *node = (Node *) lfirst(lc);
5270 
5271 		if (IsA(node, Var))
5272 		{
5273 			Var		   *var = (Var *) node;
5274 			int32		item_width;
5275 
5276 			/* We should not see any upper-level Vars here */
5277 			Assert(var->varlevelsup == 0);
5278 
5279 			/* Try to get data from RelOptInfo cache */
5280 			if (var->varno < root->simple_rel_array_size)
5281 			{
5282 				RelOptInfo *rel = root->simple_rel_array[var->varno];
5283 
5284 				if (rel != NULL &&
5285 					var->varattno >= rel->min_attr &&
5286 					var->varattno <= rel->max_attr)
5287 				{
5288 					int			ndx = var->varattno - rel->min_attr;
5289 
5290 					if (rel->attr_widths[ndx] > 0)
5291 					{
5292 						tuple_width += rel->attr_widths[ndx];
5293 						continue;
5294 					}
5295 				}
5296 			}
5297 
5298 			/*
5299 			 * No cached data available, so estimate using just the type info.
5300 			 */
5301 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5302 			Assert(item_width > 0);
5303 			tuple_width += item_width;
5304 		}
5305 		else
5306 		{
5307 			/*
5308 			 * Handle general expressions using type info.
5309 			 */
5310 			int32		item_width;
5311 			QualCost	cost;
5312 
5313 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5314 			Assert(item_width > 0);
5315 			tuple_width += item_width;
5316 
5317 			/* Account for cost, too */
5318 			cost_qual_eval_node(&cost, node, root);
5319 			target->cost.startup += cost.startup;
5320 			target->cost.per_tuple += cost.per_tuple;
5321 		}
5322 	}
5323 
5324 	Assert(tuple_width >= 0);
5325 	target->width = tuple_width;
5326 
5327 	return target;
5328 }
5329 
5330 /*
5331  * relation_byte_size
5332  *	  Estimate the storage space in bytes for a given number of tuples
5333  *	  of a given width (size in bytes).
5334  */
5335 static double
5336 relation_byte_size(double tuples, int width)
5337 {
5338 	return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5339 }
5340 
5341 /*
5342  * page_size
5343  *	  Returns an estimate of the number of pages covered by a given
5344  *	  number of tuples of a given width (size in bytes).
5345  */
5346 static double
5347 page_size(double tuples, int width)
5348 {
5349 	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5350 }
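/*
 * Example (assuming the default BLCKSZ of 8192, 8-byte MAXALIGN, and a
 * 23-byte heap tuple header that MAXALIGNs to 24): 1000000 tuples of width
 * 40 occupy about 1000000 * (40 + 24) = 64000000 bytes by
 * relation_byte_size(), i.e. ceil(64000000 / 8192) = 7813 pages by
 * page_size().
 */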
5351 
5352 /*
5353  * Estimate the fraction of the work that each worker will do given the
5354  * number of workers budgeted for the path.
5355  */
5356 static double
5357 get_parallel_divisor(Path *path)
5358 {
5359 	double		parallel_divisor = path->parallel_workers;
5360 
5361 	/*
5362 	 * Early experience with parallel query suggests that when there is only
5363 	 * one worker, the leader often makes a very substantial contribution to
5364 	 * executing the parallel portion of the plan, but as more workers are
5365 	 * added, it does less and less, because it's busy reading tuples from the
5366 	 * workers and doing whatever non-parallel post-processing is needed.  By
5367 	 * the time we reach 4 workers, the leader no longer makes a meaningful
5368 	 * contribution.  Thus, for now, estimate that the leader spends 30% of
5369 	 * its time servicing each worker, and the remainder executing the
5370 	 * parallel plan.
5371 	 */
5372 	if (parallel_leader_participation)
5373 	{
5374 		double		leader_contribution;
5375 
5376 		leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5377 		if (leader_contribution > 0)
5378 			parallel_divisor += leader_contribution;
5379 	}
5380 
5381 	return parallel_divisor;
5382 }
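/*
 * For example, with parallel_leader_participation enabled and 2 planned
 * workers the divisor is 2 + (1.0 - 0.3 * 2) = 2.4; at 4 or more workers
 * the leader term drops to zero or below and is ignored, so the divisor is
 * just the worker count.
 */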
5383 
5384 /*
5385  * compute_bitmap_pages
5386  *
5387  * compute number of pages fetched from heap in bitmap heap scan.
5388  */
5389 double
5390 compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
5391 					 int loop_count, Cost *cost, double *tuple)
5392 {
5393 	Cost		indexTotalCost;
5394 	Selectivity indexSelectivity;
5395 	double		T;
5396 	double		pages_fetched;
5397 	double		tuples_fetched;
5398 	double		heap_pages;
5399 	long		maxentries;
5400 
5401 	/*
5402 	 * Fetch total cost of obtaining the bitmap, as well as its total
5403 	 * selectivity.
5404 	 */
5405 	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
5406 
5407 	/*
5408 	 * Estimate number of main-table pages fetched.
5409 	 */
5410 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
5411 
5412 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
5413 
5414 	/*
5415 	 * For a single scan, the number of heap pages that need to be fetched is
5416 	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
5417 	 * re-reads needed).
5418 	 */
5419 	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
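	/*
	 * For example, T = 10000 heap pages and tuples_fetched = 1000 gives
	 * 2*10000*1000 / (2*10000 + 1000) = about 952 pages, reflecting that
	 * some fetched tuples are expected to share heap pages.
	 */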
5420 
5421 	/*
5422 	 * Calculate the number of pages fetched from the heap.  Then based on
5423 	 * current work_mem estimate get the estimated maxentries in the bitmap.
5424 	 * (Note that we always do this calculation based on the number of pages
5425 	 * that would be fetched in a single iteration, even if loop_count > 1.
5426 	 * That's correct, because only that number of entries will be stored in
5427 	 * the bitmap at one time.)
5428 	 */
5429 	heap_pages = Min(pages_fetched, baserel->pages);
5430 	maxentries = tbm_calculate_entries(work_mem * 1024L);
5431 
5432 	if (loop_count > 1)
5433 	{
5434 		/*
5435 		 * For repeated bitmap scans, scale up the number of tuples fetched in
5436 		 * the Mackert and Lohman formula by the number of scans, so that we
5437 		 * estimate the number of pages fetched by all the scans. Then
5438 		 * pro-rate for one scan.
5439 		 */
5440 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
5441 											baserel->pages,
5442 											get_indexpath_pages(bitmapqual),
5443 											root);
5444 		pages_fetched /= loop_count;
5445 	}
5446 
5447 	if (pages_fetched >= T)
5448 		pages_fetched = T;
5449 	else
5450 		pages_fetched = ceil(pages_fetched);
5451 
5452 	if (maxentries < heap_pages)
5453 	{
5454 		double		exact_pages;
5455 		double		lossy_pages;
5456 
5457 		/*
5458 		 * Crude approximation of the number of lossy pages.  Because of the
5459 		 * way tbm_lossify() is coded, the number of lossy pages increases
5460 		 * very sharply as soon as we run short of memory; this formula has
5461 		 * that property and seems to perform adequately in testing, but it's
5462 		 * possible we could do better somehow.
5463 		 */
5464 		lossy_pages = Max(0, heap_pages - maxentries / 2);
5465 		exact_pages = heap_pages - lossy_pages;
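		/*
		 * E.g. (made-up numbers) heap_pages = 1000 and maxentries = 600
		 * give lossy_pages = 1000 - 300 = 700 and exact_pages = 300.
		 */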
5466 
5467 		/*
5468 		 * If there are lossy pages then recompute the number of tuples
5469 		 * processed by the bitmap heap node.  We assume here that the chance
5470 		 * of a given tuple coming from an exact page is the same as the
5471 		 * chance that a given page is exact.  This might not be true, but
5472 		 * it's not clear how we can do any better.
5473 		 */
5474 		if (lossy_pages > 0)
5475 			tuples_fetched =
5476 				clamp_row_est(indexSelectivity *
5477 							  (exact_pages / heap_pages) * baserel->tuples +
5478 							  (lossy_pages / heap_pages) * baserel->tuples);
5479 	}
5480 
5481 	if (cost)
5482 		*cost = indexTotalCost;
5483 	if (tuple)
5484 		*tuple = tuples_fetched;
5485 
5486 	return pages_fetched;
5487 }
5488