1 /*-------------------------------------------------------------------------
2  *
3  * costsize.c
4  *	  Routines to compute (and set) relation sizes and path costs
5  *
6  * Path costs are measured in arbitrary units established by these basic
7  * parameters:
8  *
9  *	seq_page_cost		Cost of a sequential page fetch
10  *	random_page_cost	Cost of a non-sequential page fetch
11  *	cpu_tuple_cost		Cost of typical CPU time to process a tuple
12  *	cpu_index_tuple_cost  Cost of typical CPU time to process an index tuple
13  *	cpu_operator_cost	Cost of CPU time to execute an operator or function
14  *	parallel_tuple_cost Cost of CPU time to pass a tuple from worker to master backend
15  *	parallel_setup_cost Cost of setting up shared memory for parallelism
16  *
17  * We expect that the kernel will typically do some amount of read-ahead
18  * optimization; this in conjunction with seek costs means that seq_page_cost
19  * is normally considerably less than random_page_cost.  (However, if the
20  * database is fully cached in RAM, it is reasonable to set them equal.)
21  *
22  * We also use a rough estimate "effective_cache_size" of the number of
23  * disk pages in Postgres + OS-level disk cache.  (We can't simply use
24  * NBuffers for this purpose because that would ignore the effects of
25  * the kernel's disk cache.)
26  *
27  * Obviously, taking constants for these values is an oversimplification,
28  * but it's tough enough to get any useful estimates even at this level of
29  * detail.  Note that all of these parameters are user-settable, in case
30  * the default values are drastically off for a particular platform.
31  *
32  * seq_page_cost and random_page_cost can also be overridden for an individual
33  * tablespace, in case some data is on a fast disk and other data is on a slow
34  * disk.  Per-tablespace overrides never apply to temporary work files such as
35  * an external sort or a materialize node that overflows work_mem.
36  *
37  * We compute two separate costs for each path:
38  *		total_cost: total estimated cost to fetch all tuples
39  *		startup_cost: cost that is expended before first tuple is fetched
40  * In some scenarios, such as when there is a LIMIT or we are implementing
41  * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the
42  * path's result.  A caller can estimate the cost of fetching a partial
43  * result by interpolating between startup_cost and total_cost.  In detail:
44  *		actual_cost = startup_cost +
45  *			(total_cost - startup_cost) * tuples_to_fetch / path->rows;
46  * Note that a base relation's row count (and, by extension, the plan_rows of
47  * plan nodes below the LIMIT node) is set without regard to any LIMIT, so
48  * that this equation works properly.  (Note: while path->rows is never zero
49  * for ordinary relations, it is zero for paths for provably-empty relations,
50  * so beware of division-by-zero.)	The LIMIT is applied as a top-level
51  * plan node.
52  *
53  * For largely historical reasons, most of the routines in this module use
54  * the passed result Path only to store their results (rows, startup_cost and
55  * total_cost) into.  All the input data they need is passed as separate
56  * parameters, even though much of it could be extracted from the Path.
57  * An exception is made for the cost_XXXjoin() routines, which expect all
58  * the other fields of the passed XXXPath to be filled in, and similarly
59  * cost_index() assumes the passed IndexPath is valid except for its output
60  * values.
61  *
62  *
63  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
64  * Portions Copyright (c) 1994, Regents of the University of California
65  *
66  * IDENTIFICATION
67  *	  src/backend/optimizer/path/costsize.c
68  *
69  *-------------------------------------------------------------------------
70  */
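
/*
 * A minimal illustrative sketch of the partial-fetch interpolation described
 * above, kept under #if 0 so it is not compiled into the module.  The helper
 * name and the clamping of tuples_to_fetch are assumptions for illustration
 * only; plain doubles stand in for Cost since no headers are included yet.
 */
#if 0
static double
interpolate_partial_fetch_cost(double startup_cost, double total_cost,
							   double tuples_to_fetch, double path_rows)
{
	/* path->rows can be zero only for provably-empty relations */
	if (path_rows <= 0)
		return startup_cost;
	/* fetching all (or more than all) rows costs the full total_cost */
	if (tuples_to_fetch > path_rows)
		tuples_to_fetch = path_rows;
	return startup_cost +
		(total_cost - startup_cost) * tuples_to_fetch / path_rows;
}
#endif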
71 
72 #include "postgres.h"
73 
74 #include <math.h>
75 
76 #include "access/amapi.h"
77 #include "access/htup_details.h"
78 #include "access/tsmapi.h"
79 #include "executor/executor.h"
80 #include "executor/nodeAgg.h"
81 #include "executor/nodeHash.h"
82 #include "miscadmin.h"
83 #include "nodes/makefuncs.h"
84 #include "nodes/nodeFuncs.h"
85 #include "optimizer/clauses.h"
86 #include "optimizer/cost.h"
87 #include "optimizer/optimizer.h"
88 #include "optimizer/pathnode.h"
89 #include "optimizer/paths.h"
90 #include "optimizer/placeholder.h"
91 #include "optimizer/plancat.h"
92 #include "optimizer/planmain.h"
93 #include "optimizer/restrictinfo.h"
94 #include "parser/parsetree.h"
95 #include "utils/lsyscache.h"
96 #include "utils/selfuncs.h"
97 #include "utils/spccache.h"
98 #include "utils/tuplesort.h"
99 
100 
101 /* source-code-compatibility hacks for pull_varnos() API change */
102 #define pull_varnos(a,b) pull_varnos_new(a,b)
103 
104 #define LOG2(x)  (log(x) / 0.693147180559945)
105 
106 /*
107  * Append and MergeAppend nodes are less expensive than some other operations
108  * which use cpu_tuple_cost; instead of adding a separate GUC, estimate the
109  * per-tuple cost as cpu_tuple_cost multiplied by this value.
110  */
111 #define APPEND_CPU_COST_MULTIPLIER 0.5
112 
113 
114 double		seq_page_cost = DEFAULT_SEQ_PAGE_COST;
115 double		random_page_cost = DEFAULT_RANDOM_PAGE_COST;
116 double		cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
117 double		cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
118 double		cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
119 double		parallel_tuple_cost = DEFAULT_PARALLEL_TUPLE_COST;
120 double		parallel_setup_cost = DEFAULT_PARALLEL_SETUP_COST;
121 
122 int			effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
123 
124 Cost		disable_cost = 1.0e10;
125 
126 int			max_parallel_workers_per_gather = 2;
127 
128 bool		enable_seqscan = true;
129 bool		enable_indexscan = true;
130 bool		enable_indexonlyscan = true;
131 bool		enable_bitmapscan = true;
132 bool		enable_tidscan = true;
133 bool		enable_sort = true;
134 bool		enable_incremental_sort = true;
135 bool		enable_hashagg = true;
136 bool		enable_nestloop = true;
137 bool		enable_material = true;
138 bool		enable_mergejoin = true;
139 bool		enable_hashjoin = true;
140 bool		enable_gathermerge = true;
141 bool		enable_partitionwise_join = false;
142 bool		enable_partitionwise_aggregate = false;
143 bool		enable_parallel_append = true;
144 bool		enable_parallel_hash = true;
145 bool		enable_partition_pruning = true;
146 
147 typedef struct
148 {
149 	PlannerInfo *root;
150 	QualCost	total;
151 } cost_qual_eval_context;
152 
153 static List *extract_nonindex_conditions(List *qual_clauses, List *indexclauses);
154 static MergeScanSelCache *cached_scansel(PlannerInfo *root,
155 										 RestrictInfo *rinfo,
156 										 PathKey *pathkey);
157 static void cost_rescan(PlannerInfo *root, Path *path,
158 						Cost *rescan_startup_cost, Cost *rescan_total_cost);
159 static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
160 static void get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
161 									  ParamPathInfo *param_info,
162 									  QualCost *qpqual_cost);
163 static bool has_indexed_join_quals(NestPath *joinpath);
164 static double approx_tuple_count(PlannerInfo *root, JoinPath *path,
165 								 List *quals);
166 static double calc_joinrel_size_estimate(PlannerInfo *root,
167 										 RelOptInfo *joinrel,
168 										 RelOptInfo *outer_rel,
169 										 RelOptInfo *inner_rel,
170 										 double outer_rows,
171 										 double inner_rows,
172 										 SpecialJoinInfo *sjinfo,
173 										 List *restrictlist);
174 static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
175 													Relids outer_relids,
176 													Relids inner_relids,
177 													SpecialJoinInfo *sjinfo,
178 													List **restrictlist);
179 static Cost append_nonpartial_cost(List *subpaths, int numpaths,
180 								   int parallel_workers);
181 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
182 static double relation_byte_size(double tuples, int width);
183 static double page_size(double tuples, int width);
184 static double get_parallel_divisor(Path *path);
185 
186 
187 /*
188  * clamp_row_est
189  *		Force a row-count estimate to a sane value.
190  */
191 double
192 clamp_row_est(double nrows)
193 {
194 	/*
195 	 * Force estimate to be at least one row, to make explain output look
196 	 * better and to avoid possible divide-by-zero when interpolating costs.
197 	 * Make it an integer, too.
198 	 */
199 	if (nrows <= 1.0)
200 		nrows = 1.0;
201 	else
202 		nrows = rint(nrows);
203 
204 	return nrows;
205 }
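
/*
 * Illustrative usage of clamp_row_est(), kept under #if 0 so it is not
 * compiled: estimates at or below one row are forced to 1.0, and larger
 * estimates are rounded to the nearest integer by rint().
 */
#if 0
static void
clamp_row_est_examples(void)
{
	Assert(clamp_row_est(0.0) == 1.0);	/* never below one row */
	Assert(clamp_row_est(0.3) == 1.0);
	Assert(clamp_row_est(12.4) == 12.0);	/* rint() rounds to nearest */
	Assert(clamp_row_est(12.6) == 13.0);
}
#endif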
206 
207 
208 /*
209  * cost_seqscan
210  *	  Determines and returns the cost of scanning a relation sequentially.
211  *
212  * 'baserel' is the relation to be scanned
213  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
214  */
215 void
216 cost_seqscan(Path *path, PlannerInfo *root,
217 			 RelOptInfo *baserel, ParamPathInfo *param_info)
218 {
219 	Cost		startup_cost = 0;
220 	Cost		cpu_run_cost;
221 	Cost		disk_run_cost;
222 	double		spc_seq_page_cost;
223 	QualCost	qpqual_cost;
224 	Cost		cpu_per_tuple;
225 
226 	/* Should only be applied to base relations */
227 	Assert(baserel->relid > 0);
228 	Assert(baserel->rtekind == RTE_RELATION);
229 
230 	/* Mark the path with the correct row estimate */
231 	if (param_info)
232 		path->rows = param_info->ppi_rows;
233 	else
234 		path->rows = baserel->rows;
235 
236 	if (!enable_seqscan)
237 		startup_cost += disable_cost;
238 
239 	/* fetch estimated page cost for tablespace containing table */
240 	get_tablespace_page_costs(baserel->reltablespace,
241 							  NULL,
242 							  &spc_seq_page_cost);
243 
244 	/*
245 	 * disk costs
246 	 */
247 	disk_run_cost = spc_seq_page_cost * baserel->pages;
248 
249 	/* CPU costs */
250 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
251 
252 	startup_cost += qpqual_cost.startup;
253 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
254 	cpu_run_cost = cpu_per_tuple * baserel->tuples;
255 	/* tlist eval costs are paid per output row, not per tuple scanned */
256 	startup_cost += path->pathtarget->cost.startup;
257 	cpu_run_cost += path->pathtarget->cost.per_tuple * path->rows;
258 
259 	/* Adjust costing for parallelism, if used. */
260 	if (path->parallel_workers > 0)
261 	{
262 		double		parallel_divisor = get_parallel_divisor(path);
263 
264 		/* The CPU cost is divided among all the workers. */
265 		cpu_run_cost /= parallel_divisor;
266 
267 		/*
268 		 * It may be possible to amortize some of the I/O cost, but probably
269 		 * not very much, because most operating systems already do aggressive
270 		 * prefetching.  For now, we assume that the disk run cost can't be
271 		 * amortized at all.
272 		 */
273 
274 		/*
275 		 * In the case of a parallel plan, the row count needs to represent
276 		 * the number of tuples processed per worker.
277 		 */
278 		path->rows = clamp_row_est(path->rows / parallel_divisor);
279 	}
280 
281 	path->startup_cost = startup_cost;
282 	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
283 }
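
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: for the common
 * non-parallel, unparameterized case, the sequential-scan cost computed above
 * reduces to the formula below.  The parameter names are illustrative only,
 * and qual/tlist startup costs and per-output-row target costs are omitted.
 */
#if 0
static void
seqscan_cost_sketch(double pages, double tuples,
					double spc_seq_page_cost, double qual_cost_per_tuple,
					double *startup_cost, double *total_cost)
{
	double		disk_run_cost = spc_seq_page_cost * pages;
	double		cpu_run_cost = (cpu_tuple_cost + qual_cost_per_tuple) * tuples;

	*startup_cost = 0;			/* plus any qual and tlist startup costs */
	*total_cost = *startup_cost + disk_run_cost + cpu_run_cost;
}
#endif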
284 
285 /*
286  * cost_samplescan
287  *	  Determines and returns the cost of scanning a relation using sampling.
288  *
289  * 'baserel' is the relation to be scanned
290  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
291  */
292 void
293 cost_samplescan(Path *path, PlannerInfo *root,
294 				RelOptInfo *baserel, ParamPathInfo *param_info)
295 {
296 	Cost		startup_cost = 0;
297 	Cost		run_cost = 0;
298 	RangeTblEntry *rte;
299 	TableSampleClause *tsc;
300 	TsmRoutine *tsm;
301 	double		spc_seq_page_cost,
302 				spc_random_page_cost,
303 				spc_page_cost;
304 	QualCost	qpqual_cost;
305 	Cost		cpu_per_tuple;
306 
307 	/* Should only be applied to base relations with tablesample clauses */
308 	Assert(baserel->relid > 0);
309 	rte = planner_rt_fetch(baserel->relid, root);
310 	Assert(rte->rtekind == RTE_RELATION);
311 	tsc = rte->tablesample;
312 	Assert(tsc != NULL);
313 	tsm = GetTsmRoutine(tsc->tsmhandler);
314 
315 	/* Mark the path with the correct row estimate */
316 	if (param_info)
317 		path->rows = param_info->ppi_rows;
318 	else
319 		path->rows = baserel->rows;
320 
321 	/* fetch estimated page cost for tablespace containing table */
322 	get_tablespace_page_costs(baserel->reltablespace,
323 							  &spc_random_page_cost,
324 							  &spc_seq_page_cost);
325 
326 	/* if NextSampleBlock is used, assume random access, else sequential */
327 	spc_page_cost = (tsm->NextSampleBlock != NULL) ?
328 		spc_random_page_cost : spc_seq_page_cost;
329 
330 	/*
331 	 * disk costs (recall that baserel->pages has already been set to the
332 	 * number of pages the sampling method will visit)
333 	 */
334 	run_cost += spc_page_cost * baserel->pages;
335 
336 	/*
337 	 * CPU costs (recall that baserel->tuples has already been set to the
338 	 * number of tuples the sampling method will select).  Note that we ignore
339 	 * execution cost of the TABLESAMPLE parameter expressions; they will be
340 	 * evaluated only once per scan, and in most usages they'll likely be
341 	 * simple constants anyway.  We also don't charge anything for the
342 	 * calculations the sampling method might do internally.
343 	 */
344 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
345 
346 	startup_cost += qpqual_cost.startup;
347 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
348 	run_cost += cpu_per_tuple * baserel->tuples;
349 	/* tlist eval costs are paid per output row, not per tuple scanned */
350 	startup_cost += path->pathtarget->cost.startup;
351 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
352 
353 	path->startup_cost = startup_cost;
354 	path->total_cost = startup_cost + run_cost;
355 }
356 
357 /*
358  * cost_gather
359  *	  Determines and returns the cost of a gather path.
360  *
361  * 'rel' is the relation to be operated upon
362  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
363  * 'rows' may be used to point to a row estimate; if non-NULL, it overrides
364  * both 'rel' and 'param_info'.  This is useful when the path doesn't exactly
365  * correspond to any particular RelOptInfo.
366  */
367 void
368 cost_gather(GatherPath *path, PlannerInfo *root,
369 			RelOptInfo *rel, ParamPathInfo *param_info,
370 			double *rows)
371 {
372 	Cost		startup_cost = 0;
373 	Cost		run_cost = 0;
374 
375 	/* Mark the path with the correct row estimate */
376 	if (rows)
377 		path->path.rows = *rows;
378 	else if (param_info)
379 		path->path.rows = param_info->ppi_rows;
380 	else
381 		path->path.rows = rel->rows;
382 
383 	startup_cost = path->subpath->startup_cost;
384 
385 	run_cost = path->subpath->total_cost - path->subpath->startup_cost;
386 
387 	/* Parallel setup and communication cost. */
388 	startup_cost += parallel_setup_cost;
389 	run_cost += parallel_tuple_cost * path->path.rows;
390 
391 	path->path.startup_cost = startup_cost;
392 	path->path.total_cost = (startup_cost + run_cost);
393 }
394 
395 /*
396  * cost_gather_merge
397  *	  Determines and returns the cost of a gather merge path.
398  *
399  * GatherMerge merges several pre-sorted input streams, using a heap that at
400  * any given instant holds the next tuple from each stream. If there are N
401  * streams, we need about N*log2(N) tuple comparisons to construct the heap at
402  * startup, and then for each output tuple, about log2(N) comparisons to
403  * replace the top heap entry with the next tuple from the same stream.
404  */
405 void
406 cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
407 				  RelOptInfo *rel, ParamPathInfo *param_info,
408 				  Cost input_startup_cost, Cost input_total_cost,
409 				  double *rows)
410 {
411 	Cost		startup_cost = 0;
412 	Cost		run_cost = 0;
413 	Cost		comparison_cost;
414 	double		N;
415 	double		logN;
416 
417 	/* Mark the path with the correct row estimate */
418 	if (rows)
419 		path->path.rows = *rows;
420 	else if (param_info)
421 		path->path.rows = param_info->ppi_rows;
422 	else
423 		path->path.rows = rel->rows;
424 
425 	if (!enable_gathermerge)
426 		startup_cost += disable_cost;
427 
428 	/*
429 	 * Add one to the number of workers to account for the leader.  This might
430 	 * be overgenerous since the leader will do less work than other workers
431 	 * in typical cases, but we'll go with it for now.
432 	 */
433 	Assert(path->num_workers > 0);
434 	N = (double) path->num_workers + 1;
435 	logN = LOG2(N);
436 
437 	/* Assumed cost per tuple comparison */
438 	comparison_cost = 2.0 * cpu_operator_cost;
439 
440 	/* Heap creation cost */
441 	startup_cost += comparison_cost * N * logN;
442 
443 	/* Per-tuple heap maintenance cost */
444 	run_cost += path->path.rows * comparison_cost * logN;
445 
446 	/* small cost for heap management, like cost_merge_append */
447 	run_cost += cpu_operator_cost * path->path.rows;
448 
449 	/*
450 	 * Parallel setup and communication cost.  Since Gather Merge, unlike
451 	 * Gather, requires us to block until a tuple is available from every
452 	 * worker, we bump the IPC cost up a little bit as compared with Gather.
453 	 * For lack of a better idea, charge an extra 5%.
454 	 */
455 	startup_cost += parallel_setup_cost;
456 	run_cost += parallel_tuple_cost * path->path.rows * 1.05;
457 
458 	path->path.startup_cost = startup_cost + input_startup_cost;
459 	path->path.total_cost = (startup_cost + run_cost + input_total_cost);
460 }
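
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: the heap-related
 * part of the Gather Merge costing above.  With N = num_workers + 1 sorted
 * streams (counting the leader), heap construction costs about N * log2(N)
 * comparisons and each output row costs about log2(N) comparisons, at
 * 2 * cpu_operator_cost per comparison.  The function name is illustrative.
 */
#if 0
static void
gather_merge_heap_cost_sketch(int num_workers, double output_rows,
							  double *heap_startup_cost, double *heap_run_cost)
{
	double		N = (double) num_workers + 1;	/* include the leader */
	double		logN = LOG2(N);
	double		comparison_cost = 2.0 * cpu_operator_cost;

	*heap_startup_cost = comparison_cost * N * logN;
	*heap_run_cost = output_rows * comparison_cost * logN +
		cpu_operator_cost * output_rows;	/* plus small heap-management cost */
}
#endif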
461 
462 /*
463  * cost_index
464  *	  Determines and returns the cost of scanning a relation using an index.
465  *
466  * 'path' describes the indexscan under consideration, and is complete
467  *		except for the fields to be set by this routine
468  * 'loop_count' is the number of repetitions of the indexscan to factor into
469  *		estimates of caching behavior
470  *
471  * In addition to rows, startup_cost and total_cost, cost_index() sets the
472  * path's indextotalcost and indexselectivity fields.  These values will be
473  * needed if the IndexPath is used in a BitmapIndexScan.
474  *
475  * NOTE: path->indexquals must contain only clauses usable as index
476  * restrictions.  Any additional quals evaluated as qpquals may reduce the
477  * number of returned tuples, but they won't reduce the number of tuples
478  * we have to fetch from the table, so they don't reduce the scan cost.
479  */
480 void
481 cost_index(IndexPath *path, PlannerInfo *root, double loop_count,
482 		   bool partial_path)
483 {
484 	IndexOptInfo *index = path->indexinfo;
485 	RelOptInfo *baserel = index->rel;
486 	bool		indexonly = (path->path.pathtype == T_IndexOnlyScan);
487 	amcostestimate_function amcostestimate;
488 	List	   *qpquals;
489 	Cost		startup_cost = 0;
490 	Cost		run_cost = 0;
491 	Cost		cpu_run_cost = 0;
492 	Cost		indexStartupCost;
493 	Cost		indexTotalCost;
494 	Selectivity indexSelectivity;
495 	double		indexCorrelation,
496 				csquared;
497 	double		spc_seq_page_cost,
498 				spc_random_page_cost;
499 	Cost		min_IO_cost,
500 				max_IO_cost;
501 	QualCost	qpqual_cost;
502 	Cost		cpu_per_tuple;
503 	double		tuples_fetched;
504 	double		pages_fetched;
505 	double		rand_heap_pages;
506 	double		index_pages;
507 
508 	/* Should only be applied to base relations */
509 	Assert(IsA(baserel, RelOptInfo) &&
510 		   IsA(index, IndexOptInfo));
511 	Assert(baserel->relid > 0);
512 	Assert(baserel->rtekind == RTE_RELATION);
513 
514 	/*
515 	 * Mark the path with the correct row estimate, and identify which quals
516 	 * will need to be enforced as qpquals.  We need not check any quals that
517 	 * are implied by the index's predicate, so we can use indrestrictinfo not
518 	 * baserestrictinfo as the list of relevant restriction clauses for the
519 	 * rel.
520 	 */
521 	if (path->path.param_info)
522 	{
523 		path->path.rows = path->path.param_info->ppi_rows;
524 		/* qpquals come from the rel's restriction clauses and ppi_clauses */
525 		qpquals = list_concat(extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
526 														  path->indexclauses),
527 							  extract_nonindex_conditions(path->path.param_info->ppi_clauses,
528 														  path->indexclauses));
529 	}
530 	else
531 	{
532 		path->path.rows = baserel->rows;
533 		/* qpquals come from just the rel's restriction clauses */
534 		qpquals = extract_nonindex_conditions(path->indexinfo->indrestrictinfo,
535 											  path->indexclauses);
536 	}
537 
538 	if (!enable_indexscan)
539 		startup_cost += disable_cost;
540 	/* we don't need to check enable_indexonlyscan; indxpath.c does that */
541 
542 	/*
543 	 * Call index-access-method-specific code to estimate the processing cost
544 	 * for scanning the index, as well as the selectivity of the index (ie,
545 	 * the fraction of main-table tuples we will have to retrieve) and its
546 	 * correlation to the main-table tuple order.  We need a cast here because
547 	 * pathnodes.h uses a weak function type to avoid including amapi.h.
548 	 */
549 	amcostestimate = (amcostestimate_function) index->amcostestimate;
550 	amcostestimate(root, path, loop_count,
551 				   &indexStartupCost, &indexTotalCost,
552 				   &indexSelectivity, &indexCorrelation,
553 				   &index_pages);
554 
555 	/*
556 	 * Save amcostestimate's results for possible use in bitmap scan planning.
557 	 * We don't bother to save indexStartupCost or indexCorrelation, because a
558 	 * bitmap scan doesn't care about either.
559 	 */
560 	path->indextotalcost = indexTotalCost;
561 	path->indexselectivity = indexSelectivity;
562 
563 	/* all costs for touching index itself included here */
564 	startup_cost += indexStartupCost;
565 	run_cost += indexTotalCost - indexStartupCost;
566 
567 	/* estimate number of main-table tuples fetched */
568 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
569 
570 	/* fetch estimated page costs for tablespace containing table */
571 	get_tablespace_page_costs(baserel->reltablespace,
572 							  &spc_random_page_cost,
573 							  &spc_seq_page_cost);
574 
575 	/*----------
576 	 * Estimate number of main-table pages fetched, and compute I/O cost.
577 	 *
578 	 * When the index ordering is uncorrelated with the table ordering,
579 	 * we use an approximation proposed by Mackert and Lohman (see
580 	 * index_pages_fetched() for details) to compute the number of pages
581 	 * fetched, and then charge spc_random_page_cost per page fetched.
582 	 *
583 	 * When the index ordering is exactly correlated with the table ordering
584 	 * (just after a CLUSTER, for example), the number of pages fetched should
585 	 * be exactly selectivity * table_size.  What's more, all but the first
586 	 * will be sequential fetches, not the random fetches that occur in the
587 	 * uncorrelated case.  So if the number of pages is more than 1, we
588 	 * ought to charge
589 	 *		spc_random_page_cost + (pages_fetched - 1) * spc_seq_page_cost
590 	 * For partially-correlated indexes, we ought to charge somewhere between
591 	 * these two estimates.  We currently interpolate linearly between the
592 	 * estimates based on the correlation squared (XXX is that appropriate?).
593 	 *
594 	 * If it's an index-only scan, then we will not need to fetch any heap
595 	 * pages for which the visibility map shows all tuples are visible.
596 	 * Hence, reduce the estimated number of heap fetches accordingly.
597 	 * We use the measured fraction of the entire heap that is all-visible,
598 	 * which might not be particularly relevant to the subset of the heap
599 	 * that this query will fetch; but it's not clear how to do better.
600 	 *----------
601 	 */
602 	if (loop_count > 1)
603 	{
604 		/*
605 		 * For repeated indexscans, the appropriate estimate for the
606 		 * uncorrelated case is to scale up the number of tuples fetched in
607 		 * the Mackert and Lohman formula by the number of scans, so that we
608 		 * estimate the number of pages fetched by all the scans; then
609 		 * pro-rate the costs for one scan.  In this case we assume all the
610 		 * fetches are random accesses.
611 		 */
612 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
613 											baserel->pages,
614 											(double) index->pages,
615 											root);
616 
617 		if (indexonly)
618 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
619 
620 		rand_heap_pages = pages_fetched;
621 
622 		max_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
623 
624 		/*
625 		 * In the perfectly correlated case, the number of pages touched by
626 		 * each scan is selectivity * table_size, and we can use the Mackert
627 		 * and Lohman formula at the page level to estimate how much work is
628 		 * saved by caching across scans.  We still assume all the fetches are
629 		 * random, though, which is an overestimate that's hard to correct for
630 		 * without double-counting the cache effects.  (But in most cases
631 		 * where such a plan is actually interesting, only one page would get
632 		 * fetched per scan anyway, so it shouldn't matter much.)
633 		 */
634 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
635 
636 		pages_fetched = index_pages_fetched(pages_fetched * loop_count,
637 											baserel->pages,
638 											(double) index->pages,
639 											root);
640 
641 		if (indexonly)
642 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
643 
644 		min_IO_cost = (pages_fetched * spc_random_page_cost) / loop_count;
645 	}
646 	else
647 	{
648 		/*
649 		 * Normal case: apply the Mackert and Lohman formula, and then
650 		 * interpolate between that and the correlation-derived result.
651 		 */
652 		pages_fetched = index_pages_fetched(tuples_fetched,
653 											baserel->pages,
654 											(double) index->pages,
655 											root);
656 
657 		if (indexonly)
658 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
659 
660 		rand_heap_pages = pages_fetched;
661 
662 		/* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */
663 		max_IO_cost = pages_fetched * spc_random_page_cost;
664 
665 		/* min_IO_cost is for the perfectly correlated case (csquared=1) */
666 		pages_fetched = ceil(indexSelectivity * (double) baserel->pages);
667 
668 		if (indexonly)
669 			pages_fetched = ceil(pages_fetched * (1.0 - baserel->allvisfrac));
670 
671 		if (pages_fetched > 0)
672 		{
673 			min_IO_cost = spc_random_page_cost;
674 			if (pages_fetched > 1)
675 				min_IO_cost += (pages_fetched - 1) * spc_seq_page_cost;
676 		}
677 		else
678 			min_IO_cost = 0;
679 	}
680 
681 	if (partial_path)
682 	{
683 		/*
684 		 * For index only scans compute workers based on number of index pages
685 		 * fetched; the number of heap pages we fetch might be so small as to
686 		 * effectively rule out parallelism, which we don't want to do.
687 		 */
688 		if (indexonly)
689 			rand_heap_pages = -1;
690 
691 		/*
692 		 * Estimate the number of parallel workers required to scan index. Use
693 		 * the number of heap pages computed considering heap fetches won't be
694 		 * sequential as for parallel scans the pages are accessed in random
695 		 * order.
696 		 */
697 		path->path.parallel_workers = compute_parallel_worker(baserel,
698 															  rand_heap_pages,
699 															  index_pages,
700 															  max_parallel_workers_per_gather);
701 
702 		/*
703 		 * Fall out if workers can't be assigned for parallel scan, because in
704 		 * such a case this path will be rejected.  So there is no benefit in
705 		 * doing extra computation.
706 		 */
707 		if (path->path.parallel_workers <= 0)
708 			return;
709 
710 		path->path.parallel_aware = true;
711 	}
712 
713 	/*
714 	 * Now interpolate based on estimated index order correlation to get total
715 	 * disk I/O cost for main table accesses.
716 	 */
717 	csquared = indexCorrelation * indexCorrelation;
718 
719 	run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
720 
721 	/*
722 	 * Estimate CPU costs per tuple.
723 	 *
724 	 * What we want here is cpu_tuple_cost plus the evaluation costs of any
725 	 * qual clauses that we have to evaluate as qpquals.
726 	 */
727 	cost_qual_eval(&qpqual_cost, qpquals, root);
728 
729 	startup_cost += qpqual_cost.startup;
730 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
731 
732 	cpu_run_cost += cpu_per_tuple * tuples_fetched;
733 
734 	/* tlist eval costs are paid per output row, not per tuple scanned */
735 	startup_cost += path->path.pathtarget->cost.startup;
736 	cpu_run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
737 
738 	/* Adjust costing for parallelism, if used. */
739 	if (path->path.parallel_workers > 0)
740 	{
741 		double		parallel_divisor = get_parallel_divisor(&path->path);
742 
743 		path->path.rows = clamp_row_est(path->path.rows / parallel_divisor);
744 
745 		/* The CPU cost is divided among all the workers. */
746 		cpu_run_cost /= parallel_divisor;
747 	}
748 
749 	run_cost += cpu_run_cost;
750 
751 	path->path.startup_cost = startup_cost;
752 	path->path.total_cost = startup_cost + run_cost;
753 }
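
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: the disk-cost
 * interpolation used above.  min_IO_cost assumes perfect index/table
 * correlation (one random fetch, then sequential fetches), max_IO_cost assumes
 * no correlation (all random fetches), and we interpolate linearly on the
 * squared correlation.  The function name is illustrative only.
 */
#if 0
static double
index_io_cost_sketch(double max_IO_cost, double min_IO_cost,
					 double indexCorrelation)
{
	double		csquared = indexCorrelation * indexCorrelation;

	return max_IO_cost + csquared * (min_IO_cost - max_IO_cost);
}
#endif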
754 
755 /*
756  * extract_nonindex_conditions
757  *
758  * Given a list of quals to be enforced in an indexscan, extract the ones that
759  * will have to be applied as qpquals (ie, the index machinery won't handle
760  * them).  Here we detect only whether a qual clause is directly redundant
761  * with some indexclause.  If the index path is chosen for use, createplan.c
762  * will try a bit harder to get rid of redundant qual conditions; specifically
763  * it will see if quals can be proven to be implied by the indexquals.  But
764  * it does not seem worth the cycles to try to factor that in at this stage,
765  * since we're only trying to estimate qual eval costs.  Otherwise this must
766  * match the logic in create_indexscan_plan().
767  *
768  * qual_clauses, and the result, are lists of RestrictInfos.
769  * indexclauses is a list of IndexClauses.
770  */
771 static List *
772 extract_nonindex_conditions(List *qual_clauses, List *indexclauses)
773 {
774 	List	   *result = NIL;
775 	ListCell   *lc;
776 
777 	foreach(lc, qual_clauses)
778 	{
779 		RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
780 
781 		if (rinfo->pseudoconstant)
782 			continue;			/* we may drop pseudoconstants here */
783 		if (is_redundant_with_indexclauses(rinfo, indexclauses))
784 			continue;			/* dup or derived from same EquivalenceClass */
785 		/* ... skip the predicate proof attempt createplan.c will try ... */
786 		result = lappend(result, rinfo);
787 	}
788 	return result;
789 }
790 
791 /*
792  * index_pages_fetched
793  *	  Estimate the number of pages actually fetched after accounting for
794  *	  cache effects.
795  *
796  * We use an approximation proposed by Mackert and Lohman, "Index Scans
797  * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions
798  * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424.
799  * The Mackert and Lohman approximation is that the number of pages
800  * fetched is
801  *	PF =
802  *		min(2TNs/(2T+Ns), T)			when T <= b
803  *		2TNs/(2T+Ns)					when T > b and Ns <= 2Tb/(2T-b)
804  *		b + (Ns - 2Tb/(2T-b))*(T-b)/T	when T > b and Ns > 2Tb/(2T-b)
805  * where
806  *		T = # pages in table
807  *		N = # tuples in table
808  *		s = selectivity = fraction of table to be scanned
809  *		b = # buffer pages available (we include kernel space here)
810  *
811  * We assume that effective_cache_size is the total number of buffer pages
812  * available for the whole query, and pro-rate that space across all the
813  * tables in the query and the index currently under consideration.  (This
814  * ignores space needed for other indexes used by the query, but since we
815  * don't know which indexes will get used, we can't estimate that very well;
816  * and in any case counting all the tables may well be an overestimate, since
817  * depending on the join plan not all the tables may be scanned concurrently.)
818  *
819  * The product Ns is the number of tuples fetched; we pass in that
820  * product rather than calculating it here.  "pages" is the number of pages
821  * in the object under consideration (either an index or a table).
822  * "index_pages" is the amount to add to the total table space, which was
823  * computed for us by make_one_rel.
824  *
825  * Caller is expected to have ensured that tuples_fetched is greater than zero
826  * and rounded to integer (see clamp_row_est).  The result will likewise be
827  * greater than zero and integral.
828  */
829 double
830 index_pages_fetched(double tuples_fetched, BlockNumber pages,
831 					double index_pages, PlannerInfo *root)
832 {
833 	double		pages_fetched;
834 	double		total_pages;
835 	double		T,
836 				b;
837 
838 	/* T is # pages in table, but don't allow it to be zero */
839 	T = (pages > 1) ? (double) pages : 1.0;
840 
841 	/* Compute number of pages assumed to be competing for cache space */
842 	total_pages = root->total_table_pages + index_pages;
843 	total_pages = Max(total_pages, 1.0);
844 	Assert(T <= total_pages);
845 
846 	/* b is pro-rated share of effective_cache_size */
847 	b = (double) effective_cache_size * T / total_pages;
848 
849 	/* force it positive and integral */
850 	if (b <= 1.0)
851 		b = 1.0;
852 	else
853 		b = ceil(b);
854 
855 	/* This part is the Mackert and Lohman formula */
856 	if (T <= b)
857 	{
858 		pages_fetched =
859 			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
860 		if (pages_fetched >= T)
861 			pages_fetched = T;
862 		else
863 			pages_fetched = ceil(pages_fetched);
864 	}
865 	else
866 	{
867 		double		lim;
868 
869 		lim = (2.0 * T * b) / (2.0 * T - b);
870 		if (tuples_fetched <= lim)
871 		{
872 			pages_fetched =
873 				(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
874 		}
875 		else
876 		{
877 			pages_fetched =
878 				b + (tuples_fetched - lim) * (T - b) / T;
879 		}
880 		pages_fetched = ceil(pages_fetched);
881 	}
882 	return pages_fetched;
883 }
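
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: the Mackert and
 * Lohman formula above, restated with T (table pages) and b (buffer pages)
 * passed in directly instead of being derived from the relation and
 * effective_cache_size.  Rounding to whole pages is omitted, and the function
 * name is illustrative only.
 */
#if 0
static double
mackert_lohman_sketch(double Ns, double T, double b)
{
	if (T <= b)
		return Min(2.0 * T * Ns / (2.0 * T + Ns), T);
	else
	{
		double		lim = 2.0 * T * b / (2.0 * T - b);

		if (Ns <= lim)
			return 2.0 * T * Ns / (2.0 * T + Ns);
		else
			return b + (Ns - lim) * (T - b) / T;
	}
}
#endif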
884 
885 /*
886  * get_indexpath_pages
887  *		Determine the total size of the indexes used in a bitmap index path.
888  *
889  * Note: if the same index is used more than once in a bitmap tree, we will
890  * count it multiple times, which perhaps is the wrong thing ... but it's
891  * not completely clear, and detecting duplicates is difficult, so ignore it
892  * for now.
893  */
894 static double
895 get_indexpath_pages(Path *bitmapqual)
896 {
897 	double		result = 0;
898 	ListCell   *l;
899 
900 	if (IsA(bitmapqual, BitmapAndPath))
901 	{
902 		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
903 
904 		foreach(l, apath->bitmapquals)
905 		{
906 			result += get_indexpath_pages((Path *) lfirst(l));
907 		}
908 	}
909 	else if (IsA(bitmapqual, BitmapOrPath))
910 	{
911 		BitmapOrPath *opath = (BitmapOrPath *) bitmapqual;
912 
913 		foreach(l, opath->bitmapquals)
914 		{
915 			result += get_indexpath_pages((Path *) lfirst(l));
916 		}
917 	}
918 	else if (IsA(bitmapqual, IndexPath))
919 	{
920 		IndexPath  *ipath = (IndexPath *) bitmapqual;
921 
922 		result = (double) ipath->indexinfo->pages;
923 	}
924 	else
925 		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
926 
927 	return result;
928 }
929 
930 /*
931  * cost_bitmap_heap_scan
932  *	  Determines and returns the cost of scanning a relation using a bitmap
933  *	  index-then-heap plan.
934  *
935  * 'baserel' is the relation to be scanned
936  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
937  * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths
938  * 'loop_count' is the number of repetitions of the indexscan to factor into
939  *		estimates of caching behavior
940  *
941  * Note: the component IndexPaths in bitmapqual should have been costed
942  * using the same loop_count.
943  */
944 void
945 cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
946 					  ParamPathInfo *param_info,
947 					  Path *bitmapqual, double loop_count)
948 {
949 	Cost		startup_cost = 0;
950 	Cost		run_cost = 0;
951 	Cost		indexTotalCost;
952 	QualCost	qpqual_cost;
953 	Cost		cpu_per_tuple;
954 	Cost		cost_per_page;
955 	Cost		cpu_run_cost;
956 	double		tuples_fetched;
957 	double		pages_fetched;
958 	double		spc_seq_page_cost,
959 				spc_random_page_cost;
960 	double		T;
961 
962 	/* Should only be applied to base relations */
963 	Assert(IsA(baserel, RelOptInfo));
964 	Assert(baserel->relid > 0);
965 	Assert(baserel->rtekind == RTE_RELATION);
966 
967 	/* Mark the path with the correct row estimate */
968 	if (param_info)
969 		path->rows = param_info->ppi_rows;
970 	else
971 		path->rows = baserel->rows;
972 
973 	if (!enable_bitmapscan)
974 		startup_cost += disable_cost;
975 
976 	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
977 										 loop_count, &indexTotalCost,
978 										 &tuples_fetched);
979 
980 	startup_cost += indexTotalCost;
981 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
982 
983 	/* Fetch estimated page costs for tablespace containing table. */
984 	get_tablespace_page_costs(baserel->reltablespace,
985 							  &spc_random_page_cost,
986 							  &spc_seq_page_cost);
987 
988 	/*
989 	 * For small numbers of pages we should charge spc_random_page_cost
990 	 * apiece, while if nearly all the table's pages are being read, it's more
991 	 * appropriate to charge spc_seq_page_cost apiece.  The effect is
992 	 * nonlinear, too. For lack of a better idea, interpolate like this to
993 	 * determine the cost per page.
994 	 */
995 	if (pages_fetched >= 2.0)
996 		cost_per_page = spc_random_page_cost -
997 			(spc_random_page_cost - spc_seq_page_cost)
998 			* sqrt(pages_fetched / T);
999 	else
1000 		cost_per_page = spc_random_page_cost;
1001 
1002 	run_cost += pages_fetched * cost_per_page;
1003 
1004 	/*
1005 	 * Estimate CPU costs per tuple.
1006 	 *
1007 	 * Often the indexquals don't need to be rechecked at each tuple ... but
1008 	 * not always, especially not if there are enough tuples involved that the
1009 	 * bitmaps become lossy.  For the moment, just assume they will be
1010 	 * rechecked always.  This means we charge the full freight for all the
1011 	 * scan clauses.
1012 	 */
1013 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1014 
1015 	startup_cost += qpqual_cost.startup;
1016 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1017 	cpu_run_cost = cpu_per_tuple * tuples_fetched;
1018 
1019 	/* Adjust costing for parallelism, if used. */
1020 	if (path->parallel_workers > 0)
1021 	{
1022 		double		parallel_divisor = get_parallel_divisor(path);
1023 
1024 		/* The CPU cost is divided among all the workers. */
1025 		cpu_run_cost /= parallel_divisor;
1026 
1027 		path->rows = clamp_row_est(path->rows / parallel_divisor);
1028 	}
1029 
1030 
1031 	run_cost += cpu_run_cost;
1032 
1033 	/* tlist eval costs are paid per output row, not per tuple scanned */
1034 	startup_cost += path->pathtarget->cost.startup;
1035 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1036 
1037 	path->startup_cost = startup_cost;
1038 	path->total_cost = startup_cost + run_cost;
1039 }
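
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: the per-page
 * cost interpolation used above for bitmap heap scans.  As the fraction of the
 * table fetched grows, the charge per page slides from spc_random_page_cost
 * toward spc_seq_page_cost on a square-root curve.  The function name is
 * illustrative only.
 */
#if 0
static double
bitmap_cost_per_page_sketch(double pages_fetched, double T,
							double spc_random_page_cost,
							double spc_seq_page_cost)
{
	if (pages_fetched >= 2.0)
		return spc_random_page_cost -
			(spc_random_page_cost - spc_seq_page_cost) *
			sqrt(pages_fetched / T);
	else
		return spc_random_page_cost;
}
#endif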
1040 
1041 /*
1042  * cost_bitmap_tree_node
1043  *		Extract cost and selectivity from a bitmap tree node (index/and/or)
1044  */
1045 void
1046 cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec)
1047 {
1048 	if (IsA(path, IndexPath))
1049 	{
1050 		*cost = ((IndexPath *) path)->indextotalcost;
1051 		*selec = ((IndexPath *) path)->indexselectivity;
1052 
1053 		/*
1054 		 * Charge a small amount per retrieved tuple to reflect the costs of
1055 		 * manipulating the bitmap.  This is mostly to make sure that a bitmap
1056 		 * scan doesn't look to be the same cost as an indexscan to retrieve a
1057 		 * single tuple.
1058 		 */
1059 		*cost += 0.1 * cpu_operator_cost * path->rows;
1060 	}
1061 	else if (IsA(path, BitmapAndPath))
1062 	{
1063 		*cost = path->total_cost;
1064 		*selec = ((BitmapAndPath *) path)->bitmapselectivity;
1065 	}
1066 	else if (IsA(path, BitmapOrPath))
1067 	{
1068 		*cost = path->total_cost;
1069 		*selec = ((BitmapOrPath *) path)->bitmapselectivity;
1070 	}
1071 	else
1072 	{
1073 		elog(ERROR, "unrecognized node type: %d", nodeTag(path));
1074 		*cost = *selec = 0;		/* keep compiler quiet */
1075 	}
1076 }
1077 
1078 /*
1079  * cost_bitmap_and_node
1080  *		Estimate the cost of a BitmapAnd node
1081  *
1082  * Note that this considers only the costs of index scanning and bitmap
1083  * creation, not the eventual heap access.  In that sense the object isn't
1084  * truly a Path, but it has enough path-like properties (costs in particular)
1085  * to warrant treating it as one.  We don't bother to set the path rows field,
1086  * however.
1087  */
1088 void
1089 cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root)
1090 {
1091 	Cost		totalCost;
1092 	Selectivity selec;
1093 	ListCell   *l;
1094 
1095 	/*
1096 	 * We estimate AND selectivity on the assumption that the inputs are
1097 	 * independent.  This is probably often wrong, but we don't have the info
1098 	 * to do better.
1099 	 *
1100 	 * The runtime cost of the BitmapAnd itself is estimated at 100x
1101 	 * cpu_operator_cost for each tbm_intersect needed.  Probably too small,
1102 	 * definitely too simplistic?
1103 	 */
1104 	totalCost = 0.0;
1105 	selec = 1.0;
1106 	foreach(l, path->bitmapquals)
1107 	{
1108 		Path	   *subpath = (Path *) lfirst(l);
1109 		Cost		subCost;
1110 		Selectivity subselec;
1111 
1112 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1113 
1114 		selec *= subselec;
1115 
1116 		totalCost += subCost;
1117 		if (l != list_head(path->bitmapquals))
1118 			totalCost += 100.0 * cpu_operator_cost;
1119 	}
1120 	path->bitmapselectivity = selec;
1121 	path->path.rows = 0;		/* per above, not used */
1122 	path->path.startup_cost = totalCost;
1123 	path->path.total_cost = totalCost;
1124 }
1125 
1126 /*
1127  * cost_bitmap_or_node
1128  *		Estimate the cost of a BitmapOr node
1129  *
1130  * See comments for cost_bitmap_and_node.
1131  */
1132 void
1133 cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root)
1134 {
1135 	Cost		totalCost;
1136 	Selectivity selec;
1137 	ListCell   *l;
1138 
1139 	/*
1140 	 * We estimate OR selectivity on the assumption that the inputs are
1141 	 * non-overlapping, since that's often the case in "x IN (list)" type
1142 	 * situations.  Of course, we clamp to 1.0 at the end.
1143 	 *
1144 	 * The runtime cost of the BitmapOr itself is estimated at 100x
1145 	 * cpu_operator_cost for each tbm_union needed.  Probably too small,
1146 	 * definitely too simplistic?  We are aware that the tbm_unions are
1147 	 * optimized out when the inputs are BitmapIndexScans.
1148 	 */
1149 	totalCost = 0.0;
1150 	selec = 0.0;
1151 	foreach(l, path->bitmapquals)
1152 	{
1153 		Path	   *subpath = (Path *) lfirst(l);
1154 		Cost		subCost;
1155 		Selectivity subselec;
1156 
1157 		cost_bitmap_tree_node(subpath, &subCost, &subselec);
1158 
1159 		selec += subselec;
1160 
1161 		totalCost += subCost;
1162 		if (l != list_head(path->bitmapquals) &&
1163 			!IsA(subpath, IndexPath))
1164 			totalCost += 100.0 * cpu_operator_cost;
1165 	}
1166 	path->bitmapselectivity = Min(selec, 1.0);
1167 	path->path.rows = 0;		/* per above, not used */
1168 	path->path.startup_cost = totalCost;
1169 	path->path.total_cost = totalCost;
1170 }
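
/*
 * Illustrative sketch, kept under #if 0 so it is not compiled: how the two
 * functions above combine child selectivities.  BitmapAnd assumes independent
 * inputs (product of selectivities); BitmapOr assumes non-overlapping inputs
 * (sum of selectivities, clamped to 1.0).  The function name is illustrative.
 */
#if 0
static void
bitmap_selectivity_sketch(const double *child_selec, int nchildren,
						  double *and_selec, double *or_selec)
{
	int			i;

	*and_selec = 1.0;
	*or_selec = 0.0;
	for (i = 0; i < nchildren; i++)
	{
		*and_selec *= child_selec[i];
		*or_selec += child_selec[i];
	}
	*or_selec = Min(*or_selec, 1.0);
}
#endif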
1171 
1172 /*
1173  * cost_tidscan
1174  *	  Determines and returns the cost of scanning a relation using TIDs.
1175  *
1176  * 'baserel' is the relation to be scanned
1177  * 'tidquals' is the list of TID-checkable quals
1178  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1179  */
1180 void
1181 cost_tidscan(Path *path, PlannerInfo *root,
1182 			 RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info)
1183 {
1184 	Cost		startup_cost = 0;
1185 	Cost		run_cost = 0;
1186 	bool		isCurrentOf = false;
1187 	QualCost	qpqual_cost;
1188 	Cost		cpu_per_tuple;
1189 	QualCost	tid_qual_cost;
1190 	int			ntuples;
1191 	ListCell   *l;
1192 	double		spc_random_page_cost;
1193 
1194 	/* Should only be applied to base relations */
1195 	Assert(baserel->relid > 0);
1196 	Assert(baserel->rtekind == RTE_RELATION);
1197 
1198 	/* Mark the path with the correct row estimate */
1199 	if (param_info)
1200 		path->rows = param_info->ppi_rows;
1201 	else
1202 		path->rows = baserel->rows;
1203 
1204 	/* Count how many tuples we expect to retrieve */
1205 	ntuples = 0;
1206 	foreach(l, tidquals)
1207 	{
1208 		RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
1209 		Expr	   *qual = rinfo->clause;
1210 
1211 		if (IsA(qual, ScalarArrayOpExpr))
1212 		{
1213 			/* Each element of the array yields 1 tuple */
1214 			ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual;
1215 			Node	   *arraynode = (Node *) lsecond(saop->args);
1216 
1217 			ntuples += estimate_array_length(arraynode);
1218 		}
1219 		else if (IsA(qual, CurrentOfExpr))
1220 		{
1221 			/* CURRENT OF yields 1 tuple */
1222 			isCurrentOf = true;
1223 			ntuples++;
1224 		}
1225 		else
1226 		{
1227 			/* It's just CTID = something, count 1 tuple */
1228 			ntuples++;
1229 		}
1230 	}
1231 
1232 	/*
1233 	 * We must force TID scan for WHERE CURRENT OF, because only nodeTidscan.c
1234 	 * understands how to do it correctly.  Therefore, honor enable_tidscan
1235 	 * only when CURRENT OF isn't present.  Also note that cost_qual_eval
1236 	 * counts a CurrentOfExpr as having startup cost disable_cost, which we
1237 	 * subtract off here; that's to prevent other plan types such as seqscan
1238 	 * from winning.
1239 	 */
1240 	if (isCurrentOf)
1241 	{
1242 		Assert(baserel->baserestrictcost.startup >= disable_cost);
1243 		startup_cost -= disable_cost;
1244 	}
1245 	else if (!enable_tidscan)
1246 		startup_cost += disable_cost;
1247 
1248 	/*
1249 	 * The TID qual expressions will be computed once, any other baserestrict
1250 	 * quals once per retrieved tuple.
1251 	 */
1252 	cost_qual_eval(&tid_qual_cost, tidquals, root);
1253 
1254 	/* fetch estimated page cost for tablespace containing table */
1255 	get_tablespace_page_costs(baserel->reltablespace,
1256 							  &spc_random_page_cost,
1257 							  NULL);
1258 
1259 	/* disk costs --- assume each tuple on a different page */
1260 	run_cost += spc_random_page_cost * ntuples;
1261 
1262 	/* Add scanning CPU costs */
1263 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1264 
1265 	/* XXX currently we assume TID quals are a subset of qpquals */
1266 	startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple;
1267 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple -
1268 		tid_qual_cost.per_tuple;
1269 	run_cost += cpu_per_tuple * ntuples;
1270 
1271 	/* tlist eval costs are paid per output row, not per tuple scanned */
1272 	startup_cost += path->pathtarget->cost.startup;
1273 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1274 
1275 	path->startup_cost = startup_cost;
1276 	path->total_cost = startup_cost + run_cost;
1277 }
1278 
1279 /*
1280  * cost_subqueryscan
1281  *	  Determines and returns the cost of scanning a subquery RTE.
1282  *
1283  * 'baserel' is the relation to be scanned
1284  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1285  */
1286 void
1287 cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root,
1288 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1289 {
1290 	Cost		startup_cost;
1291 	Cost		run_cost;
1292 	QualCost	qpqual_cost;
1293 	Cost		cpu_per_tuple;
1294 
1295 	/* Should only be applied to base relations that are subqueries */
1296 	Assert(baserel->relid > 0);
1297 	Assert(baserel->rtekind == RTE_SUBQUERY);
1298 
1299 	/* Mark the path with the correct row estimate */
1300 	if (param_info)
1301 		path->path.rows = param_info->ppi_rows;
1302 	else
1303 		path->path.rows = baserel->rows;
1304 
1305 	/*
1306 	 * Cost of path is cost of evaluating the subplan, plus cost of evaluating
1307 	 * any restriction clauses and tlist that will be attached to the
1308 	 * SubqueryScan node, plus cpu_tuple_cost to account for selection and
1309 	 * projection overhead.
1310 	 */
1311 	path->path.startup_cost = path->subpath->startup_cost;
1312 	path->path.total_cost = path->subpath->total_cost;
1313 
1314 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1315 
1316 	startup_cost = qpqual_cost.startup;
1317 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1318 	run_cost = cpu_per_tuple * baserel->tuples;
1319 
1320 	/* tlist eval costs are paid per output row, not per tuple scanned */
1321 	startup_cost += path->path.pathtarget->cost.startup;
1322 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
1323 
1324 	path->path.startup_cost += startup_cost;
1325 	path->path.total_cost += startup_cost + run_cost;
1326 }
1327 
1328 /*
1329  * cost_functionscan
1330  *	  Determines and returns the cost of scanning a function RTE.
1331  *
1332  * 'baserel' is the relation to be scanned
1333  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1334  */
1335 void
1336 cost_functionscan(Path *path, PlannerInfo *root,
1337 				  RelOptInfo *baserel, ParamPathInfo *param_info)
1338 {
1339 	Cost		startup_cost = 0;
1340 	Cost		run_cost = 0;
1341 	QualCost	qpqual_cost;
1342 	Cost		cpu_per_tuple;
1343 	RangeTblEntry *rte;
1344 	QualCost	exprcost;
1345 
1346 	/* Should only be applied to base relations that are functions */
1347 	Assert(baserel->relid > 0);
1348 	rte = planner_rt_fetch(baserel->relid, root);
1349 	Assert(rte->rtekind == RTE_FUNCTION);
1350 
1351 	/* Mark the path with the correct row estimate */
1352 	if (param_info)
1353 		path->rows = param_info->ppi_rows;
1354 	else
1355 		path->rows = baserel->rows;
1356 
1357 	/*
1358 	 * Estimate costs of executing the function expression(s).
1359 	 *
1360 	 * Currently, nodeFunctionscan.c always executes the functions to
1361 	 * completion before returning any rows, and caches the results in a
1362 	 * tuplestore.  So the function eval cost is all startup cost, and per-row
1363 	 * costs are minimal.
1364 	 *
1365 	 * XXX in principle we ought to charge tuplestore spill costs if the
1366 	 * number of rows is large.  However, given how phony our rowcount
1367 	 * estimates for functions tend to be, there's not a lot of point in that
1368 	 * refinement right now.
1369 	 */
1370 	cost_qual_eval_node(&exprcost, (Node *) rte->functions, root);
1371 
1372 	startup_cost += exprcost.startup + exprcost.per_tuple;
1373 
1374 	/* Add scanning CPU costs */
1375 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1376 
1377 	startup_cost += qpqual_cost.startup;
1378 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1379 	run_cost += cpu_per_tuple * baserel->tuples;
1380 
1381 	/* tlist eval costs are paid per output row, not per tuple scanned */
1382 	startup_cost += path->pathtarget->cost.startup;
1383 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1384 
1385 	path->startup_cost = startup_cost;
1386 	path->total_cost = startup_cost + run_cost;
1387 }
1388 
1389 /*
1390  * cost_tablefuncscan
1391  *	  Determines and returns the cost of scanning a table function.
1392  *
1393  * 'baserel' is the relation to be scanned
1394  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1395  */
1396 void
1397 cost_tablefuncscan(Path *path, PlannerInfo *root,
1398 				   RelOptInfo *baserel, ParamPathInfo *param_info)
1399 {
1400 	Cost		startup_cost = 0;
1401 	Cost		run_cost = 0;
1402 	QualCost	qpqual_cost;
1403 	Cost		cpu_per_tuple;
1404 	RangeTblEntry *rte;
1405 	QualCost	exprcost;
1406 
1407 	/* Should only be applied to base relations that are functions */
1408 	Assert(baserel->relid > 0);
1409 	rte = planner_rt_fetch(baserel->relid, root);
1410 	Assert(rte->rtekind == RTE_TABLEFUNC);
1411 
1412 	/* Mark the path with the correct row estimate */
1413 	if (param_info)
1414 		path->rows = param_info->ppi_rows;
1415 	else
1416 		path->rows = baserel->rows;
1417 
1418 	/*
1419 	 * Estimate costs of executing the table func expression(s).
1420 	 *
1421 	 * XXX in principle we ought to charge tuplestore spill costs if the
1422 	 * number of rows is large.  However, given how phony our rowcount
1423 	 * estimates for tablefuncs tend to be, there's not a lot of point in that
1424 	 * refinement right now.
1425 	 */
1426 	cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
1427 
1428 	startup_cost += exprcost.startup + exprcost.per_tuple;
1429 
1430 	/* Add scanning CPU costs */
1431 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1432 
1433 	startup_cost += qpqual_cost.startup;
1434 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1435 	run_cost += cpu_per_tuple * baserel->tuples;
1436 
1437 	/* tlist eval costs are paid per output row, not per tuple scanned */
1438 	startup_cost += path->pathtarget->cost.startup;
1439 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1440 
1441 	path->startup_cost = startup_cost;
1442 	path->total_cost = startup_cost + run_cost;
1443 }
1444 
1445 /*
1446  * cost_valuesscan
1447  *	  Determines and returns the cost of scanning a VALUES RTE.
1448  *
1449  * 'baserel' is the relation to be scanned
1450  * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
1451  */
1452 void
1453 cost_valuesscan(Path *path, PlannerInfo *root,
1454 				RelOptInfo *baserel, ParamPathInfo *param_info)
1455 {
1456 	Cost		startup_cost = 0;
1457 	Cost		run_cost = 0;
1458 	QualCost	qpqual_cost;
1459 	Cost		cpu_per_tuple;
1460 
1461 	/* Should only be applied to base relations that are values lists */
1462 	Assert(baserel->relid > 0);
1463 	Assert(baserel->rtekind == RTE_VALUES);
1464 
1465 	/* Mark the path with the correct row estimate */
1466 	if (param_info)
1467 		path->rows = param_info->ppi_rows;
1468 	else
1469 		path->rows = baserel->rows;
1470 
1471 	/*
1472 	 * For now, estimate list evaluation cost at one operator eval per list
1473 	 * (probably pretty bogus, but is it worth being smarter?)
1474 	 */
1475 	cpu_per_tuple = cpu_operator_cost;
1476 
1477 	/* Add scanning CPU costs */
1478 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1479 
1480 	startup_cost += qpqual_cost.startup;
1481 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1482 	run_cost += cpu_per_tuple * baserel->tuples;
1483 
1484 	/* tlist eval costs are paid per output row, not per tuple scanned */
1485 	startup_cost += path->pathtarget->cost.startup;
1486 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1487 
1488 	path->startup_cost = startup_cost;
1489 	path->total_cost = startup_cost + run_cost;
1490 }
1491 
1492 /*
1493  * cost_ctescan
1494  *	  Determines and returns the cost of scanning a CTE RTE.
1495  *
1496  * Note: this is used for both self-reference and regular CTEs; the
1497  * possible cost differences are below the threshold of what we could
1498  * estimate accurately anyway.  Note that the costs of evaluating the
1499  * referenced CTE query are added into the final plan as initplan costs,
1500  * and should NOT be counted here.
1501  */
1502 void
1503 cost_ctescan(Path *path, PlannerInfo *root,
1504 			 RelOptInfo *baserel, ParamPathInfo *param_info)
1505 {
1506 	Cost		startup_cost = 0;
1507 	Cost		run_cost = 0;
1508 	QualCost	qpqual_cost;
1509 	Cost		cpu_per_tuple;
1510 
1511 	/* Should only be applied to base relations that are CTEs */
1512 	Assert(baserel->relid > 0);
1513 	Assert(baserel->rtekind == RTE_CTE);
1514 
1515 	/* Mark the path with the correct row estimate */
1516 	if (param_info)
1517 		path->rows = param_info->ppi_rows;
1518 	else
1519 		path->rows = baserel->rows;
1520 
1521 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1522 	cpu_per_tuple = cpu_tuple_cost;
1523 
1524 	/* Add scanning CPU costs */
1525 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1526 
1527 	startup_cost += qpqual_cost.startup;
1528 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1529 	run_cost += cpu_per_tuple * baserel->tuples;
1530 
1531 	/* tlist eval costs are paid per output row, not per tuple scanned */
1532 	startup_cost += path->pathtarget->cost.startup;
1533 	run_cost += path->pathtarget->cost.per_tuple * path->rows;
1534 
1535 	path->startup_cost = startup_cost;
1536 	path->total_cost = startup_cost + run_cost;
1537 }
1538 
1539 /*
1540  * cost_namedtuplestorescan
1541  *	  Determines and returns the cost of scanning a named tuplestore.
1542  */
1543 void
1544 cost_namedtuplestorescan(Path *path, PlannerInfo *root,
1545 						 RelOptInfo *baserel, ParamPathInfo *param_info)
1546 {
1547 	Cost		startup_cost = 0;
1548 	Cost		run_cost = 0;
1549 	QualCost	qpqual_cost;
1550 	Cost		cpu_per_tuple;
1551 
1552 	/* Should only be applied to base relations that are Tuplestores */
1553 	Assert(baserel->relid > 0);
1554 	Assert(baserel->rtekind == RTE_NAMEDTUPLESTORE);
1555 
1556 	/* Mark the path with the correct row estimate */
1557 	if (param_info)
1558 		path->rows = param_info->ppi_rows;
1559 	else
1560 		path->rows = baserel->rows;
1561 
1562 	/* Charge one CPU tuple cost per row for tuplestore manipulation */
1563 	cpu_per_tuple = cpu_tuple_cost;
1564 
1565 	/* Add scanning CPU costs */
1566 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1567 
1568 	startup_cost += qpqual_cost.startup;
1569 	cpu_per_tuple += cpu_tuple_cost + qpqual_cost.per_tuple;
1570 	run_cost += cpu_per_tuple * baserel->tuples;
1571 
1572 	path->startup_cost = startup_cost;
1573 	path->total_cost = startup_cost + run_cost;
1574 }
1575 
1576 /*
1577  * cost_resultscan
1578  *	  Determines and returns the cost of scanning an RTE_RESULT relation.
1579  */
1580 void
1581 cost_resultscan(Path *path, PlannerInfo *root,
1582 				RelOptInfo *baserel, ParamPathInfo *param_info)
1583 {
1584 	Cost		startup_cost = 0;
1585 	Cost		run_cost = 0;
1586 	QualCost	qpqual_cost;
1587 	Cost		cpu_per_tuple;
1588 
1589 	/* Should only be applied to RTE_RESULT base relations */
1590 	Assert(baserel->relid > 0);
1591 	Assert(baserel->rtekind == RTE_RESULT);
1592 
1593 	/* Mark the path with the correct row estimate */
1594 	if (param_info)
1595 		path->rows = param_info->ppi_rows;
1596 	else
1597 		path->rows = baserel->rows;
1598 
1599 	/* We charge qual cost plus cpu_tuple_cost */
1600 	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
1601 
1602 	startup_cost += qpqual_cost.startup;
1603 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
1604 	run_cost += cpu_per_tuple * baserel->tuples;
1605 
1606 	path->startup_cost = startup_cost;
1607 	path->total_cost = startup_cost + run_cost;
1608 }
1609 
1610 /*
1611  * cost_recursive_union
1612  *	  Determines and returns the cost of performing a recursive union,
1613  *	  and also the estimated output size.
1614  *
1615  * We are given Paths for the nonrecursive and recursive terms.
1616  */
1617 void
1618 cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
1619 {
1620 	Cost		startup_cost;
1621 	Cost		total_cost;
1622 	double		total_rows;
1623 
1624 	/* We probably have decent estimates for the non-recursive term */
1625 	startup_cost = nrterm->startup_cost;
1626 	total_cost = nrterm->total_cost;
1627 	total_rows = nrterm->rows;
1628 
1629 	/*
1630 	 * We arbitrarily assume that about 10 recursive iterations will be
1631 	 * needed, and that we've managed to get a good fix on the cost and output
1632 	 * size of each one of them.  These are mighty shaky assumptions but it's
1633 	 * hard to see how to do better.
1634 	 */
1635 	total_cost += 10 * rterm->total_cost;
1636 	total_rows += 10 * rterm->rows;
1637 
1638 	/*
1639 	 * Also charge cpu_tuple_cost per row to account for the costs of
1640 	 * manipulating the tuplestores.  (We don't worry about possible
1641 	 * spill-to-disk costs.)
1642 	 */
1643 	total_cost += cpu_tuple_cost * total_rows;
1644 
1645 	runion->startup_cost = startup_cost;
1646 	runion->total_cost = total_cost;
1647 	runion->rows = total_rows;
1648 	runion->pathtarget->width = Max(nrterm->pathtarget->width,
1649 									rterm->pathtarget->width);
1650 }
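
/*
 * Rough worked example (hypothetical inputs, default cost settings): if the
 * nonrecursive term returns 100 rows at total cost 10 and the recursive term
 * is estimated at 100 rows and cost 10 per iteration, the assumed 10
 * iterations give total_rows = 100 + 10 * 100 = 1100 and total_cost =
 * 10 + 10 * 10 = 110, plus the tuplestore charge of cpu_tuple_cost * 1100 =
 * 11 at the default cpu_tuple_cost of 0.01, i.e. about 121 in all.
 */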
1651 
1652 /*
1653  * cost_tuplesort
1654  *	  Determines and returns the cost of sorting a relation using tuplesort,
1655  *	  not including the cost of reading the input data.
1656  *
1657  * If the total volume of data to sort is less than sort_mem, we will do
1658  * an in-memory sort, which requires no I/O and about t*log2(t) tuple
1659  * comparisons for t tuples.
1660  *
1661  * If the total volume exceeds sort_mem, we switch to a tape-style merge
1662  * algorithm.  There will still be about t*log2(t) tuple comparisons in
1663  * total, but we will also need to write and read each tuple once per
1664  * merge pass.  We expect about ceil(logM(r)) merge passes where r is the
1665  * number of initial runs formed and M is the merge order used by tuplesort.c.
1666  * Since the average initial run should be about sort_mem, we have
1667  *		disk traffic = 2 * relsize * ceil(logM(relsize / sort_mem))
1668  *		cpu = comparison_cost * t * log2(t)
1669  *
1670  * If the sort is bounded (i.e., only the first k result tuples are needed)
1671  * and k tuples can fit into sort_mem, we use a heap method that keeps only
1672  * k tuples in the heap; this will require about t*log2(k) tuple comparisons.
1673  *
1674  * The disk traffic is assumed to be 3/4ths sequential and 1/4th random
1675  * accesses (XXX can't we refine that guess?)
1676  *
1677  * By default, we charge two operator evals per tuple comparison, which should
1678  * be in the right ballpark in most cases.  The caller can tweak this by
1679  * specifying nonzero comparison_cost; typically that's used for any extra
1680  * work that has to be done to prepare the inputs to the comparison operators.
1681  *
1682  * 'tuples' is the number of tuples in the relation
1683  * 'width' is the average tuple width in bytes
1684  * 'comparison_cost' is the extra cost per comparison, if any
1685  * 'sort_mem' is the number of kilobytes of work memory allowed for the sort
1686  * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound
1687  */
1688 static void
1689 cost_tuplesort(Cost *startup_cost, Cost *run_cost,
1690 			   double tuples, int width,
1691 			   Cost comparison_cost, int sort_mem,
1692 			   double limit_tuples)
1693 {
1694 	double		input_bytes = relation_byte_size(tuples, width);
1695 	double		output_bytes;
1696 	double		output_tuples;
1697 	long		sort_mem_bytes = sort_mem * 1024L;
1698 
1699 	/*
1700 	 * We want to be sure the cost of a sort is never estimated as zero, even
1701 	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
1702 	 */
1703 	if (tuples < 2.0)
1704 		tuples = 2.0;
1705 
1706 	/* Include the default cost-per-comparison */
1707 	comparison_cost += 2.0 * cpu_operator_cost;
1708 
1709 	/* Do we have a useful LIMIT? */
1710 	if (limit_tuples > 0 && limit_tuples < tuples)
1711 	{
1712 		output_tuples = limit_tuples;
1713 		output_bytes = relation_byte_size(output_tuples, width);
1714 	}
1715 	else
1716 	{
1717 		output_tuples = tuples;
1718 		output_bytes = input_bytes;
1719 	}
1720 
1721 	if (output_bytes > sort_mem_bytes)
1722 	{
1723 		/*
1724 		 * We'll have to use a disk-based sort of all the tuples
1725 		 */
1726 		double		npages = ceil(input_bytes / BLCKSZ);
1727 		double		nruns = input_bytes / sort_mem_bytes;
1728 		double		mergeorder = tuplesort_merge_order(sort_mem_bytes);
1729 		double		log_runs;
1730 		double		npageaccesses;
1731 
1732 		/*
1733 		 * CPU costs
1734 		 *
1735 		 * Assume about N log2 N comparisons
1736 		 */
1737 		*startup_cost = comparison_cost * tuples * LOG2(tuples);
1738 
1739 		/* Disk costs */
1740 
1741 		/* Compute logM(r) as log(r) / log(M) */
1742 		if (nruns > mergeorder)
1743 			log_runs = ceil(log(nruns) / log(mergeorder));
1744 		else
1745 			log_runs = 1.0;
1746 		npageaccesses = 2.0 * npages * log_runs;
1747 		/* Assume 3/4ths of accesses are sequential, 1/4th are not */
1748 		*startup_cost += npageaccesses *
1749 			(seq_page_cost * 0.75 + random_page_cost * 0.25);
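
		/*
		 * Illustrative arithmetic (hypothetical input size, default cost
		 * settings): with seq_page_cost = 1.0 and random_page_cost = 4.0,
		 * the blended per-access charge above is 0.75 * 1.0 + 0.25 * 4.0 =
		 * 1.75, so each merge pass costs about 3.5 * npages.  A 100MB input
		 * (12,800 pages at BLCKSZ = 8192) therefore adds roughly 44,800
		 * cost units per pass.
		 */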
1750 	}
1751 	else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes)
1752 	{
1753 		/*
1754 		 * We'll use a bounded heap-sort keeping just K tuples in memory, for
1755 		 * a total number of tuple comparisons of N log2 K; but the constant
1756 		 * factor is a bit higher than for quicksort.  Tweak it so that the
1757 		 * cost curve is continuous at the crossover point.
1758 		 */
1759 		*startup_cost = comparison_cost * tuples * LOG2(2.0 * output_tuples);
1760 	}
1761 	else
1762 	{
1763 		/* We'll use plain quicksort on all the input tuples */
1764 		*startup_cost = comparison_cost * tuples * LOG2(tuples);
1765 	}
1766 
1767 	/*
1768 	 * Also charge a small amount (arbitrarily set equal to operator cost) per
1769 	 * extracted tuple.  We don't charge cpu_tuple_cost because a Sort node
1770 	 * doesn't do qual-checking or projection, so it has less overhead than
1771 	 * most plan nodes.  Note it's correct to use tuples not output_tuples
1772 	 * here --- the upper LIMIT will pro-rate the run cost so we'd be double
1773 	 * counting the LIMIT otherwise.
1774 	 */
1775 	*run_cost = cpu_operator_cost * tuples;
1776 }
1777 
1778 /*
1779  * cost_incremental_sort
1780  *	  Determines and returns the cost of sorting a relation incrementally,
1781  *	  when the input path is presorted by a prefix of the pathkeys.
1782  *
1783  * 'presorted_keys' is the number of leading pathkeys by which the input path
1784  * is sorted.
1785  *
1786  * We estimate the number of groups into which the relation is divided by the
1787  * leading pathkeys, and then calculate the cost of sorting a single group
1788  * with tuplesort using cost_tuplesort().
1789  */
1790 void
1791 cost_incremental_sort(Path *path,
1792 					  PlannerInfo *root, List *pathkeys, int presorted_keys,
1793 					  Cost input_startup_cost, Cost input_total_cost,
1794 					  double input_tuples, int width, Cost comparison_cost, int sort_mem,
1795 					  double limit_tuples)
1796 {
1797 	Cost		startup_cost = 0,
1798 				run_cost = 0,
1799 				input_run_cost = input_total_cost - input_startup_cost;
1800 	double		group_tuples,
1801 				input_groups;
1802 	Cost		group_startup_cost,
1803 				group_run_cost,
1804 				group_input_run_cost;
1805 	List	   *presortedExprs = NIL;
1806 	ListCell   *l;
1807 	int			i = 0;
1808 	bool		unknown_varno = false;
1809 
1810 	Assert(presorted_keys != 0);
1811 
1812 	/*
1813 	 * We want to be sure the cost of a sort is never estimated as zero, even
1814 	 * if passed-in tuple count is zero.  Besides, mustn't do log(0)...
1815 	 */
1816 	if (input_tuples < 2.0)
1817 		input_tuples = 2.0;
1818 
1819 	/* Default estimate of number of groups, capped to one group per row. */
1820 	input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);
1821 
1822 	/*
1823 	 * Extract presorted keys as list of expressions.
1824 	 *
1825 	 * We need to be careful about Vars containing "varno 0" which might have
1826 	 * been introduced by generate_append_tlist, which would confuse
1827 	 * estimate_num_groups (in fact it'd fail for such expressions). See
1828 	 * recurse_set_operations which has to deal with the same issue.
1829 	 *
1830 	 * Unlike recurse_set_operations we can't access the original target list
1831 	 * here, and even if we could it's not very clear how useful that would be
1832 	 * for a set operation combining multiple tables. So we simply detect if
1833 	 * there are any expressions with "varno 0" and use the default
1834 	 * DEFAULT_NUM_DISTINCT in that case.
1835 	 *
1836 	 * We might also use either 1.0 (a single group) or input_tuples (each row
1837 	 * being a separate group), pretty much the worst and best case for
1838 	 * incremental sort. But those are extreme cases and using something in
1839 	 * between seems reasonable. Furthermore, generate_append_tlist is used
1840 	 * for set operations, which are likely to produce mostly unique output
1841 	 * anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
1842 	 * while maintaining lower startup cost.
1843 	 */
1844 	foreach(l, pathkeys)
1845 	{
1846 		PathKey    *key = (PathKey *) lfirst(l);
1847 		EquivalenceMember *member = (EquivalenceMember *)
1848 		linitial(key->pk_eclass->ec_members);
1849 
1850 		/*
1851 		 * Check if the expression contains Var with "varno 0" so that we
1852 		 * don't call estimate_num_groups in that case.
1853 		 */
1854 		if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
1855 		{
1856 			unknown_varno = true;
1857 			break;
1858 		}
1859 
1860 		/* expression not containing any Vars with "varno 0" */
1861 		presortedExprs = lappend(presortedExprs, member->em_expr);
1862 
1863 		i++;
1864 		if (i >= presorted_keys)
1865 			break;
1866 	}
1867 
1868 	/* Estimate number of groups with equal presorted keys. */
1869 	if (!unknown_varno)
1870 		input_groups = estimate_num_groups(root, presortedExprs, input_tuples, NULL);
1871 
1872 	group_tuples = input_tuples / input_groups;
1873 	group_input_run_cost = input_run_cost / input_groups;
1874 
1875 	/*
1876 	 * Estimate the average cost of sorting one group in which the presorted
1877 	 * keys are equal.  Incremental sort is sensitive to the distribution of
1878 	 * tuples across groups, for which we rely on quite rough assumptions.
1879 	 * Thus, we are pessimistic about incremental sort performance and
1880 	 * inflate its average group size by 50% (the 1.5 factor below).
1881 	 */
1882 	cost_tuplesort(&group_startup_cost, &group_run_cost,
1883 				   1.5 * group_tuples, width, comparison_cost, sort_mem,
1884 				   limit_tuples);
1885 
1886 	/*
1887 	 * Startup cost of incremental sort is the startup cost of its first group
1888 	 * plus the cost of its input.
1889 	 */
1890 	startup_cost += group_startup_cost
1891 		+ input_startup_cost + group_input_run_cost;
1892 
1893 	/*
1894 	 * After we started producing tuples from the first group, the cost of
1895 	 * producing all the tuples is given by the cost to finish processing this
1896 	 * group, plus the total cost to process the remaining groups, plus the
1897 	 * remaining cost of input.
1898 	 */
1899 	run_cost += group_run_cost
1900 		+ (group_run_cost + group_startup_cost) * (input_groups - 1)
1901 		+ group_input_run_cost * (input_groups - 1);
1902 
1903 	/*
1904 	 * Incremental sort adds some overhead by itself. Firstly, it has to
1905 	 * detect the sort groups. This is roughly equal to one extra copy and
1906 	 * comparison per tuple. Secondly, it has to reset the tuplesort context
1907 	 * for every group.
1908 	 */
1909 	run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
1910 	run_cost += 2.0 * cpu_tuple_cost * input_groups;
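
	/*
	 * Rough worked example (hypothetical inputs, default cost settings): for
	 * input_tuples = 10,000 split into input_groups = 100, each group is
	 * costed as a tuplesort of 1.5 * 100 = 150 tuples.  Startup then covers
	 * the first group's sort plus 1/100th of the input run cost, while
	 * run_cost covers the remaining 99 groups plus the group-detection
	 * overhead of (0.01 + comparison_cost) * 10,000 and the tuplesort-reset
	 * overhead of 2 * 0.01 * 100 = 2.
	 */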
1911 
1912 	path->rows = input_tuples;
1913 	path->startup_cost = startup_cost;
1914 	path->total_cost = startup_cost + run_cost;
1915 }
1916 
1917 /*
1918  * cost_sort
1919  *	  Determines and returns the cost of sorting a relation, including
1920  *	  the cost of reading the input data.
1921  *
1922  * NOTE: some callers currently pass NIL for pathkeys because they
1923  * can't conveniently supply the sort keys.  Since this routine doesn't
1924  * currently do anything with pathkeys anyway, that doesn't matter...
1925  * but if it ever does, it should react gracefully to lack of key data.
1926  * (Actually, the thing we'd most likely be interested in is just the number
1927  * of sort keys, which all callers *could* supply.)
1928  */
1929 void
1930 cost_sort(Path *path, PlannerInfo *root,
1931 		  List *pathkeys, Cost input_cost, double tuples, int width,
1932 		  Cost comparison_cost, int sort_mem,
1933 		  double limit_tuples)
1935 {
1936 	Cost		startup_cost;
1937 	Cost		run_cost;
1938 
1939 	cost_tuplesort(&startup_cost, &run_cost,
1940 				   tuples, width,
1941 				   comparison_cost, sort_mem,
1942 				   limit_tuples);
1943 
1944 	if (!enable_sort)
1945 		startup_cost += disable_cost;
1946 
1947 	startup_cost += input_cost;
1948 
1949 	path->rows = tuples;
1950 	path->startup_cost = startup_cost;
1951 	path->total_cost = startup_cost + run_cost;
1952 }
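
/*
 * Illustrative arithmetic (hypothetical inputs, default cost settings):
 * sorting 10,000 tuples that fit within sort_mem, with no caller-supplied
 * comparison_cost and cpu_operator_cost = 0.0025, gives a startup cost of
 * about 2 * 0.0025 * 10,000 * log2(10,000) ~= 665 and a run cost of
 * 0.0025 * 10,000 = 25, on top of input_cost (plus disable_cost if
 * enable_sort is off).
 */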
1953 
1954 /*
1955  * append_nonpartial_cost
1956  *	  Estimate the cost of the non-partial paths in a Parallel Append.
1957  *	  The non-partial paths are assumed to be the first "numpaths" paths
1958  *	  from the subpaths list, and to be in order of decreasing cost.
1959  */
1960 static Cost
1961 append_nonpartial_cost(List *subpaths, int numpaths, int parallel_workers)
1962 {
1963 	Cost	   *costarr;
1964 	int			arrlen;
1965 	ListCell   *l;
1966 	ListCell   *cell;
1967 	int			i;
1968 	int			path_index;
1969 	int			min_index;
1970 	int			max_index;
1971 
1972 	if (numpaths == 0)
1973 		return 0;
1974 
1975 	/*
1976 	 * Array length is number of workers or number of relevant paths,
1977 	 * whichever is less.
1978 	 */
1979 	arrlen = Min(parallel_workers, numpaths);
1980 	costarr = (Cost *) palloc(sizeof(Cost) * arrlen);
1981 
1982 	/* The first few paths will each be claimed by a different worker. */
1983 	path_index = 0;
1984 	foreach(cell, subpaths)
1985 	{
1986 		Path	   *subpath = (Path *) lfirst(cell);
1987 
1988 		if (path_index == arrlen)
1989 			break;
1990 		costarr[path_index++] = subpath->total_cost;
1991 	}
1992 
1993 	/*
1994 	 * Since subpaths are sorted by decreasing cost, the last one will have
1995 	 * the minimum cost.
1996 	 */
1997 	min_index = arrlen - 1;
1998 
1999 	/*
2000 	 * For each of the remaining subpaths, add its cost to the array element
2001 	 * with minimum cost.
2002 	 */
2003 	for_each_cell(l, subpaths, cell)
2004 	{
2005 		Path	   *subpath = (Path *) lfirst(l);
2006 		int			i;
2007 
2008 		/* Consider only the non-partial paths */
2009 		if (path_index++ == numpaths)
2010 			break;
2011 
2012 		costarr[min_index] += subpath->total_cost;
2013 
2014 		/* Update the new min cost array index */
2015 		for (min_index = i = 0; i < arrlen; i++)
2016 		{
2017 			if (costarr[i] < costarr[min_index])
2018 				min_index = i;
2019 		}
2020 	}
2021 
2022 	/* Return the highest cost from the array */
2023 	for (max_index = i = 0; i < arrlen; i++)
2024 	{
2025 		if (costarr[i] > costarr[max_index])
2026 			max_index = i;
2027 	}
2028 
2029 	return costarr[max_index];
2030 }
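
/*
 * Rough worked example (hypothetical costs): with five non-partial subpaths
 * costing {20, 16, 10, 8, 6} and parallel_workers = 3, the first three claim
 * separate slots [20, 16, 10]; 8 is then added to the cheapest slot, giving
 * [20, 16, 18], and 6 to the new cheapest, giving [20, 22, 18].  The function
 * returns the busiest slot, 22, as the estimated completion cost of the
 * non-partial work.
 */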
2031 
2032 /*
2033  * cost_append
2034  *	  Determines and returns the cost of an Append node.
2035  */
2036 void
2037 cost_append(AppendPath *apath)
2038 {
2039 	ListCell   *l;
2040 
2041 	apath->path.startup_cost = 0;
2042 	apath->path.total_cost = 0;
2043 	apath->path.rows = 0;
2044 
2045 	if (apath->subpaths == NIL)
2046 		return;
2047 
2048 	if (!apath->path.parallel_aware)
2049 	{
2050 		List	   *pathkeys = apath->path.pathkeys;
2051 
2052 		if (pathkeys == NIL)
2053 		{
2054 			Path	   *subpath = (Path *) linitial(apath->subpaths);
2055 
2056 			/*
2057 			 * For an unordered, non-parallel-aware Append we take the startup
2058 			 * cost as the startup cost of the first subpath.
2059 			 */
2060 			apath->path.startup_cost = subpath->startup_cost;
2061 
2062 			/* Compute rows and costs as sums of subplan rows and costs. */
2063 			foreach(l, apath->subpaths)
2064 			{
2065 				Path	   *subpath = (Path *) lfirst(l);
2066 
2067 				apath->path.rows += subpath->rows;
2068 				apath->path.total_cost += subpath->total_cost;
2069 			}
2070 		}
2071 		else
2072 		{
2073 			/*
2074 			 * For an ordered, non-parallel-aware Append we take the startup
2075 			 * cost as the sum of the subpath startup costs.  This ensures
2076 			 * that we don't underestimate the startup cost when a query's
2077 			 * LIMIT is such that several of the children have to be run to
2078 			 * satisfy it.  This might be overkill --- another plausible hack
2079 			 * would be to take the Append's startup cost as the maximum of
2080 			 * the child startup costs.  But we don't want to risk believing
2081 			 * that an ORDER BY LIMIT query can be satisfied at small cost
2082 			 * when the first child has small startup cost but later ones
2083 			 * don't.  (If we had the ability to deal with nonlinear cost
2084 			 * interpolation for partial retrievals, we would not need to be
2085 			 * so conservative about this.)
2086 			 *
2087 			 * This case is also different from the above in that we have to
2088 			 * account for possibly injecting sorts into subpaths that aren't
2089 			 * natively ordered.
2090 			 */
2091 			foreach(l, apath->subpaths)
2092 			{
2093 				Path	   *subpath = (Path *) lfirst(l);
2094 				Path		sort_path;	/* dummy for result of cost_sort */
2095 
2096 				if (!pathkeys_contained_in(pathkeys, subpath->pathkeys))
2097 				{
2098 					/*
2099 					 * We'll need to insert a Sort node, so include costs for
2100 					 * that.  We can use the parent's LIMIT if any, since we
2101 					 * certainly won't pull more than that many tuples from
2102 					 * any child.
2103 					 */
2104 					cost_sort(&sort_path,
2105 							  NULL, /* doesn't currently need root */
2106 							  pathkeys,
2107 							  subpath->total_cost,
2108 							  subpath->rows,
2109 							  subpath->pathtarget->width,
2110 							  0.0,
2111 							  work_mem,
2112 							  apath->limit_tuples);
2113 					subpath = &sort_path;
2114 				}
2115 
2116 				apath->path.rows += subpath->rows;
2117 				apath->path.startup_cost += subpath->startup_cost;
2118 				apath->path.total_cost += subpath->total_cost;
2119 			}
2120 		}
2121 	}
2122 	else						/* parallel-aware */
2123 	{
2124 		int			i = 0;
2125 		double		parallel_divisor = get_parallel_divisor(&apath->path);
2126 
2127 		/* Parallel-aware Append never produces ordered output. */
2128 		Assert(apath->path.pathkeys == NIL);
2129 
2130 		/* Calculate startup cost. */
2131 		foreach(l, apath->subpaths)
2132 		{
2133 			Path	   *subpath = (Path *) lfirst(l);
2134 
2135 			/*
2136 			 * Append will start returning tuples when the child node with the
2137 			 * lowest startup cost is done setting up.  We consider only the
2138 			 * first few subplans that immediately get a worker assigned.
2139 			 */
2140 			if (i == 0)
2141 				apath->path.startup_cost = subpath->startup_cost;
2142 			else if (i < apath->path.parallel_workers)
2143 				apath->path.startup_cost = Min(apath->path.startup_cost,
2144 											   subpath->startup_cost);
2145 
2146 			/*
2147 			 * Apply parallel divisor to subpaths.  Scale the number of rows
2148 			 * for each partial subpath based on the ratio of the parallel
2149 			 * divisor originally used for the subpath to the one we adopted.
2150 			 * Also add the cost of partial paths to the total cost, but
2151 			 * ignore non-partial paths for now.
2152 			 */
2153 			if (i < apath->first_partial_path)
2154 				apath->path.rows += subpath->rows / parallel_divisor;
2155 			else
2156 			{
2157 				double		subpath_parallel_divisor;
2158 
2159 				subpath_parallel_divisor = get_parallel_divisor(subpath);
2160 				apath->path.rows += subpath->rows * (subpath_parallel_divisor /
2161 													 parallel_divisor);
2162 				apath->path.total_cost += subpath->total_cost;
2163 			}
2164 
2165 			apath->path.rows = clamp_row_est(apath->path.rows);
2166 
2167 			i++;
2168 		}
2169 
2170 		/* Add cost for non-partial subpaths. */
2171 		apath->path.total_cost +=
2172 			append_nonpartial_cost(apath->subpaths,
2173 								   apath->first_partial_path,
2174 								   apath->path.parallel_workers);
2175 	}
2176 
2177 	/*
2178 	 * Although Append does not do any selection or projection, it's not free;
2179 	 * add a small per-tuple overhead.
2180 	 */
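	/*
	 * Illustrative arithmetic: APPEND_CPU_COST_MULTIPLIER is 0.5 at this
	 * writing, so with the default cpu_tuple_cost = 0.01 an Append over a
	 * hypothetical 100,000 rows is charged an extra 0.01 * 0.5 * 100,000 =
	 * 500 cost units here.
	 */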
2181 	apath->path.total_cost +=
2182 		cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * apath->path.rows;
2183 }
2184 
2185 /*
2186  * cost_merge_append
2187  *	  Determines and returns the cost of a MergeAppend node.
2188  *
2189  * MergeAppend merges several pre-sorted input streams, using a heap that
2190  * at any given instant holds the next tuple from each stream.  If there
2191  * are N streams, we need about N*log2(N) tuple comparisons to construct
2192  * the heap at startup, and then for each output tuple, about log2(N)
2193  * comparisons to replace the top entry.
2194  *
2195  * (The effective value of N will drop once some of the input streams are
2196  * exhausted, but it seems unlikely to be worth trying to account for that.)
2197  *
2198  * The heap is never spilled to disk, since we assume N is not very large.
2199  * So this is much simpler than cost_sort.
2200  *
2201  * As in cost_sort, we charge two operator evals per tuple comparison.
2202  *
2203  * 'pathkeys' is a list of sort keys
2204  * 'n_streams' is the number of input streams
2205  * 'input_startup_cost' is the sum of the input streams' startup costs
2206  * 'input_total_cost' is the sum of the input streams' total costs
2207  * 'tuples' is the number of tuples in all the streams
2208  */
2209 void
2210 cost_merge_append(Path *path, PlannerInfo *root,
2211 				  List *pathkeys, int n_streams,
2212 				  Cost input_startup_cost, Cost input_total_cost,
2213 				  double tuples)
2214 {
2215 	Cost		startup_cost = 0;
2216 	Cost		run_cost = 0;
2217 	Cost		comparison_cost;
2218 	double		N;
2219 	double		logN;
2220 
2221 	/*
2222 	 * Avoid log(0)...
2223 	 */
2224 	N = (n_streams < 2) ? 2.0 : (double) n_streams;
2225 	logN = LOG2(N);
2226 
2227 	/* Assumed cost per tuple comparison */
2228 	comparison_cost = 2.0 * cpu_operator_cost;
2229 
2230 	/* Heap creation cost */
2231 	startup_cost += comparison_cost * N * logN;
2232 
2233 	/* Per-tuple heap maintenance cost */
2234 	run_cost += tuples * comparison_cost * logN;
2235 
2236 	/*
2237 	 * Although MergeAppend does not do any selection or projection, it's not
2238 	 * free; add a small per-tuple overhead.
2239 	 */
2240 	run_cost += cpu_tuple_cost * APPEND_CPU_COST_MULTIPLIER * tuples;
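
	/*
	 * Rough worked example (hypothetical inputs, default cost settings):
	 * merging N = 4 streams gives logN = 2, so heap creation costs
	 * 0.005 * 4 * 2 = 0.04 and each output tuple costs 0.005 * 2 = 0.01 for
	 * heap maintenance plus 0.01 * 0.5 = 0.005 of Append-style overhead,
	 * i.e. about 1,500 in run cost for 100,000 tuples.
	 */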
2241 
2242 	path->startup_cost = startup_cost + input_startup_cost;
2243 	path->total_cost = startup_cost + run_cost + input_total_cost;
2244 }
2245 
2246 /*
2247  * cost_material
2248  *	  Determines and returns the cost of materializing a relation, including
2249  *	  the cost of reading the input data.
2250  *
2251  * If the total volume of data to materialize exceeds work_mem, we will need
2252  * to write it to disk, so the cost is much higher in that case.
2253  *
2254  * Note that here we are estimating the costs for the first scan of the
2255  * relation, so the materialization is all overhead --- any savings will
2256  * occur only on rescan, which is estimated in cost_rescan.
2257  */
2258 void
2259 cost_material(Path *path,
2260 			  Cost input_startup_cost, Cost input_total_cost,
2261 			  double tuples, int width)
2262 {
2263 	Cost		startup_cost = input_startup_cost;
2264 	Cost		run_cost = input_total_cost - input_startup_cost;
2265 	double		nbytes = relation_byte_size(tuples, width);
2266 	long		work_mem_bytes = work_mem * 1024L;
2267 
2268 	path->rows = tuples;
2269 
2270 	/*
2271 	 * Whether spilling or not, charge 2x cpu_operator_cost per tuple to
2272 	 * reflect bookkeeping overhead.  (This rate must be more than what
2273 	 * cost_rescan charges for materialize, ie, cpu_operator_cost per tuple;
2274 	 * if it is exactly the same then there will be a cost tie between
2275 	 * nestloop with A outer, materialized B inner and nestloop with B outer,
2276 	 * materialized A inner.  The extra cost ensures we'll prefer
2277 	 * materializing the smaller rel.)	Note that this is normally a good deal
2278 	 * less than cpu_tuple_cost, which is OK because a Material plan node
2279 	 * doesn't do qual-checking or projection, so it's got less overhead than
2280 	 * most plan nodes.
2281 	 */
2282 	run_cost += 2 * cpu_operator_cost * tuples;
2283 
2284 	/*
2285 	 * If we will spill to disk, charge at the rate of seq_page_cost per page.
2286 	 * This cost is assumed to be evenly spread through the plan run phase,
2287 	 * which isn't exactly accurate but our cost model doesn't allow for
2288 	 * nonuniform costs within the run phase.
2289 	 */
2290 	if (nbytes > work_mem_bytes)
2291 	{
2292 		double		npages = ceil(nbytes / BLCKSZ);
2293 
2294 		run_cost += seq_page_cost * npages;
2295 	}
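
	/*
	 * Illustrative numbers (hypothetical inputs, default settings):
	 * materializing 1,000,000 tuples of width 100 comes to well over 100MB
	 * once relation_byte_size adds per-tuple header overhead, far exceeding
	 * the default 4MB work_mem; at BLCKSZ = 8192 that is roughly 15,000
	 * pages, so about 15,000 extra cost units at seq_page_cost = 1.0, on top
	 * of the 2 * 0.0025 * 1,000,000 = 5,000 bookkeeping charge above.
	 */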
2296 
2297 	path->startup_cost = startup_cost;
2298 	path->total_cost = startup_cost + run_cost;
2299 }
2300 
2301 /*
2302  * cost_agg
2303  *		Determines and returns the cost of performing an Agg plan node,
2304  *		including the cost of its input.
2305  *
2306  * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
2307  * we are using a hashed Agg node just to do grouping).
2308  *
2309  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
2310  * are for appropriately-sorted input.
2311  */
2312 void
2313 cost_agg(Path *path, PlannerInfo *root,
2314 		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
2315 		 int numGroupCols, double numGroups,
2316 		 List *quals,
2317 		 Cost input_startup_cost, Cost input_total_cost,
2318 		 double input_tuples, double input_width)
2319 {
2320 	double		output_tuples;
2321 	Cost		startup_cost;
2322 	Cost		total_cost;
2323 	AggClauseCosts dummy_aggcosts;
2324 
2325 	/* Use all-zero per-aggregate costs if NULL is passed */
2326 	if (aggcosts == NULL)
2327 	{
2328 		Assert(aggstrategy == AGG_HASHED);
2329 		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
2330 		aggcosts = &dummy_aggcosts;
2331 	}
2332 
2333 	/*
2334 	 * The transCost.per_tuple component of aggcosts should be charged once
2335 	 * per input tuple, corresponding to the costs of evaluating the aggregate
2336 	 * transfns and their input expressions. The finalCost.per_tuple component
2337 	 * is charged once per output tuple, corresponding to the costs of
2338 	 * evaluating the finalfns.  Startup costs are of course charged but once.
2339 	 *
2340 	 * If we are grouping, we charge an additional cpu_operator_cost per
2341 	 * grouping column per input tuple for grouping comparisons.
2342 	 *
2343 	 * We will produce a single output tuple if not grouping, and a tuple per
2344 	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
2345 	 *
2346 	 * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the
2347 	 * same total CPU cost, but AGG_SORTED has lower startup cost.  If the
2348 	 * input path is already sorted appropriately, AGG_SORTED should be
2349 	 * preferred (since it has no risk of memory overflow).  This will happen
2350 	 * as long as the computed total costs are indeed exactly equal --- but if
2351 	 * there's roundoff error we might do the wrong thing.  So be sure that
2352 	 * the computations below form the same intermediate values in the same
2353 	 * order.
2354 	 */
2355 	if (aggstrategy == AGG_PLAIN)
2356 	{
2357 		startup_cost = input_total_cost;
2358 		startup_cost += aggcosts->transCost.startup;
2359 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2360 		startup_cost += aggcosts->finalCost.startup;
2361 		startup_cost += aggcosts->finalCost.per_tuple;
2362 		/* we aren't grouping */
2363 		total_cost = startup_cost + cpu_tuple_cost;
2364 		output_tuples = 1;
2365 	}
2366 	else if (aggstrategy == AGG_SORTED || aggstrategy == AGG_MIXED)
2367 	{
2368 		/* Here we are able to deliver output on-the-fly */
2369 		startup_cost = input_startup_cost;
2370 		total_cost = input_total_cost;
2371 		if (aggstrategy == AGG_MIXED && !enable_hashagg)
2372 		{
2373 			startup_cost += disable_cost;
2374 			total_cost += disable_cost;
2375 		}
2376 		/* calcs phrased this way to match HASHED case, see note above */
2377 		total_cost += aggcosts->transCost.startup;
2378 		total_cost += aggcosts->transCost.per_tuple * input_tuples;
2379 		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2380 		total_cost += aggcosts->finalCost.startup;
2381 		total_cost += aggcosts->finalCost.per_tuple * numGroups;
2382 		total_cost += cpu_tuple_cost * numGroups;
2383 		output_tuples = numGroups;
2384 	}
2385 	else
2386 	{
2387 		/* must be AGG_HASHED */
2388 		startup_cost = input_total_cost;
2389 		if (!enable_hashagg)
2390 			startup_cost += disable_cost;
2391 		startup_cost += aggcosts->transCost.startup;
2392 		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
2393 		/* cost of computing hash value */
2394 		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
2395 		startup_cost += aggcosts->finalCost.startup;
2396 
2397 		total_cost = startup_cost;
2398 		total_cost += aggcosts->finalCost.per_tuple * numGroups;
2399 		/* cost of retrieving from hash table */
2400 		total_cost += cpu_tuple_cost * numGroups;
2401 		output_tuples = numGroups;
2402 	}
2403 
2404 	/*
2405 	 * Add the disk costs of hash aggregation that spills to disk.
2406 	 *
2407 	 * Groups that go into the hash table stay in memory until finalized, so
2408 	 * spilling and reprocessing tuples doesn't incur additional invocations
2409 	 * of transCost or finalCost. Furthermore, the computed hash value is
2410 	 * stored with the spilled tuples, so we don't incur extra invocations of
2411 	 * the hash function.
2412 	 *
2413 	 * Hash Agg begins returning tuples after the first batch is complete.
2414 	 * Accrue writes (spilled tuples) to startup_cost and to total_cost;
2415 	 * accrue reads only to total_cost.
2416 	 */
2417 	if (aggstrategy == AGG_HASHED || aggstrategy == AGG_MIXED)
2418 	{
2419 		double		pages;
2420 		double		pages_written = 0.0;
2421 		double		pages_read = 0.0;
2422 		double		spill_cost;
2423 		double		hashentrysize;
2424 		double		nbatches;
2425 		Size		mem_limit;
2426 		uint64		ngroups_limit;
2427 		int			num_partitions;
2428 		int			depth;
2429 
2430 		/*
2431 		 * Estimate number of batches based on the computed limits. If less
2432 		 * than or equal to one, all groups are expected to fit in memory;
2433 		 * otherwise we expect to spill.
2434 		 */
2435 		hashentrysize = hash_agg_entry_size(aggcosts->numAggs, input_width,
2436 											aggcosts->transitionSpace);
2437 		hash_agg_set_limits(hashentrysize, numGroups, 0, &mem_limit,
2438 							&ngroups_limit, &num_partitions);
2439 
2440 		nbatches = Max((numGroups * hashentrysize) / mem_limit,
2441 					   numGroups / ngroups_limit);
2442 
2443 		nbatches = Max(ceil(nbatches), 1.0);
2444 		num_partitions = Max(num_partitions, 2);
2445 
2446 		/*
2447 		 * The number of partitions can change at different levels of
2448 		 * recursion; but for the purposes of this calculation assume it stays
2449 		 * constant.
2450 		 */
2451 		depth = ceil(log(nbatches) / log(num_partitions));
2452 
2453 		/*
2454 		 * Estimate number of pages read and written. For each level of
2455 		 * recursion, a tuple must be written and then later read.
2456 		 */
2457 		pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
2458 		pages_written = pages_read = pages * depth;
2459 
2460 		/*
2461 		 * HashAgg has somewhat worse IO behavior than Sort on typical
2462 		 * hardware/OS combinations. Account for this with a generic penalty.
2463 		 */
2464 		pages_read *= 2.0;
2465 		pages_written *= 2.0;
2466 
2467 		startup_cost += pages_written * random_page_cost;
2468 		total_cost += pages_written * random_page_cost;
2469 		total_cost += pages_read * seq_page_cost;
2470 
2471 		/* account for CPU cost of spilling a tuple and reading it back */
2472 		spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
2473 		startup_cost += spill_cost;
2474 		total_cost += spill_cost;
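
		/*
		 * Illustrative numbers (hypothetical inputs, default settings):
		 * with depth = 1 and an input of 1,000,000 tuples of width 100
		 * (roughly 15,000 pages counting per-tuple overhead), the 2x
		 * penalty brings pages_written and pages_read to ~30,000 each,
		 * adding ~30,000 * 4.0 = 120,000 to both startup and total cost for
		 * writes, ~30,000 * 1.0 for reads, and a CPU spill_cost of
		 * 1 * 1,000,000 * 2 * 0.01 = 20,000.
		 */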
2475 	}
2476 
2477 	/*
2478 	 * If there are quals (HAVING quals), account for their cost and
2479 	 * selectivity.
2480 	 */
2481 	if (quals)
2482 	{
2483 		QualCost	qual_cost;
2484 
2485 		cost_qual_eval(&qual_cost, quals, root);
2486 		startup_cost += qual_cost.startup;
2487 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2488 
2489 		output_tuples = clamp_row_est(output_tuples *
2490 									  clauselist_selectivity(root,
2491 															 quals,
2492 															 0,
2493 															 JOIN_INNER,
2494 															 NULL));
2495 	}
2496 
2497 	path->rows = output_tuples;
2498 	path->startup_cost = startup_cost;
2499 	path->total_cost = total_cost;
2500 }
2501 
2502 /*
2503  * cost_windowagg
2504  *		Determines and returns the cost of performing a WindowAgg plan node,
2505  *		including the cost of its input.
2506  *
2507  * Input is assumed already properly sorted.
2508  */
2509 void
2510 cost_windowagg(Path *path, PlannerInfo *root,
2511 			   List *windowFuncs, int numPartCols, int numOrderCols,
2512 			   Cost input_startup_cost, Cost input_total_cost,
2513 			   double input_tuples)
2514 {
2515 	Cost		startup_cost;
2516 	Cost		total_cost;
2517 	ListCell   *lc;
2518 
2519 	startup_cost = input_startup_cost;
2520 	total_cost = input_total_cost;
2521 
2522 	/*
2523 	 * Window functions are assumed to cost their stated execution cost, plus
2524 	 * the cost of evaluating their input expressions, per tuple.  Since they
2525 	 * may in fact evaluate their inputs at multiple rows during each cycle,
2526 	 * this could be a drastic underestimate; but without a way to know how
2527 	 * many rows the window function will fetch, it's hard to do better.  In
2528 	 * any case, it's a good estimate for all the built-in window functions,
2529 	 * so we'll just do this for now.
2530 	 */
2531 	foreach(lc, windowFuncs)
2532 	{
2533 		WindowFunc *wfunc = lfirst_node(WindowFunc, lc);
2534 		Cost		wfunccost;
2535 		QualCost	argcosts;
2536 
2537 		argcosts.startup = argcosts.per_tuple = 0;
2538 		add_function_cost(root, wfunc->winfnoid, (Node *) wfunc,
2539 						  &argcosts);
2540 		startup_cost += argcosts.startup;
2541 		wfunccost = argcosts.per_tuple;
2542 
2543 		/* also add the input expressions' cost to per-input-row costs */
2544 		cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
2545 		startup_cost += argcosts.startup;
2546 		wfunccost += argcosts.per_tuple;
2547 
2548 		/*
2549 		 * Add the filter's cost to per-input-row costs.  XXX We should reduce
2550 		 * input expression costs according to filter selectivity.
2551 		 */
2552 		cost_qual_eval_node(&argcosts, (Node *) wfunc->aggfilter, root);
2553 		startup_cost += argcosts.startup;
2554 		wfunccost += argcosts.per_tuple;
2555 
2556 		total_cost += wfunccost * input_tuples;
2557 	}
2558 
2559 	/*
2560 	 * We also charge cpu_operator_cost per grouping column per tuple for
2561 	 * grouping comparisons, plus cpu_tuple_cost per tuple for general
2562 	 * overhead.
2563 	 *
2564 	 * XXX this neglects costs of spooling the data to disk when it overflows
2565 	 * work_mem.  Sooner or later that should get accounted for.
2566 	 */
2567 	total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
2568 	total_cost += cpu_tuple_cost * input_tuples;
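
	/*
	 * Rough worked example (hypothetical inputs, default cost settings):
	 * with three partitioning plus ordering columns and 10,000 input tuples,
	 * the charges above add 0.0025 * 3 * 10,000 = 75 for grouping
	 * comparisons and 0.01 * 10,000 = 100 of general overhead, on top of the
	 * per-row window function costs accumulated in the loop.
	 */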
2569 
2570 	path->rows = input_tuples;
2571 	path->startup_cost = startup_cost;
2572 	path->total_cost = total_cost;
2573 }
2574 
2575 /*
2576  * cost_group
2577  *		Determines and returns the cost of performing a Group plan node,
2578  *		including the cost of its input.
2579  *
2580  * Note: caller must ensure that input costs are for appropriately-sorted
2581  * input.
2582  */
2583 void
2584 cost_group(Path *path, PlannerInfo *root,
2585 		   int numGroupCols, double numGroups,
2586 		   List *quals,
2587 		   Cost input_startup_cost, Cost input_total_cost,
2588 		   double input_tuples)
2589 {
2590 	double		output_tuples;
2591 	Cost		startup_cost;
2592 	Cost		total_cost;
2593 
2594 	output_tuples = numGroups;
2595 	startup_cost = input_startup_cost;
2596 	total_cost = input_total_cost;
2597 
2598 	/*
2599 	 * Charge one cpu_operator_cost per comparison per input tuple. We assume
2600 	 * all the grouping columns get compared for most of the input tuples.
2601 	 */
2602 	total_cost += cpu_operator_cost * input_tuples * numGroupCols;
2603 
2604 	/*
2605 	 * If there are quals (HAVING quals), account for their cost and
2606 	 * selectivity.
2607 	 */
2608 	if (quals)
2609 	{
2610 		QualCost	qual_cost;
2611 
2612 		cost_qual_eval(&qual_cost, quals, root);
2613 		startup_cost += qual_cost.startup;
2614 		total_cost += qual_cost.startup + output_tuples * qual_cost.per_tuple;
2615 
2616 		output_tuples = clamp_row_est(output_tuples *
2617 									  clauselist_selectivity(root,
2618 															 quals,
2619 															 0,
2620 															 JOIN_INNER,
2621 															 NULL));
2622 	}
2623 
2624 	path->rows = output_tuples;
2625 	path->startup_cost = startup_cost;
2626 	path->total_cost = total_cost;
2627 }
2628 
2629 /*
2630  * initial_cost_nestloop
2631  *	  Preliminary estimate of the cost of a nestloop join path.
2632  *
2633  * This must quickly produce lower-bound estimates of the path's startup and
2634  * total costs.  If we are unable to eliminate the proposed path from
2635  * consideration using the lower bounds, final_cost_nestloop will be called
2636  * to obtain the final estimates.
2637  *
2638  * The exact division of labor between this function and final_cost_nestloop
2639  * is private to them, and represents a tradeoff between speed of the initial
2640  * estimate and getting a tight lower bound.  We choose to not examine the
2641  * join quals here, since that's by far the most expensive part of the
2642  * calculations.  The end result is that CPU-cost considerations must be
2643  * left for the second phase; and for SEMI/ANTI joins, we must also postpone
2644  * incorporation of the inner path's run cost.
2645  *
2646  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2647  *		other data to be used by final_cost_nestloop
2648  * 'jointype' is the type of join to be performed
2649  * 'outer_path' is the outer input to the join
2650  * 'inner_path' is the inner input to the join
2651  * 'extra' contains miscellaneous information about the join
2652  */
2653 void
2654 initial_cost_nestloop(PlannerInfo *root, JoinCostWorkspace *workspace,
2655 					  JoinType jointype,
2656 					  Path *outer_path, Path *inner_path,
2657 					  JoinPathExtraData *extra)
2658 {
2659 	Cost		startup_cost = 0;
2660 	Cost		run_cost = 0;
2661 	double		outer_path_rows = outer_path->rows;
2662 	Cost		inner_rescan_start_cost;
2663 	Cost		inner_rescan_total_cost;
2664 	Cost		inner_run_cost;
2665 	Cost		inner_rescan_run_cost;
2666 
2667 	/* estimate costs to rescan the inner relation */
2668 	cost_rescan(root, inner_path,
2669 				&inner_rescan_start_cost,
2670 				&inner_rescan_total_cost);
2671 
2672 	/* cost of source data */
2673 
2674 	/*
2675 	 * NOTE: clearly, we must pay both outer and inner paths' startup_cost
2676 	 * before we can start returning tuples, so the join's startup cost is
2677 	 * their sum.  We'll also pay the inner path's rescan startup cost
2678 	 * multiple times.
2679 	 */
2680 	startup_cost += outer_path->startup_cost + inner_path->startup_cost;
2681 	run_cost += outer_path->total_cost - outer_path->startup_cost;
2682 	if (outer_path_rows > 1)
2683 		run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
2684 
2685 	inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
2686 	inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
2687 
2688 	if (jointype == JOIN_SEMI || jointype == JOIN_ANTI ||
2689 		extra->inner_unique)
2690 	{
2691 		/*
2692 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2693 		 * executor will stop after the first match.
2694 		 *
2695 		 * Getting decent estimates requires inspection of the join quals,
2696 		 * which we choose to postpone to final_cost_nestloop.
2697 		 */
2698 
2699 		/* Save private data for final_cost_nestloop */
2700 		workspace->inner_run_cost = inner_run_cost;
2701 		workspace->inner_rescan_run_cost = inner_rescan_run_cost;
2702 	}
2703 	else
2704 	{
2705 		/* Normal case; we'll scan whole input rel for each outer row */
2706 		run_cost += inner_run_cost;
2707 		if (outer_path_rows > 1)
2708 			run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
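
		/*
		 * Illustrative numbers (hypothetical inputs): for 1,000 outer rows
		 * and an inner path whose run cost is 50, with cost_rescan reporting
		 * no savings on rescan, the inner side contributes 50 + 999 * 50 =
		 * 50,000 here (the rescan startup charges were already added above).
		 */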
2709 	}
2710 
2711 	/* CPU costs left for later */
2712 
2713 	/* Public result fields */
2714 	workspace->startup_cost = startup_cost;
2715 	workspace->total_cost = startup_cost + run_cost;
2716 	/* Save private data for final_cost_nestloop */
2717 	workspace->run_cost = run_cost;
2718 }
2719 
2720 /*
2721  * final_cost_nestloop
2722  *	  Final estimate of the cost and result size of a nestloop join path.
2723  *
2724  * 'path' is already filled in except for the rows and cost fields
2725  * 'workspace' is the result from initial_cost_nestloop
2726  * 'extra' contains miscellaneous information about the join
2727  */
2728 void
2729 final_cost_nestloop(PlannerInfo *root, NestPath *path,
2730 					JoinCostWorkspace *workspace,
2731 					JoinPathExtraData *extra)
2732 {
2733 	Path	   *outer_path = path->outerjoinpath;
2734 	Path	   *inner_path = path->innerjoinpath;
2735 	double		outer_path_rows = outer_path->rows;
2736 	double		inner_path_rows = inner_path->rows;
2737 	Cost		startup_cost = workspace->startup_cost;
2738 	Cost		run_cost = workspace->run_cost;
2739 	Cost		cpu_per_tuple;
2740 	QualCost	restrict_qual_cost;
2741 	double		ntuples;
2742 
2743 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2744 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2745 		outer_path_rows = 1;
2746 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2747 		inner_path_rows = 1;
2748 
2749 	/* Mark the path with the correct row estimate */
2750 	if (path->path.param_info)
2751 		path->path.rows = path->path.param_info->ppi_rows;
2752 	else
2753 		path->path.rows = path->path.parent->rows;
2754 
2755 	/* For partial paths, scale row estimate. */
2756 	if (path->path.parallel_workers > 0)
2757 	{
2758 		double		parallel_divisor = get_parallel_divisor(&path->path);
2759 
2760 		path->path.rows =
2761 			clamp_row_est(path->path.rows / parallel_divisor);
2762 	}
2763 
2764 	/*
2765 	 * We could include disable_cost in the preliminary estimate, but that
2766 	 * would amount to optimizing for the case where the join method is
2767 	 * disabled, which doesn't seem like the way to bet.
2768 	 */
2769 	if (!enable_nestloop)
2770 		startup_cost += disable_cost;
2771 
2772 	/* cost of inner-relation source data (we already dealt with outer rel) */
2773 
2774 	if (path->jointype == JOIN_SEMI || path->jointype == JOIN_ANTI ||
2775 		extra->inner_unique)
2776 	{
2777 		/*
2778 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
2779 		 * executor will stop after the first match.
2780 		 */
2781 		Cost		inner_run_cost = workspace->inner_run_cost;
2782 		Cost		inner_rescan_run_cost = workspace->inner_rescan_run_cost;
2783 		double		outer_matched_rows;
2784 		double		outer_unmatched_rows;
2785 		Selectivity inner_scan_frac;
2786 
2787 		/*
2788 		 * For an outer-rel row that has at least one match, we can expect the
2789 		 * inner scan to stop after a fraction 1/(match_count+1) of the inner
2790 		 * rows, if the matches are evenly distributed.  Since they probably
2791 		 * aren't quite evenly distributed, we apply a fuzz factor of 2.0 to
2792 		 * that fraction.  (If we used a larger fuzz factor, we'd have to
2793 		 * clamp inner_scan_frac to at most 1.0; but since match_count is at
2794 		 * least 1, no such clamp is needed now.)
2795 		 */
2796 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
2797 		outer_unmatched_rows = outer_path_rows - outer_matched_rows;
2798 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
2799 
2800 		/*
2801 		 * Compute number of tuples processed (not number emitted!).  First,
2802 		 * account for successfully-matched outer rows.
2803 		 */
2804 		ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
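
		/*
		 * Rough worked example (hypothetical selectivities): with 1,000
		 * outer rows, outer_match_frac = 0.5 and match_count = 4, we get
		 * outer_matched_rows = 500 and inner_scan_frac = 2.0 / 5.0 = 0.4;
		 * for 10,000 inner rows that makes ntuples = 500 * 10,000 * 0.4 =
		 * 2,000,000 tuple visits charged for the matched outer rows.
		 */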
2805 
2806 		/*
2807 		 * Now we need to estimate the actual costs of scanning the inner
2808 		 * relation, which may be quite a bit less than N times inner_run_cost
2809 		 * due to early scan stops.  We consider two cases.  If the inner path
2810 		 * is an indexscan using all the joinquals as indexquals, then an
2811 		 * unmatched outer row results in an indexscan returning no rows,
2812 		 * which is probably quite cheap.  Otherwise, the executor will have
2813 		 * to scan the whole inner rel for an unmatched row; not so cheap.
2814 		 */
2815 		if (has_indexed_join_quals(path))
2816 		{
2817 			/*
2818 			 * Successfully-matched outer rows will only require scanning
2819 			 * inner_scan_frac of the inner relation.  In this case, we don't
2820 			 * need to charge the full inner_run_cost even when that's more
2821 			 * than inner_rescan_run_cost, because we can assume that none of
2822 			 * the inner scans ever scan the whole inner relation.  So it's
2823 			 * okay to assume that all the inner scan executions can be
2824 			 * fractions of the full cost, even if materialization is reducing
2825 			 * the rescan cost.  At this writing, it's impossible to get here
2826 			 * for a materialized inner scan, so inner_run_cost and
2827 			 * inner_rescan_run_cost will be the same anyway; but just in
2828 			 * case, use inner_run_cost for the first matched tuple and
2829 			 * inner_rescan_run_cost for additional ones.
2830 			 */
2831 			run_cost += inner_run_cost * inner_scan_frac;
2832 			if (outer_matched_rows > 1)
2833 				run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
2834 
2835 			/*
2836 			 * Add the cost of inner-scan executions for unmatched outer rows.
2837 			 * We estimate this as the same cost as returning the first tuple
2838 			 * of a nonempty scan.  We consider that these are all rescans,
2839 			 * since we used inner_run_cost once already.
2840 			 */
2841 			run_cost += outer_unmatched_rows *
2842 				inner_rescan_run_cost / inner_path_rows;
2843 
2844 			/*
2845 			 * We won't be evaluating any quals at all for unmatched rows, so
2846 			 * don't add them to ntuples.
2847 			 */
2848 		}
2849 		else
2850 		{
2851 			/*
2852 			 * Here, a complicating factor is that rescans may be cheaper than
2853 			 * first scans.  If we never scan all the way to the end of the
2854 			 * inner rel, it might be (depending on the plan type) that we'd
2855 			 * never pay the whole inner first-scan run cost.  However it is
2856 			 * difficult to estimate whether that will happen (and it could
2857 			 * not happen if there are any unmatched outer rows!), so be
2858 			 * conservative and always charge the whole first-scan cost once.
2859 			 * We consider this charge to correspond to the first unmatched
2860 			 * outer row, unless there isn't one in our estimate, in which
2861 			 * case blame it on the first matched row.
2862 			 */
2863 
2864 			/* First, count all unmatched join tuples as being processed */
2865 			ntuples += outer_unmatched_rows * inner_path_rows;
2866 
2867 			/* Now add the forced full scan, and decrement appropriate count */
2868 			run_cost += inner_run_cost;
2869 			if (outer_unmatched_rows >= 1)
2870 				outer_unmatched_rows -= 1;
2871 			else
2872 				outer_matched_rows -= 1;
2873 
2874 			/* Add inner run cost for additional outer tuples having matches */
2875 			if (outer_matched_rows > 0)
2876 				run_cost += outer_matched_rows * inner_rescan_run_cost * inner_scan_frac;
2877 
2878 			/* Add inner run cost for additional unmatched outer tuples */
2879 			if (outer_unmatched_rows > 0)
2880 				run_cost += outer_unmatched_rows * inner_rescan_run_cost;
2881 		}
2882 	}
2883 	else
2884 	{
2885 		/* Normal-case source costs were included in preliminary estimate */
2886 
2887 		/* Compute number of tuples processed (not number emitted!) */
2888 		ntuples = outer_path_rows * inner_path_rows;
2889 	}
2890 
2891 	/* CPU costs */
2892 	cost_qual_eval(&restrict_qual_cost, path->joinrestrictinfo, root);
2893 	startup_cost += restrict_qual_cost.startup;
2894 	cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple;
2895 	run_cost += cpu_per_tuple * ntuples;
2896 
2897 	/* tlist eval costs are paid per output row, not per tuple scanned */
2898 	startup_cost += path->path.pathtarget->cost.startup;
2899 	run_cost += path->path.pathtarget->cost.per_tuple * path->path.rows;
2900 
2901 	path->path.startup_cost = startup_cost;
2902 	path->path.total_cost = startup_cost + run_cost;
2903 }
2904 
2905 /*
2906  * initial_cost_mergejoin
2907  *	  Preliminary estimate of the cost of a mergejoin path.
2908  *
2909  * This must quickly produce lower-bound estimates of the path's startup and
2910  * total costs.  If we are unable to eliminate the proposed path from
2911  * consideration using the lower bounds, final_cost_mergejoin will be called
2912  * to obtain the final estimates.
2913  *
2914  * The exact division of labor between this function and final_cost_mergejoin
2915  * is private to them, and represents a tradeoff between speed of the initial
2916  * estimate and getting a tight lower bound.  We choose to not examine the
2917  * join quals here, except for obtaining the scan selectivity estimate which
2918  * is really essential (but fortunately, use of caching keeps the cost of
2919  * getting that down to something reasonable).
2920  * We also assume that cost_sort is cheap enough to use here.
2921  *
2922  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
2923  *		other data to be used by final_cost_mergejoin
2924  * 'jointype' is the type of join to be performed
2925  * 'mergeclauses' is the list of joinclauses to be used as merge clauses
2926  * 'outer_path' is the outer input to the join
2927  * 'inner_path' is the inner input to the join
2928  * 'outersortkeys' is the list of sort keys for the outer path
2929  * 'innersortkeys' is the list of sort keys for the inner path
2930  * 'extra' contains miscellaneous information about the join
2931  *
2932  * Note: outersortkeys and innersortkeys should be NIL if no explicit
2933  * sort is needed because the respective source path is already ordered.
2934  */
2935 void
2936 initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
2937 					   JoinType jointype,
2938 					   List *mergeclauses,
2939 					   Path *outer_path, Path *inner_path,
2940 					   List *outersortkeys, List *innersortkeys,
2941 					   JoinPathExtraData *extra)
2942 {
2943 	Cost		startup_cost = 0;
2944 	Cost		run_cost = 0;
2945 	double		outer_path_rows = outer_path->rows;
2946 	double		inner_path_rows = inner_path->rows;
2947 	Cost		inner_run_cost;
2948 	double		outer_rows,
2949 				inner_rows,
2950 				outer_skip_rows,
2951 				inner_skip_rows;
2952 	Selectivity outerstartsel,
2953 				outerendsel,
2954 				innerstartsel,
2955 				innerendsel;
2956 	Path		sort_path;		/* dummy for result of cost_sort */
2957 
2958 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
2959 	if (outer_path_rows <= 0 || isnan(outer_path_rows))
2960 		outer_path_rows = 1;
2961 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
2962 		inner_path_rows = 1;
2963 
2964 	/*
2965 	 * A merge join will stop as soon as it exhausts either input stream
2966 	 * (unless it's an outer join, in which case the outer side has to be
2967 	 * scanned all the way anyway).  Estimate fraction of the left and right
2968 	 * inputs that will actually need to be scanned.  Likewise, we can
2969 	 * estimate the number of rows that will be skipped before the first join
2970 	 * pair is found, which should be factored into startup cost. We use only
2971 	 * the first (most significant) merge clause for this purpose. Since
2972 	 * mergejoinscansel() is a fairly expensive computation, we cache the
2973 	 * results in the merge clause RestrictInfo.
2974 	 */
2975 	if (mergeclauses && jointype != JOIN_FULL)
2976 	{
2977 		RestrictInfo *firstclause = (RestrictInfo *) linitial(mergeclauses);
2978 		List	   *opathkeys;
2979 		List	   *ipathkeys;
2980 		PathKey    *opathkey;
2981 		PathKey    *ipathkey;
2982 		MergeScanSelCache *cache;
2983 
2984 		/* Get the input pathkeys to determine the sort-order details */
2985 		opathkeys = outersortkeys ? outersortkeys : outer_path->pathkeys;
2986 		ipathkeys = innersortkeys ? innersortkeys : inner_path->pathkeys;
2987 		Assert(opathkeys);
2988 		Assert(ipathkeys);
2989 		opathkey = (PathKey *) linitial(opathkeys);
2990 		ipathkey = (PathKey *) linitial(ipathkeys);
2991 		/* debugging check */
2992 		if (opathkey->pk_opfamily != ipathkey->pk_opfamily ||
2993 			opathkey->pk_eclass->ec_collation != ipathkey->pk_eclass->ec_collation ||
2994 			opathkey->pk_strategy != ipathkey->pk_strategy ||
2995 			opathkey->pk_nulls_first != ipathkey->pk_nulls_first)
2996 			elog(ERROR, "left and right pathkeys do not match in mergejoin");
2997 
2998 		/* Get the selectivity with caching */
2999 		cache = cached_scansel(root, firstclause, opathkey);
3000 
3001 		if (bms_is_subset(firstclause->left_relids,
3002 						  outer_path->parent->relids))
3003 		{
3004 			/* left side of clause is outer */
3005 			outerstartsel = cache->leftstartsel;
3006 			outerendsel = cache->leftendsel;
3007 			innerstartsel = cache->rightstartsel;
3008 			innerendsel = cache->rightendsel;
3009 		}
3010 		else
3011 		{
3012 			/* left side of clause is inner */
3013 			outerstartsel = cache->rightstartsel;
3014 			outerendsel = cache->rightendsel;
3015 			innerstartsel = cache->leftstartsel;
3016 			innerendsel = cache->leftendsel;
3017 		}
3018 		if (jointype == JOIN_LEFT ||
3019 			jointype == JOIN_ANTI)
3020 		{
3021 			outerstartsel = 0.0;
3022 			outerendsel = 1.0;
3023 		}
3024 		else if (jointype == JOIN_RIGHT)
3025 		{
3026 			innerstartsel = 0.0;
3027 			innerendsel = 1.0;
3028 		}
3029 	}
3030 	else
3031 	{
3032 		/* cope with clauseless or full mergejoin */
3033 		outerstartsel = innerstartsel = 0.0;
3034 		outerendsel = innerendsel = 1.0;
3035 	}
3036 
3037 	/*
3038 	 * Convert selectivities to row counts.  We force outer_rows and
3039 	 * inner_rows to be at least 1, but the skip_rows estimates can be zero.
3040 	 */
3041 	outer_skip_rows = rint(outer_path_rows * outerstartsel);
3042 	inner_skip_rows = rint(inner_path_rows * innerstartsel);
3043 	outer_rows = clamp_row_est(outer_path_rows * outerendsel);
3044 	inner_rows = clamp_row_est(inner_path_rows * innerendsel);
3045 
3046 	/* skip rows can become NaN when the path's row estimate is infinite */
3047 	Assert(outer_skip_rows <= outer_rows || isnan(outer_skip_rows));
3048 	Assert(inner_skip_rows <= inner_rows || isnan(inner_skip_rows));
3049 
3050 	/*
3051 	 * Readjust scan selectivities to account for above rounding.  This is
3052 	 * normally an insignificant effect, but when there are only a few rows in
3053 	 * the inputs, failing to do this makes for a large percentage error.
3054 	 */
3055 	outerstartsel = outer_skip_rows / outer_path_rows;
3056 	innerstartsel = inner_skip_rows / inner_path_rows;
3057 	outerendsel = outer_rows / outer_path_rows;
3058 	innerendsel = inner_rows / inner_path_rows;
3059 
3060 	/* startsel can become NaN when the path's row estimate is infinite */
3061 	Assert(outerstartsel <= outerendsel || isnan(outerstartsel));
3062 	Assert(innerstartsel <= innerendsel || isnan(innerstartsel));
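	/*
	 * A worked example of why this matters (numbers illustrative only):
	 * with outer_path_rows = 3 and a cached outerendsel of 0.5, outer_rows
	 * = clamp_row_est(1.5) = 2, so the readjusted outerendsel is 2/3 rather
	 * than 0.5.  Charging the source-data costs below with the unrounded
	 * selectivity would understate the fraction of the outer input actually
	 * scanned; for large inputs the adjustment is negligible.
	 */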
3063 
3064 	/* cost of source data */
3065 
3066 	if (outersortkeys)			/* do we need to sort outer? */
3067 	{
3068 		cost_sort(&sort_path,
3069 				  root,
3070 				  outersortkeys,
3071 				  outer_path->total_cost,
3072 				  outer_path_rows,
3073 				  outer_path->pathtarget->width,
3074 				  0.0,
3075 				  work_mem,
3076 				  -1.0);
3077 		startup_cost += sort_path.startup_cost;
3078 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3079 			* outerstartsel;
3080 		run_cost += (sort_path.total_cost - sort_path.startup_cost)
3081 			* (outerendsel - outerstartsel);
3082 	}
3083 	else
3084 	{
3085 		startup_cost += outer_path->startup_cost;
3086 		startup_cost += (outer_path->total_cost - outer_path->startup_cost)
3087 			* outerstartsel;
3088 		run_cost += (outer_path->total_cost - outer_path->startup_cost)
3089 			* (outerendsel - outerstartsel);
3090 	}
3091 
3092 	if (innersortkeys)			/* do we need to sort inner? */
3093 	{
3094 		cost_sort(&sort_path,
3095 				  root,
3096 				  innersortkeys,
3097 				  inner_path->total_cost,
3098 				  inner_path_rows,
3099 				  inner_path->pathtarget->width,
3100 				  0.0,
3101 				  work_mem,
3102 				  -1.0);
3103 		startup_cost += sort_path.startup_cost;
3104 		startup_cost += (sort_path.total_cost - sort_path.startup_cost)
3105 			* innerstartsel;
3106 		inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
3107 			* (innerendsel - innerstartsel);
3108 	}
3109 	else
3110 	{
3111 		startup_cost += inner_path->startup_cost;
3112 		startup_cost += (inner_path->total_cost - inner_path->startup_cost)
3113 			* innerstartsel;
3114 		inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
3115 			* (innerendsel - innerstartsel);
3116 	}
3117 
3118 	/*
3119 	 * We can't yet determine whether rescanning occurs, or whether
3120 	 * materialization of the inner input should be done.  The minimum
3121 	 * possible inner input cost, regardless of rescan and materialization
3122 	 * considerations, is inner_run_cost.  We include that in
3123 	 * workspace->total_cost, but not yet in run_cost.
3124 	 */
3125 
3126 	/* CPU costs left for later */
3127 
3128 	/* Public result fields */
3129 	workspace->startup_cost = startup_cost;
3130 	workspace->total_cost = startup_cost + run_cost + inner_run_cost;
3131 	/* Save private data for final_cost_mergejoin */
3132 	workspace->run_cost = run_cost;
3133 	workspace->inner_run_cost = inner_run_cost;
3134 	workspace->outer_rows = outer_rows;
3135 	workspace->inner_rows = inner_rows;
3136 	workspace->outer_skip_rows = outer_skip_rows;
3137 	workspace->inner_skip_rows = inner_skip_rows;
3138 }
3139 
3140 /*
3141  * final_cost_mergejoin
3142  *	  Final estimate of the cost and result size of a mergejoin path.
3143  *
3144  * Unlike other costsize functions, this routine makes two actual decisions:
3145  * whether the executor will need to do mark/restore, and whether we should
3146  * materialize the inner path.  It would be logically cleaner to build
3147  * separate paths testing these alternatives, but that would require repeating
3148  * most of the cost calculations, which are not all that cheap.  Since the
3149  * choice will not affect output pathkeys or startup cost, only total cost,
3150  * there is no possibility of wanting to keep more than one path.  So it seems
3151  * best to make the decisions here and record them in the path's
3152  * skip_mark_restore and materialize_inner fields.
3153  *
3154  * Mark/restore overhead is usually required, but can be skipped if we know
3155  * that the executor need find only one match per outer tuple, and that the
3156  * mergeclauses are sufficient to identify a match.
3157  *
3158  * We materialize the inner path if we need mark/restore and either the inner
3159  * path can't support mark/restore, or it's cheaper to use an interposed
3160  * Material node to handle mark/restore.
3161  *
3162  * 'path' is already filled in except for the rows and cost fields and
3163  *		skip_mark_restore and materialize_inner
3164  * 'workspace' is the result from initial_cost_mergejoin
3165  * 'extra' contains miscellaneous information about the join
3166  */
3167 void
3168 final_cost_mergejoin(PlannerInfo *root, MergePath *path,
3169 					 JoinCostWorkspace *workspace,
3170 					 JoinPathExtraData *extra)
3171 {
3172 	Path	   *outer_path = path->jpath.outerjoinpath;
3173 	Path	   *inner_path = path->jpath.innerjoinpath;
3174 	double		inner_path_rows = inner_path->rows;
3175 	List	   *mergeclauses = path->path_mergeclauses;
3176 	List	   *innersortkeys = path->innersortkeys;
3177 	Cost		startup_cost = workspace->startup_cost;
3178 	Cost		run_cost = workspace->run_cost;
3179 	Cost		inner_run_cost = workspace->inner_run_cost;
3180 	double		outer_rows = workspace->outer_rows;
3181 	double		inner_rows = workspace->inner_rows;
3182 	double		outer_skip_rows = workspace->outer_skip_rows;
3183 	double		inner_skip_rows = workspace->inner_skip_rows;
3184 	Cost		cpu_per_tuple,
3185 				bare_inner_cost,
3186 				mat_inner_cost;
3187 	QualCost	merge_qual_cost;
3188 	QualCost	qp_qual_cost;
3189 	double		mergejointuples,
3190 				rescannedtuples;
3191 	double		rescanratio;
3192 
3193 	/* Protect some assumptions below that rowcounts aren't zero or NaN */
3194 	if (inner_path_rows <= 0 || isnan(inner_path_rows))
3195 		inner_path_rows = 1;
3196 
3197 	/* Mark the path with the correct row estimate */
3198 	if (path->jpath.path.param_info)
3199 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3200 	else
3201 		path->jpath.path.rows = path->jpath.path.parent->rows;
3202 
3203 	/* For partial paths, scale row estimate. */
3204 	if (path->jpath.path.parallel_workers > 0)
3205 	{
3206 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
3207 
3208 		path->jpath.path.rows =
3209 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
3210 	}
3211 
3212 	/*
3213 	 * We could include disable_cost in the preliminary estimate, but that
3214 	 * would amount to optimizing for the case where the join method is
3215 	 * disabled, which doesn't seem like the way to bet.
3216 	 */
3217 	if (!enable_mergejoin)
3218 		startup_cost += disable_cost;
3219 
3220 	/*
3221 	 * Compute cost of the mergequals and qpquals (other restriction clauses)
3222 	 * separately.
3223 	 */
3224 	cost_qual_eval(&merge_qual_cost, mergeclauses, root);
3225 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3226 	qp_qual_cost.startup -= merge_qual_cost.startup;
3227 	qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple;
3228 
3229 	/*
3230 	 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3231 	 * executor will stop scanning for matches after the first match.  When
3232 	 * all the joinclauses are merge clauses, this means we don't ever need to
3233 	 * back up the merge, and so we can skip mark/restore overhead.
3234 	 */
3235 	if ((path->jpath.jointype == JOIN_SEMI ||
3236 		 path->jpath.jointype == JOIN_ANTI ||
3237 		 extra->inner_unique) &&
3238 		(list_length(path->jpath.joinrestrictinfo) ==
3239 		 list_length(path->path_mergeclauses)))
3240 		path->skip_mark_restore = true;
3241 	else
3242 		path->skip_mark_restore = false;
3243 
3244 	/*
3245 	 * Get approx # tuples passing the mergequals.  We use approx_tuple_count
3246 	 * here because we need an estimate done with JOIN_INNER semantics.
3247 	 */
3248 	mergejointuples = approx_tuple_count(root, &path->jpath, mergeclauses);
3249 
3250 	/*
3251 	 * When there are equal merge keys in the outer relation, the mergejoin
3252 	 * must rescan any matching tuples in the inner relation. This means
3253 	 * re-fetching inner tuples; we have to estimate how often that happens.
3254 	 *
3255 	 * For regular inner and outer joins, the number of re-fetches can be
3256 	 * estimated approximately as size of merge join output minus size of
3257 	 * inner relation. Assume that the distinct key values are 1, 2, ..., and
3258 	 * denote the number of values of each key in the outer relation as m1,
3259 	 * m2, ...; in the inner relation, n1, n2, ...  Then we have
3260 	 *
3261 	 * size of join = m1 * n1 + m2 * n2 + ...
3262 	 *
3263 	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ... = m1 *
3264 	 * n1 + m2 * n2 + ... - (n1 + n2 + ...) = size of join - size of inner
3265 	 * relation
3266 	 *
3267 	 * This equation works correctly for outer tuples having no inner match
3268 	 * (nk = 0), but not for inner tuples having no outer match (mk = 0); we
3269 	 * are effectively subtracting those from the number of rescanned tuples,
3270 	 * when we should not.  Can we do better without expensive selectivity
3271 	 * computations?
3272 	 *
3273 	 * The whole issue is moot if we are working from a unique-ified outer
3274 	 * input, or if we know we don't need to mark/restore at all.
3275 	 */
3276 	if (IsA(outer_path, UniquePath) || path->skip_mark_restore)
3277 		rescannedtuples = 0;
3278 	else
3279 	{
3280 		rescannedtuples = mergejointuples - inner_path_rows;
3281 		/* Must clamp because of possible underestimate */
3282 		if (rescannedtuples < 0)
3283 			rescannedtuples = 0;
3284 	}
3285 
3286 	/*
3287 	 * We'll inflate various costs this much to account for rescanning.  Note
3288 	 * that this is to be multiplied by something involving inner_rows, or
3289 	 * another number related to the portion of the inner rel we'll scan.
3290 	 */
3291 	rescanratio = 1.0 + (rescannedtuples / inner_rows);
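	/*
	 * A small worked example of the estimate above (values illustrative
	 * only): if the outer merge keys are (1,1,2) and the inner keys are
	 * (1,1,2,3), then size of join = 2*2 + 1*1 + 0*1 = 5 and size of inner
	 * relation = 4, so rescannedtuples = 1.  The true number of re-fetches
	 * is (2-1)*2 + (1-1)*1 = 2; the shortfall comes from the inner row with
	 * key 3 having no outer match, which is exactly the caveat noted above.
	 * rescanratio then comes out as 1 + 1/inner_rows.
	 */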
3292 
3293 	/*
3294 	 * Decide whether we want to materialize the inner input to shield it from
3295 	 * mark/restore and performing re-fetches.  Our cost model for regular
3296 	 * re-fetches is that a re-fetch costs the same as an original fetch,
3297 	 * which is probably an overestimate; but on the other hand we ignore the
3298 	 * bookkeeping costs of mark/restore.  Not clear if it's worth developing
3299 	 * a more refined model.  So we just need to inflate the inner run cost by
3300 	 * rescanratio.
3301 	 */
3302 	bare_inner_cost = inner_run_cost * rescanratio;
3303 
3304 	/*
3305 	 * When we interpose a Material node the re-fetch cost is assumed to be
3306 	 * just cpu_operator_cost per tuple, independently of the underlying
3307 	 * plan's cost; and we charge an extra cpu_operator_cost per original
3308 	 * fetch as well.  Note that we're assuming the materialize node will
3309 	 * never spill to disk, since it only has to remember tuples back to the
3310 	 * last mark.  (If there are a huge number of duplicates, our other cost
3311 	 * factors will make the path so expensive that it probably won't get
3312 	 * chosen anyway.)	So we don't use cost_rescan here.
3313 	 *
3314 	 * Note: keep this estimate in sync with create_mergejoin_plan's labeling
3315 	 * of the generated Material node.
3316 	 */
3317 	mat_inner_cost = inner_run_cost +
3318 		cpu_operator_cost * inner_rows * rescanratio;
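	/*
	 * To get a feel for the tradeoff (numbers illustrative only, using the
	 * default cpu_operator_cost of 0.0025): with inner_run_cost = 100,
	 * inner_rows = 1000 and rescannedtuples = 500, rescanratio = 1.5, so
	 * bare_inner_cost = 150 while mat_inner_cost = 100 + 0.0025 * 1000 *
	 * 1.5 = 103.75.  Re-fetching from a Material node is charged far less
	 * than re-executing the inner plan, so in this case materialization is
	 * preferred by the logic below (unless it is unnecessary or disabled).
	 */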
3319 
3320 	/*
3321 	 * If we don't need mark/restore at all, we don't need materialization.
3322 	 */
3323 	if (path->skip_mark_restore)
3324 		path->materialize_inner = false;
3325 
3326 	/*
3327 	 * Prefer materializing if it looks cheaper, unless the user has asked to
3328 	 * suppress materialization.
3329 	 */
3330 	else if (enable_material && mat_inner_cost < bare_inner_cost)
3331 		path->materialize_inner = true;
3332 
3333 	/*
3334 	 * Even if materializing doesn't look cheaper, we *must* do it if the
3335 	 * inner path is to be used directly (without sorting) and it doesn't
3336 	 * support mark/restore.
3337 	 *
3338 	 * Since the inner side must be ordered, and only Sorts and IndexScans can
3339 	 * create order to begin with, and they both support mark/restore, you
3340 	 * might think there's no problem --- but you'd be wrong.  Nestloop and
3341 	 * merge joins can *preserve* the order of their inputs, so they can be
3342 	 * selected as the input of a mergejoin, and they don't support
3343 	 * mark/restore at present.
3344 	 *
3345 	 * We don't test the value of enable_material here, because
3346 	 * materialization is required for correctness in this case, and turning
3347 	 * it off does not entitle us to deliver an invalid plan.
3348 	 */
3349 	else if (innersortkeys == NIL &&
3350 			 !ExecSupportsMarkRestore(inner_path))
3351 		path->materialize_inner = true;
3352 
3353 	/*
3354 	 * Also, force materializing if the inner path is to be sorted and the
3355 	 * sort is expected to spill to disk.  This is because the final merge
3356 	 * pass can be done on-the-fly if it doesn't have to support mark/restore.
3357 	 * We don't try to adjust the cost estimates for this consideration,
3358 	 * though.
3359 	 *
3360 	 * Since materialization is a performance optimization in this case,
3361 	 * rather than necessary for correctness, we skip it if enable_material is
3362 	 * off.
3363 	 */
3364 	else if (enable_material && innersortkeys != NIL &&
3365 			 relation_byte_size(inner_path_rows,
3366 								inner_path->pathtarget->width) >
3367 			 (work_mem * 1024L))
3368 		path->materialize_inner = true;
3369 	else
3370 		path->materialize_inner = false;
3371 
3372 	/* Charge the right incremental cost for the chosen case */
3373 	if (path->materialize_inner)
3374 		run_cost += mat_inner_cost;
3375 	else
3376 		run_cost += bare_inner_cost;
3377 
3378 	/* CPU costs */
3379 
3380 	/*
3381 	 * The number of tuple comparisons needed is approximately number of outer
3382 	 * rows plus number of inner rows plus number of rescanned tuples (can we
3383 	 * refine this?).  At each one, we need to evaluate the mergejoin quals.
3384 	 */
3385 	startup_cost += merge_qual_cost.startup;
3386 	startup_cost += merge_qual_cost.per_tuple *
3387 		(outer_skip_rows + inner_skip_rows * rescanratio);
3388 	run_cost += merge_qual_cost.per_tuple *
3389 		((outer_rows - outer_skip_rows) +
3390 		 (inner_rows - inner_skip_rows) * rescanratio);
3391 
3392 	/*
3393 	 * For each tuple that gets through the mergejoin proper, we charge
3394 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3395 	 * clauses that are to be applied at the join.  (This is pessimistic since
3396 	 * not all of the quals may get evaluated at each tuple.)
3397 	 *
3398 	 * Note: we could adjust for SEMI/ANTI joins skipping some qual
3399 	 * evaluations here, but it's probably not worth the trouble.
3400 	 */
3401 	startup_cost += qp_qual_cost.startup;
3402 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3403 	run_cost += cpu_per_tuple * mergejointuples;
3404 
3405 	/* tlist eval costs are paid per output row, not per tuple scanned */
3406 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3407 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3408 
3409 	path->jpath.path.startup_cost = startup_cost;
3410 	path->jpath.path.total_cost = startup_cost + run_cost;
3411 }
3412 
3413 /*
3414  * run mergejoinscansel() with caching
3415  */
3416 static MergeScanSelCache *
3417 cached_scansel(PlannerInfo *root, RestrictInfo *rinfo, PathKey *pathkey)
3418 {
3419 	MergeScanSelCache *cache;
3420 	ListCell   *lc;
3421 	Selectivity leftstartsel,
3422 				leftendsel,
3423 				rightstartsel,
3424 				rightendsel;
3425 	MemoryContext oldcontext;
3426 
3427 	/* Do we have this result already? */
3428 	foreach(lc, rinfo->scansel_cache)
3429 	{
3430 		cache = (MergeScanSelCache *) lfirst(lc);
3431 		if (cache->opfamily == pathkey->pk_opfamily &&
3432 			cache->collation == pathkey->pk_eclass->ec_collation &&
3433 			cache->strategy == pathkey->pk_strategy &&
3434 			cache->nulls_first == pathkey->pk_nulls_first)
3435 			return cache;
3436 	}
3437 
3438 	/* Nope, do the computation */
3439 	mergejoinscansel(root,
3440 					 (Node *) rinfo->clause,
3441 					 pathkey->pk_opfamily,
3442 					 pathkey->pk_strategy,
3443 					 pathkey->pk_nulls_first,
3444 					 &leftstartsel,
3445 					 &leftendsel,
3446 					 &rightstartsel,
3447 					 &rightendsel);
3448 
3449 	/* Cache the result in suitably long-lived workspace */
3450 	oldcontext = MemoryContextSwitchTo(root->planner_cxt);
3451 
3452 	cache = (MergeScanSelCache *) palloc(sizeof(MergeScanSelCache));
3453 	cache->opfamily = pathkey->pk_opfamily;
3454 	cache->collation = pathkey->pk_eclass->ec_collation;
3455 	cache->strategy = pathkey->pk_strategy;
3456 	cache->nulls_first = pathkey->pk_nulls_first;
3457 	cache->leftstartsel = leftstartsel;
3458 	cache->leftendsel = leftendsel;
3459 	cache->rightstartsel = rightstartsel;
3460 	cache->rightendsel = rightendsel;
3461 
3462 	rinfo->scansel_cache = lappend(rinfo->scansel_cache, cache);
3463 
3464 	MemoryContextSwitchTo(oldcontext);
3465 
3466 	return cache;
3467 }
3468 
3469 /*
3470  * initial_cost_hashjoin
3471  *	  Preliminary estimate of the cost of a hashjoin path.
3472  *
3473  * This must quickly produce lower-bound estimates of the path's startup and
3474  * total costs.  If we are unable to eliminate the proposed path from
3475  * consideration using the lower bounds, final_cost_hashjoin will be called
3476  * to obtain the final estimates.
3477  *
3478  * The exact division of labor between this function and final_cost_hashjoin
3479  * is private to them, and represents a tradeoff between speed of the initial
3480  * estimate and getting a tight lower bound.  We choose to not examine the
3481  * join quals here (other than by counting the number of hash clauses),
3482  * so we can't do much with CPU costs.  We do assume that
3483  * ExecChooseHashTableSize is cheap enough to use here.
3484  *
3485  * 'workspace' is to be filled with startup_cost, total_cost, and perhaps
3486  *		other data to be used by final_cost_hashjoin
3487  * 'jointype' is the type of join to be performed
3488  * 'hashclauses' is the list of joinclauses to be used as hash clauses
3489  * 'outer_path' is the outer input to the join
3490  * 'inner_path' is the inner input to the join
3491  * 'extra' contains miscellaneous information about the join
3492  * 'parallel_hash' indicates that inner_path is partial and that a shared
3493  *		hash table will be built in parallel
3494  */
3495 void
3496 initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
3497 					  JoinType jointype,
3498 					  List *hashclauses,
3499 					  Path *outer_path, Path *inner_path,
3500 					  JoinPathExtraData *extra,
3501 					  bool parallel_hash)
3502 {
3503 	Cost		startup_cost = 0;
3504 	Cost		run_cost = 0;
3505 	double		outer_path_rows = outer_path->rows;
3506 	double		inner_path_rows = inner_path->rows;
3507 	double		inner_path_rows_total = inner_path_rows;
3508 	int			num_hashclauses = list_length(hashclauses);
3509 	int			numbuckets;
3510 	int			numbatches;
3511 	int			num_skew_mcvs;
3512 	size_t		space_allowed;	/* unused */
3513 
3514 	/* cost of source data */
3515 	startup_cost += outer_path->startup_cost;
3516 	run_cost += outer_path->total_cost - outer_path->startup_cost;
3517 	startup_cost += inner_path->total_cost;
3518 
3519 	/*
3520 	 * Cost of computing hash function: must do it once per input tuple. We
3521 	 * charge one cpu_operator_cost for each column's hash function.  Also,
3522 	 * tack on one cpu_tuple_cost per inner row, to model the costs of
3523 	 * inserting the row into the hashtable.
3524 	 *
3525 	 * XXX when a hashclause is more complex than a single operator, we really
3526 	 * should charge the extra eval costs of the left or right side, as
3527 	 * appropriate, here.  This seems more work than it's worth at the moment.
3528 	 */
3529 	startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost)
3530 		* inner_path_rows;
3531 	run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows;
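	/*
	 * For example (numbers illustrative only), with the default
	 * cpu_operator_cost (0.0025) and cpu_tuple_cost (0.01), two hash
	 * clauses, 10000 inner rows and 100000 outer rows, this charges
	 * (2 * 0.0025 + 0.01) * 10000 = 150 to startup for building the hash
	 * table and 2 * 0.0025 * 100000 = 500 to run cost for hashing the
	 * outer input.
	 */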
3532 
3533 	/*
3534 	 * If this is a parallel hash build, then the value we have for
3535 	 * inner_rows_total currently refers only to the rows returned by each
3536 	 * participant.  For shared hash table size estimation, we need the total
3537 	 * number, so we need to undo the division.
3538 	 */
3539 	if (parallel_hash)
3540 		inner_path_rows_total *= get_parallel_divisor(inner_path);
3541 
3542 	/*
3543 	 * Get hash table size that executor would use for inner relation.
3544 	 *
3545 	 * XXX for the moment, always assume that skew optimization will be
3546 	 * performed.  As long as SKEW_HASH_MEM_PERCENT is small, it's not worth
3547 	 * trying to determine that for sure.
3548 	 *
3549 	 * XXX at some point it might be interesting to try to account for skew
3550 	 * optimization in the cost estimate, but for now, we don't.
3551 	 */
3552 	ExecChooseHashTableSize(inner_path_rows_total,
3553 							inner_path->pathtarget->width,
3554 							true,	/* useskew */
3555 							parallel_hash,	/* try_combined_hash_mem */
3556 							outer_path->parallel_workers,
3557 							&space_allowed,
3558 							&numbuckets,
3559 							&numbatches,
3560 							&num_skew_mcvs);
3561 
3562 	/*
3563 	 * If inner relation is too big then we will need to "batch" the join,
3564 	 * which implies writing and reading most of the tuples to disk an extra
3565 	 * time.  Charge seq_page_cost per page, since the I/O should be nice and
3566 	 * sequential.  Writing the inner rel counts as startup cost, all the rest
3567 	 * as run cost.
3568 	 */
3569 	if (numbatches > 1)
3570 	{
3571 		double		outerpages = page_size(outer_path_rows,
3572 										   outer_path->pathtarget->width);
3573 		double		innerpages = page_size(inner_path_rows,
3574 										   inner_path->pathtarget->width);
3575 
3576 		startup_cost += seq_page_cost * innerpages;
3577 		run_cost += seq_page_cost * (innerpages + 2 * outerpages);
3578 	}
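	/*
	 * For example (numbers illustrative only), at the default seq_page_cost
	 * of 1.0 with innerpages = 100 and outerpages = 200, batching adds 100
	 * to startup cost (writing out the inner rel) and 100 + 2 * 200 = 500
	 * to run cost (re-reading the inner rel, plus writing and re-reading
	 * the outer rel).
	 */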
3579 
3580 	/* CPU costs left for later */
3581 
3582 	/* Public result fields */
3583 	workspace->startup_cost = startup_cost;
3584 	workspace->total_cost = startup_cost + run_cost;
3585 	/* Save private data for final_cost_hashjoin */
3586 	workspace->run_cost = run_cost;
3587 	workspace->numbuckets = numbuckets;
3588 	workspace->numbatches = numbatches;
3589 	workspace->inner_rows_total = inner_path_rows_total;
3590 }
3591 
3592 /*
3593  * final_cost_hashjoin
3594  *	  Final estimate of the cost and result size of a hashjoin path.
3595  *
3596  * Note: the numbatches estimate is also saved into 'path' for use later
3597  *
3598  * 'path' is already filled in except for the rows and cost fields and
3599  *		num_batches
3600  * 'workspace' is the result from initial_cost_hashjoin
3601  * 'extra' contains miscellaneous information about the join
3602  */
3603 void
3604 final_cost_hashjoin(PlannerInfo *root, HashPath *path,
3605 					JoinCostWorkspace *workspace,
3606 					JoinPathExtraData *extra)
3607 {
3608 	Path	   *outer_path = path->jpath.outerjoinpath;
3609 	Path	   *inner_path = path->jpath.innerjoinpath;
3610 	double		outer_path_rows = outer_path->rows;
3611 	double		inner_path_rows = inner_path->rows;
3612 	double		inner_path_rows_total = workspace->inner_rows_total;
3613 	List	   *hashclauses = path->path_hashclauses;
3614 	Cost		startup_cost = workspace->startup_cost;
3615 	Cost		run_cost = workspace->run_cost;
3616 	int			numbuckets = workspace->numbuckets;
3617 	int			numbatches = workspace->numbatches;
3618 	Cost		cpu_per_tuple;
3619 	QualCost	hash_qual_cost;
3620 	QualCost	qp_qual_cost;
3621 	double		hashjointuples;
3622 	double		virtualbuckets;
3623 	Selectivity innerbucketsize;
3624 	Selectivity innermcvfreq;
3625 	ListCell   *hcl;
3626 
3627 	/* Mark the path with the correct row estimate */
3628 	if (path->jpath.path.param_info)
3629 		path->jpath.path.rows = path->jpath.path.param_info->ppi_rows;
3630 	else
3631 		path->jpath.path.rows = path->jpath.path.parent->rows;
3632 
3633 	/* For partial paths, scale row estimate. */
3634 	if (path->jpath.path.parallel_workers > 0)
3635 	{
3636 		double		parallel_divisor = get_parallel_divisor(&path->jpath.path);
3637 
3638 		path->jpath.path.rows =
3639 			clamp_row_est(path->jpath.path.rows / parallel_divisor);
3640 	}
3641 
3642 	/*
3643 	 * We could include disable_cost in the preliminary estimate, but that
3644 	 * would amount to optimizing for the case where the join method is
3645 	 * disabled, which doesn't seem like the way to bet.
3646 	 */
3647 	if (!enable_hashjoin)
3648 		startup_cost += disable_cost;
3649 
3650 	/* mark the path with estimated # of batches */
3651 	path->num_batches = numbatches;
3652 
3653 	/* store the total number of tuples (sum of partial row estimates) */
3654 	path->inner_rows_total = inner_path_rows_total;
3655 
3656 	/* and compute the number of "virtual" buckets in the whole join */
3657 	virtualbuckets = (double) numbuckets * (double) numbatches;
3658 
3659 	/*
3660 	 * Determine bucketsize fraction and MCV frequency for the inner relation.
3661 	 * We use the smallest bucketsize or MCV frequency estimated for any
3662 	 * individual hashclause; this is undoubtedly conservative.
3663 	 *
3664 	 * BUT: if inner relation has been unique-ified, we can assume it's good
3665 	 * for hashing.  This is important both because it's the right answer, and
3666 	 * because we avoid contaminating the cache with a value that's wrong for
3667 	 * non-unique-ified paths.
3668 	 */
3669 	if (IsA(inner_path, UniquePath))
3670 	{
3671 		innerbucketsize = 1.0 / virtualbuckets;
3672 		innermcvfreq = 0.0;
3673 	}
3674 	else
3675 	{
3676 		innerbucketsize = 1.0;
3677 		innermcvfreq = 1.0;
3678 		foreach(hcl, hashclauses)
3679 		{
3680 			RestrictInfo *restrictinfo = lfirst_node(RestrictInfo, hcl);
3681 			Selectivity thisbucketsize;
3682 			Selectivity thismcvfreq;
3683 
3684 			/*
3685 			 * First we have to figure out which side of the hashjoin clause
3686 			 * is the inner side.
3687 			 *
3688 			 * Since we tend to visit the same clauses over and over when
3689 			 * planning a large query, we cache the bucket stats estimates in
3690 			 * the RestrictInfo node to avoid repeated lookups of statistics.
3691 			 */
3692 			if (bms_is_subset(restrictinfo->right_relids,
3693 							  inner_path->parent->relids))
3694 			{
3695 				/* righthand side is inner */
3696 				thisbucketsize = restrictinfo->right_bucketsize;
3697 				if (thisbucketsize < 0)
3698 				{
3699 					/* not cached yet */
3700 					estimate_hash_bucket_stats(root,
3701 											   get_rightop(restrictinfo->clause),
3702 											   virtualbuckets,
3703 											   &restrictinfo->right_mcvfreq,
3704 											   &restrictinfo->right_bucketsize);
3705 					thisbucketsize = restrictinfo->right_bucketsize;
3706 				}
3707 				thismcvfreq = restrictinfo->right_mcvfreq;
3708 			}
3709 			else
3710 			{
3711 				Assert(bms_is_subset(restrictinfo->left_relids,
3712 									 inner_path->parent->relids));
3713 				/* lefthand side is inner */
3714 				thisbucketsize = restrictinfo->left_bucketsize;
3715 				if (thisbucketsize < 0)
3716 				{
3717 					/* not cached yet */
3718 					estimate_hash_bucket_stats(root,
3719 											   get_leftop(restrictinfo->clause),
3720 											   virtualbuckets,
3721 											   &restrictinfo->left_mcvfreq,
3722 											   &restrictinfo->left_bucketsize);
3723 					thisbucketsize = restrictinfo->left_bucketsize;
3724 				}
3725 				thismcvfreq = restrictinfo->left_mcvfreq;
3726 			}
3727 
3728 			if (innerbucketsize > thisbucketsize)
3729 				innerbucketsize = thisbucketsize;
3730 			if (innermcvfreq > thismcvfreq)
3731 				innermcvfreq = thismcvfreq;
3732 		}
3733 	}
3734 
3735 	/*
3736 	 * If the bucket holding the inner MCV would exceed hash_mem, we don't
3737 	 * want to hash unless there is really no other alternative, so apply
3738 	 * disable_cost.  (The executor normally copes with excessive memory usage
3739 	 * by splitting batches, but obviously it cannot separate equal values
3740 	 * that way, so it will be unable to drive the batch size below hash_mem
3741 	 * when this is true.)
3742 	 */
3743 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
3744 						   inner_path->pathtarget->width) > get_hash_memory_limit())
3745 		startup_cost += disable_cost;
3746 
3747 	/*
3748 	 * Compute cost of the hashquals and qpquals (other restriction clauses)
3749 	 * separately.
3750 	 */
3751 	cost_qual_eval(&hash_qual_cost, hashclauses, root);
3752 	cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root);
3753 	qp_qual_cost.startup -= hash_qual_cost.startup;
3754 	qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple;
3755 
3756 	/* CPU costs */
3757 
3758 	if (path->jpath.jointype == JOIN_SEMI ||
3759 		path->jpath.jointype == JOIN_ANTI ||
3760 		extra->inner_unique)
3761 	{
3762 		double		outer_matched_rows;
3763 		Selectivity inner_scan_frac;
3764 
3765 		/*
3766 		 * With a SEMI or ANTI join, or if the innerrel is known unique, the
3767 		 * executor will stop after the first match.
3768 		 *
3769 		 * For an outer-rel row that has at least one match, we can expect the
3770 		 * bucket scan to stop after a fraction 1/(match_count+1) of the
3771 		 * bucket's rows, if the matches are evenly distributed.  Since they
3772 		 * probably aren't quite evenly distributed, we apply a fuzz factor of
3773 		 * 2.0 to that fraction.  (If we used a larger fuzz factor, we'd have
3774 		 * to clamp inner_scan_frac to at most 1.0; but since match_count is
3775 		 * at least 1, no such clamp is needed now.)
3776 		 */
3777 		outer_matched_rows = rint(outer_path_rows * extra->semifactors.outer_match_frac);
3778 		inner_scan_frac = 2.0 / (extra->semifactors.match_count + 1.0);
3779 
3780 		startup_cost += hash_qual_cost.startup;
3781 		run_cost += hash_qual_cost.per_tuple * outer_matched_rows *
3782 			clamp_row_est(inner_path_rows * innerbucketsize * inner_scan_frac) * 0.5;
3783 
3784 		/*
3785 		 * For unmatched outer-rel rows, the picture is quite a lot different.
3786 		 * In the first place, there is no reason to assume that these rows
3787 		 * preferentially hit heavily-populated buckets; instead assume they
3788 		 * are uncorrelated with the inner distribution and so they see an
3789 		 * average bucket size of inner_path_rows / virtualbuckets.  In the
3790 		 * second place, it seems likely that they will have few if any exact
3791 		 * hash-code matches and so very few of the tuples in the bucket will
3792 		 * actually require eval of the hash quals.  We don't have any good
3793 		 * way to estimate how many will, but for the moment assume that the
3794 		 * effective cost per bucket entry is one-tenth what it is for
3795 		 * matchable tuples.
3796 		 */
3797 		run_cost += hash_qual_cost.per_tuple *
3798 			(outer_path_rows - outer_matched_rows) *
3799 			clamp_row_est(inner_path_rows / virtualbuckets) * 0.05;
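		/*
		 * To make these factors concrete (values illustrative only): with
		 * outer_path_rows = 1000, outer_match_frac = 0.3 and match_count =
		 * 4, outer_matched_rows = 300 and inner_scan_frac = 2/5 = 0.4.  If
		 * a matched bucket is expected to hold 10 inner rows, each matched
		 * outer row is charged for 10 * 0.4 * 0.5 = 2 qual evaluations,
		 * while each of the 700 unmatched outer rows is charged for an
		 * average-sized bucket at the reduced 0.05 rate.
		 */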
3800 
3801 		/* Get # of tuples that will pass the basic join */
3802 		if (path->jpath.jointype == JOIN_ANTI)
3803 			hashjointuples = outer_path_rows - outer_matched_rows;
3804 		else
3805 			hashjointuples = outer_matched_rows;
3806 	}
3807 	else
3808 	{
3809 		/*
3810 		 * The number of tuple comparisons needed is the number of outer
3811 		 * tuples times the typical number of tuples in a hash bucket, which
3812 		 * is the inner relation size times its bucketsize fraction.  At each
3813 		 * one, we need to evaluate the hashjoin quals.  But actually,
3814 		 * charging the full qual eval cost at each tuple is pessimistic,
3815 		 * since we don't evaluate the quals unless the hash values match
3816 		 * exactly.  For lack of a better idea, halve the cost estimate to
3817 		 * allow for that.
3818 		 */
3819 		startup_cost += hash_qual_cost.startup;
3820 		run_cost += hash_qual_cost.per_tuple * outer_path_rows *
3821 			clamp_row_est(inner_path_rows * innerbucketsize) * 0.5;
3822 
3823 		/*
3824 		 * Get approx # tuples passing the hashquals.  We use
3825 		 * approx_tuple_count here because we need an estimate done with
3826 		 * JOIN_INNER semantics.
3827 		 */
3828 		hashjointuples = approx_tuple_count(root, &path->jpath, hashclauses);
3829 	}
3830 
3831 	/*
3832 	 * For each tuple that gets through the hashjoin proper, we charge
3833 	 * cpu_tuple_cost plus the cost of evaluating additional restriction
3834 	 * clauses that are to be applied at the join.  (This is pessimistic since
3835 	 * not all of the quals may get evaluated at each tuple.)
3836 	 */
3837 	startup_cost += qp_qual_cost.startup;
3838 	cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple;
3839 	run_cost += cpu_per_tuple * hashjointuples;
3840 
3841 	/* tlist eval costs are paid per output row, not per tuple scanned */
3842 	startup_cost += path->jpath.path.pathtarget->cost.startup;
3843 	run_cost += path->jpath.path.pathtarget->cost.per_tuple * path->jpath.path.rows;
3844 
3845 	path->jpath.path.startup_cost = startup_cost;
3846 	path->jpath.path.total_cost = startup_cost + run_cost;
3847 }
3848 
3849 
3850 /*
3851  * cost_subplan
3852  *		Figure the costs for a SubPlan (or initplan).
3853  *
3854  * Note: we could dig the subplan's Plan out of the root list, but in practice
3855  * all callers have it handy already, so we make them pass it.
3856  */
3857 void
3858 cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
3859 {
3860 	QualCost	sp_cost;
3861 
3862 	/* Figure any cost for evaluating the testexpr */
3863 	cost_qual_eval(&sp_cost,
3864 				   make_ands_implicit((Expr *) subplan->testexpr),
3865 				   root);
3866 
3867 	if (subplan->useHashTable)
3868 	{
3869 		/*
3870 		 * If we are using a hash table for the subquery outputs, then the
3871 		 * cost of evaluating the query is a one-time cost.  We charge one
3872 		 * cpu_operator_cost per tuple for the work of loading the hashtable,
3873 		 * too.
3874 		 */
3875 		sp_cost.startup += plan->total_cost +
3876 			cpu_operator_cost * plan->plan_rows;
3877 
3878 		/*
3879 		 * The per-tuple costs include the cost of evaluating the lefthand
3880 		 * expressions, plus the cost of probing the hashtable.  We already
3881 		 * accounted for the lefthand expressions as part of the testexpr, and
3882 		 * will also have counted one cpu_operator_cost for each comparison
3883 		 * operator.  That is probably too low for the probing cost, but it's
3884 		 * hard to make a better estimate, so live with it for now.
3885 		 */
3886 	}
3887 	else
3888 	{
3889 		/*
3890 		 * Otherwise we will be rescanning the subplan output on each
3891 		 * evaluation.  We need to estimate how much of the output we will
3892 		 * actually need to scan.  NOTE: this logic should agree with the
3893 		 * tuple_fraction estimates used by make_subplan() in
3894 		 * plan/subselect.c.
3895 		 */
3896 		Cost		plan_run_cost = plan->total_cost - plan->startup_cost;
3897 
3898 		if (subplan->subLinkType == EXISTS_SUBLINK)
3899 		{
3900 			/* we only need to fetch 1 tuple; clamp to avoid zero divide */
3901 			sp_cost.per_tuple += plan_run_cost / clamp_row_est(plan->plan_rows);
3902 		}
3903 		else if (subplan->subLinkType == ALL_SUBLINK ||
3904 				 subplan->subLinkType == ANY_SUBLINK)
3905 		{
3906 			/* assume we need 50% of the tuples */
3907 			sp_cost.per_tuple += 0.50 * plan_run_cost;
3908 			/* also charge a cpu_operator_cost per row examined */
3909 			sp_cost.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost;
3910 		}
3911 		else
3912 		{
3913 			/* assume we need all tuples */
3914 			sp_cost.per_tuple += plan_run_cost;
3915 		}
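		/*
		 * For instance (numbers illustrative only), with plan_run_cost =
		 * 100 and plan_rows = 50: an EXISTS sublink adds 100/50 = 2 per
		 * call, an ANY/ALL sublink adds 50 plus 0.5 * 50 *
		 * cpu_operator_cost, and any other sublink type adds the full 100.
		 */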
3916 
3917 		/*
3918 		 * Also account for subplan's startup cost. If the subplan is
3919 		 * uncorrelated or undirect correlated, AND its topmost node is one
3920 		 * that materializes its output, assume that we'll only need to pay
3921 		 * its startup cost once; otherwise assume we pay the startup cost
3922 		 * every time.
3923 		 */
3924 		if (subplan->parParam == NIL &&
3925 			ExecMaterializesOutput(nodeTag(plan)))
3926 			sp_cost.startup += plan->startup_cost;
3927 		else
3928 			sp_cost.per_tuple += plan->startup_cost;
3929 	}
3930 
3931 	subplan->startup_cost = sp_cost.startup;
3932 	subplan->per_call_cost = sp_cost.per_tuple;
3933 }
3934 
3935 
3936 /*
3937  * cost_rescan
3938  *		Given a finished Path, estimate the costs of rescanning it after
3939  *		having done so the first time.  For some Path types a rescan is
3940  *		cheaper than an original scan (if no parameters change), and this
3941  *		function embodies knowledge about that.  The default is to return
3942  *		the same costs stored in the Path.  (Note that the cost estimates
3943  *		actually stored in Paths are always for first scans.)
3944  *
3945  * This function is not currently intended to model effects such as rescans
3946  * being cheaper due to disk block caching; what we are concerned with is
3947  * plan types wherein the executor caches results explicitly, or doesn't
3948  * redo startup calculations, etc.
3949  */
3950 static void
3951 cost_rescan(PlannerInfo *root, Path *path,
3952 			Cost *rescan_startup_cost,	/* output parameters */
3953 			Cost *rescan_total_cost)
3954 {
3955 	switch (path->pathtype)
3956 	{
3957 		case T_FunctionScan:
3958 
3959 			/*
3960 			 * Currently, nodeFunctionscan.c always executes the function to
3961 			 * completion before returning any rows, and caches the results in
3962 			 * a tuplestore.  So the function eval cost is all startup cost
3963 			 * and isn't paid over again on rescans. However, all run costs
3964 			 * will be paid over again.
3965 			 */
3966 			*rescan_startup_cost = 0;
3967 			*rescan_total_cost = path->total_cost - path->startup_cost;
3968 			break;
3969 		case T_HashJoin:
3970 
3971 			/*
3972 			 * If it's a single-batch join, we don't need to rebuild the hash
3973 			 * table during a rescan.
3974 			 */
3975 			if (((HashPath *) path)->num_batches == 1)
3976 			{
3977 				/* Startup cost is exactly the cost of hash table building */
3978 				*rescan_startup_cost = 0;
3979 				*rescan_total_cost = path->total_cost - path->startup_cost;
3980 			}
3981 			else
3982 			{
3983 				/* Otherwise, no special treatment */
3984 				*rescan_startup_cost = path->startup_cost;
3985 				*rescan_total_cost = path->total_cost;
3986 			}
3987 			break;
3988 		case T_CteScan:
3989 		case T_WorkTableScan:
3990 			{
3991 				/*
3992 				 * These plan types materialize their final result in a
3993 				 * tuplestore or tuplesort object.  So the rescan cost is only
3994 				 * cpu_tuple_cost per tuple, unless the result is large enough
3995 				 * to spill to disk.
3996 				 */
3997 				Cost		run_cost = cpu_tuple_cost * path->rows;
3998 				double		nbytes = relation_byte_size(path->rows,
3999 														path->pathtarget->width);
4000 				long		work_mem_bytes = work_mem * 1024L;
4001 
4002 				if (nbytes > work_mem_bytes)
4003 				{
4004 					/* It will spill, so account for re-read cost */
4005 					double		npages = ceil(nbytes / BLCKSZ);
4006 
4007 					run_cost += seq_page_cost * npages;
4008 				}
4009 				*rescan_startup_cost = 0;
4010 				*rescan_total_cost = run_cost;
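				/*
				 * As a rough illustration (sizes made up): a 200 MB
				 * tuplestore rescanned under the default 4 MB work_mem
				 * spills, so with the default 8 kB BLCKSZ the rescan is
				 * charged an extra ceil(200 MB / 8 kB) = 25600 pages times
				 * seq_page_cost, on top of the cpu_tuple_cost-per-row
				 * charge.
				 */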
4011 			}
4012 			break;
4013 		case T_Material:
4014 		case T_Sort:
4015 			{
4016 				/*
4017 				 * These plan types not only materialize their results, but do
4018 				 * not implement qual filtering or projection.  So they are
4019 				 * even cheaper to rescan than the ones above.  We charge only
4020 				 * cpu_operator_cost per tuple.  (Note: keep that in sync with
4021 				 * the run_cost charge in cost_sort, and also see comments in
4022 				 * cost_material before you change it.)
4023 				 */
4024 				Cost		run_cost = cpu_operator_cost * path->rows;
4025 				double		nbytes = relation_byte_size(path->rows,
4026 														path->pathtarget->width);
4027 				long		work_mem_bytes = work_mem * 1024L;
4028 
4029 				if (nbytes > work_mem_bytes)
4030 				{
4031 					/* It will spill, so account for re-read cost */
4032 					double		npages = ceil(nbytes / BLCKSZ);
4033 
4034 					run_cost += seq_page_cost * npages;
4035 				}
4036 				*rescan_startup_cost = 0;
4037 				*rescan_total_cost = run_cost;
4038 			}
4039 			break;
4040 		default:
4041 			*rescan_startup_cost = path->startup_cost;
4042 			*rescan_total_cost = path->total_cost;
4043 			break;
4044 	}
4045 }
4046 
4047 
4048 /*
4049  * cost_qual_eval
4050  *		Estimate the CPU costs of evaluating a WHERE clause.
4051  *		The input can be either an implicitly-ANDed list of boolean
4052  *		expressions, or a list of RestrictInfo nodes.  (The latter is
4053  *		preferred since it allows caching of the results.)
4054  *		The result includes both a one-time (startup) component,
4055  *		and a per-evaluation component.
4056  */
4057 void
4058 cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root)
4059 {
4060 	cost_qual_eval_context context;
4061 	ListCell   *l;
4062 
4063 	context.root = root;
4064 	context.total.startup = 0;
4065 	context.total.per_tuple = 0;
4066 
4067 	/* We don't charge any cost for the implicit ANDing at top level ... */
4068 
4069 	foreach(l, quals)
4070 	{
4071 		Node	   *qual = (Node *) lfirst(l);
4072 
4073 		cost_qual_eval_walker(qual, &context);
4074 	}
4075 
4076 	*cost = context.total;
4077 }
4078 
4079 /*
4080  * cost_qual_eval_node
4081  *		As above, for a single RestrictInfo or expression.
4082  */
4083 void
4084 cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root)
4085 {
4086 	cost_qual_eval_context context;
4087 
4088 	context.root = root;
4089 	context.total.startup = 0;
4090 	context.total.per_tuple = 0;
4091 
4092 	cost_qual_eval_walker(qual, &context);
4093 
4094 	*cost = context.total;
4095 }
4096 
4097 static bool
4098 cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
4099 {
4100 	if (node == NULL)
4101 		return false;
4102 
4103 	/*
4104 	 * RestrictInfo nodes contain an eval_cost field reserved for this
4105 	 * routine's use, so that it's not necessary to evaluate the qual clause's
4106 	 * cost more than once.  If the clause's cost hasn't been computed yet,
4107 	 * the field's startup value will contain -1.
4108 	 */
4109 	if (IsA(node, RestrictInfo))
4110 	{
4111 		RestrictInfo *rinfo = (RestrictInfo *) node;
4112 
4113 		if (rinfo->eval_cost.startup < 0)
4114 		{
4115 			cost_qual_eval_context locContext;
4116 
4117 			locContext.root = context->root;
4118 			locContext.total.startup = 0;
4119 			locContext.total.per_tuple = 0;
4120 
4121 			/*
4122 			 * For an OR clause, recurse into the marked-up tree so that we
4123 			 * set the eval_cost for contained RestrictInfos too.
4124 			 */
4125 			if (rinfo->orclause)
4126 				cost_qual_eval_walker((Node *) rinfo->orclause, &locContext);
4127 			else
4128 				cost_qual_eval_walker((Node *) rinfo->clause, &locContext);
4129 
4130 			/*
4131 			 * If the RestrictInfo is marked pseudoconstant, it will be tested
4132 			 * only once, so treat its cost as all startup cost.
4133 			 */
4134 			if (rinfo->pseudoconstant)
4135 			{
4136 				/* count one execution during startup */
4137 				locContext.total.startup += locContext.total.per_tuple;
4138 				locContext.total.per_tuple = 0;
4139 			}
4140 			rinfo->eval_cost = locContext.total;
4141 		}
4142 		context->total.startup += rinfo->eval_cost.startup;
4143 		context->total.per_tuple += rinfo->eval_cost.per_tuple;
4144 		/* do NOT recurse into children */
4145 		return false;
4146 	}
4147 
4148 	/*
4149 	 * For each operator or function node in the given tree, we charge the
4150 	 * estimated execution cost given by pg_proc.procost (remember to multiply
4151 	 * this by cpu_operator_cost).
4152 	 *
4153 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
4154 	 * OR, NOT). Simplistic, but a lot better than no model at all.
4155 	 *
4156 	 * Should we try to account for the possibility of short-circuit
4157 	 * evaluation of AND/OR?  Probably *not*, because that would make the
4158 	 * results depend on the clause ordering, and we are not in any position
4159 	 * to expect that the current ordering of the clauses is the one that's
4160 	 * going to end up being used.  The above per-RestrictInfo caching would
4161 	 * not mix well with trying to re-order clauses anyway.
4162 	 *
4163 	 * Another issue that is entirely ignored here is that if a set-returning
4164 	 * function is below top level in the tree, the functions/operators above
4165 	 * it will need to be evaluated multiple times.  In practical use, such
4166 	 * cases arise so seldom as to not be worth the added complexity needed;
4167 	 * moreover, since our rowcount estimates for functions tend to be pretty
4168 	 * phony, the results would also be pretty phony.
4169 	 */
4170 	if (IsA(node, FuncExpr))
4171 	{
4172 		add_function_cost(context->root, ((FuncExpr *) node)->funcid, node,
4173 						  &context->total);
4174 	}
4175 	else if (IsA(node, OpExpr) ||
4176 			 IsA(node, DistinctExpr) ||
4177 			 IsA(node, NullIfExpr))
4178 	{
4179 		/* rely on struct equivalence to treat these all alike */
4180 		set_opfuncid((OpExpr *) node);
4181 		add_function_cost(context->root, ((OpExpr *) node)->opfuncid, node,
4182 						  &context->total);
4183 	}
4184 	else if (IsA(node, ScalarArrayOpExpr))
4185 	{
4186 		/*
4187 		 * Estimate that the operator will be applied to about half of the
4188 		 * array elements before the answer is determined.
4189 		 */
4190 		ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node;
4191 		Node	   *arraynode = (Node *) lsecond(saop->args);
4192 		QualCost	sacosts;
4193 
4194 		set_sa_opfuncid(saop);
4195 		sacosts.startup = sacosts.per_tuple = 0;
4196 		add_function_cost(context->root, saop->opfuncid, NULL,
4197 						  &sacosts);
4198 		context->total.startup += sacosts.startup;
4199 		context->total.per_tuple += sacosts.per_tuple *
4200 			estimate_array_length(arraynode) * 0.5;
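		/*
		 * For example, an "x = ANY(arr)" test against a 20-element array
		 * constant is charged roughly 10 applications of the comparison
		 * operator per evaluation, on the theory that on average half the
		 * array is examined before the result is known.
		 */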
4201 	}
4202 	else if (IsA(node, Aggref) ||
4203 			 IsA(node, WindowFunc))
4204 	{
4205 		/*
4206 		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
4207 		 * ie, zero execution cost in the current model, because they behave
4208 		 * essentially like Vars at execution.  We disregard the costs of
4209 		 * their input expressions for the same reason.  The actual execution
4210 		 * costs of the aggregate/window functions and their arguments have to
4211 		 * be factored into plan-node-specific costing of the Agg or WindowAgg
4212 		 * plan node.
4213 		 */
4214 		return false;			/* don't recurse into children */
4215 	}
4216 	else if (IsA(node, CoerceViaIO))
4217 	{
4218 		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
4219 		Oid			iofunc;
4220 		Oid			typioparam;
4221 		bool		typisvarlena;
4222 
4223 		/* check the result type's input function */
4224 		getTypeInputInfo(iocoerce->resulttype,
4225 						 &iofunc, &typioparam);
4226 		add_function_cost(context->root, iofunc, NULL,
4227 						  &context->total);
4228 		/* check the input type's output function */
4229 		getTypeOutputInfo(exprType((Node *) iocoerce->arg),
4230 						  &iofunc, &typisvarlena);
4231 		add_function_cost(context->root, iofunc, NULL,
4232 						  &context->total);
4233 	}
4234 	else if (IsA(node, ArrayCoerceExpr))
4235 	{
4236 		ArrayCoerceExpr *acoerce = (ArrayCoerceExpr *) node;
4237 		QualCost	perelemcost;
4238 
4239 		cost_qual_eval_node(&perelemcost, (Node *) acoerce->elemexpr,
4240 							context->root);
4241 		context->total.startup += perelemcost.startup;
4242 		if (perelemcost.per_tuple > 0)
4243 			context->total.per_tuple += perelemcost.per_tuple *
4244 				estimate_array_length((Node *) acoerce->arg);
4245 	}
4246 	else if (IsA(node, RowCompareExpr))
4247 	{
4248 		/* Conservatively assume we will check all the columns */
4249 		RowCompareExpr *rcexpr = (RowCompareExpr *) node;
4250 		ListCell   *lc;
4251 
4252 		foreach(lc, rcexpr->opnos)
4253 		{
4254 			Oid			opid = lfirst_oid(lc);
4255 
4256 			add_function_cost(context->root, get_opcode(opid), NULL,
4257 							  &context->total);
4258 		}
4259 	}
4260 	else if (IsA(node, MinMaxExpr) ||
4261 			 IsA(node, SQLValueFunction) ||
4262 			 IsA(node, XmlExpr) ||
4263 			 IsA(node, CoerceToDomain) ||
4264 			 IsA(node, NextValueExpr))
4265 	{
4266 		/* Treat all these as having cost 1 */
4267 		context->total.per_tuple += cpu_operator_cost;
4268 	}
4269 	else if (IsA(node, CurrentOfExpr))
4270 	{
4271 		/* Report high cost to prevent selection of anything but TID scan */
4272 		context->total.startup += disable_cost;
4273 	}
4274 	else if (IsA(node, SubLink))
4275 	{
4276 		/* This routine should not be applied to un-planned expressions */
4277 		elog(ERROR, "cannot handle unplanned sub-select");
4278 	}
4279 	else if (IsA(node, SubPlan))
4280 	{
4281 		/*
4282 		 * A subplan node in an expression typically indicates that the
4283 		 * subplan will be executed on each evaluation, so charge accordingly.
4284 		 * (Sub-selects that can be executed as InitPlans have already been
4285 		 * removed from the expression.)
4286 		 */
4287 		SubPlan    *subplan = (SubPlan *) node;
4288 
4289 		context->total.startup += subplan->startup_cost;
4290 		context->total.per_tuple += subplan->per_call_cost;
4291 
4292 		/*
4293 		 * We don't want to recurse into the testexpr, because it was already
4294 		 * counted in the SubPlan node's costs.  So we're done.
4295 		 */
4296 		return false;
4297 	}
4298 	else if (IsA(node, AlternativeSubPlan))
4299 	{
4300 		/*
4301 		 * Arbitrarily use the first alternative plan for costing.  (We should
4302 		 * certainly only include one alternative, and we don't yet have
4303 		 * enough information to know which one the executor is most likely to
4304 		 * use.)
4305 		 */
4306 		AlternativeSubPlan *asplan = (AlternativeSubPlan *) node;
4307 
4308 		return cost_qual_eval_walker((Node *) linitial(asplan->subplans),
4309 									 context);
4310 	}
4311 	else if (IsA(node, PlaceHolderVar))
4312 	{
4313 		/*
4314 		 * A PlaceHolderVar should be given cost zero when considering general
4315 		 * expression evaluation costs.  The expense of doing the contained
4316 		 * expression is charged as part of the tlist eval costs of the scan
4317 		 * or join where the PHV is first computed (see set_rel_width and
4318 		 * add_placeholders_to_joinrel).  If we charged it again here, we'd be
4319 		 * double-counting the cost for each level of plan that the PHV
4320 		 * bubbles up through.  Hence, return without recursing into the
4321 		 * phexpr.
4322 		 */
4323 		return false;
4324 	}
4325 
4326 	/* recurse into children */
4327 	return expression_tree_walker(node, cost_qual_eval_walker,
4328 								  (void *) context);
4329 }
4330 
4331 /*
4332  * get_restriction_qual_cost
4333  *	  Compute evaluation costs of a baserel's restriction quals, plus any
4334  *	  movable join quals that have been pushed down to the scan.
4335  *	  Results are returned into *qpqual_cost.
4336  *
4337  * This is a convenience subroutine that works for seqscans and other cases
4338  * where all the given quals will be evaluated the hard way.  It's not useful
4339  * for cost_index(), for example, where the index machinery takes care of
4340  * some of the quals.  We assume baserestrictcost was previously set by
4341  * set_baserel_size_estimates().
4342  */
4343 static void
4344 get_restriction_qual_cost(PlannerInfo *root, RelOptInfo *baserel,
4345 						  ParamPathInfo *param_info,
4346 						  QualCost *qpqual_cost)
4347 {
4348 	if (param_info)
4349 	{
4350 		/* Include costs of pushed-down clauses */
4351 		cost_qual_eval(qpqual_cost, param_info->ppi_clauses, root);
4352 
4353 		qpqual_cost->startup += baserel->baserestrictcost.startup;
4354 		qpqual_cost->per_tuple += baserel->baserestrictcost.per_tuple;
4355 	}
4356 	else
4357 		*qpqual_cost = baserel->baserestrictcost;
4358 }
4359 
4360 
4361 /*
4362  * compute_semi_anti_join_factors
4363  *	  Estimate how much of the inner input a SEMI, ANTI, or inner_unique join
4364  *	  can be expected to scan.
4365  *
4366  * In a hash or nestloop SEMI/ANTI join, the executor will stop scanning
4367  * inner rows as soon as it finds a match to the current outer row.
4368  * The same happens if we have detected the inner rel is unique.
4369  * We should therefore adjust some of the cost components for this effect.
4370  * This function computes some estimates needed for these adjustments.
4371  * These estimates will be the same regardless of the particular paths used
4372  * for the outer and inner relation, so we compute these once and then pass
4373  * them to all the join cost estimation functions.
4374  *
4375  * Input parameters:
4376  *	joinrel: join relation under consideration
4377  *	outerrel: outer relation under consideration
4378  *	innerrel: inner relation under consideration
4379  *	jointype: if not JOIN_SEMI or JOIN_ANTI, we assume it's inner_unique
4380  *	sjinfo: SpecialJoinInfo relevant to this join
4381  *	restrictlist: join quals
4382  * Output parameters:
4383  *	*semifactors is filled in (see pathnodes.h for field definitions)
4384  */
4385 void
4386 compute_semi_anti_join_factors(PlannerInfo *root,
4387 							   RelOptInfo *joinrel,
4388 							   RelOptInfo *outerrel,
4389 							   RelOptInfo *innerrel,
4390 							   JoinType jointype,
4391 							   SpecialJoinInfo *sjinfo,
4392 							   List *restrictlist,
4393 							   SemiAntiJoinFactors *semifactors)
4394 {
4395 	Selectivity jselec;
4396 	Selectivity nselec;
4397 	Selectivity avgmatch;
4398 	SpecialJoinInfo norm_sjinfo;
4399 	List	   *joinquals;
4400 	ListCell   *l;
4401 
4402 	/*
4403 	 * In an ANTI join, we must ignore clauses that are "pushed down", since
4404 	 * those won't affect the match logic.  In a SEMI join, we do not
4405 	 * distinguish joinquals from "pushed down" quals, so just use the whole
4406 	 * restrictinfo list.  For other outer join types, we should consider only
4407 	 * non-pushed-down quals, so that this devolves to an IS_OUTER_JOIN check.
4408 	 */
4409 	if (IS_OUTER_JOIN(jointype))
4410 	{
4411 		joinquals = NIL;
4412 		foreach(l, restrictlist)
4413 		{
4414 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4415 
4416 			if (!RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4417 				joinquals = lappend(joinquals, rinfo);
4418 		}
4419 	}
4420 	else
4421 		joinquals = restrictlist;
4422 
4423 	/*
4424 	 * Get the JOIN_SEMI or JOIN_ANTI selectivity of the join clauses.
4425 	 */
4426 	jselec = clauselist_selectivity(root,
4427 									joinquals,
4428 									0,
4429 									(jointype == JOIN_ANTI) ? JOIN_ANTI : JOIN_SEMI,
4430 									sjinfo);
4431 
4432 	/*
4433 	 * Also get the normal inner-join selectivity of the join clauses.
4434 	 */
4435 	norm_sjinfo.type = T_SpecialJoinInfo;
4436 	norm_sjinfo.min_lefthand = outerrel->relids;
4437 	norm_sjinfo.min_righthand = innerrel->relids;
4438 	norm_sjinfo.syn_lefthand = outerrel->relids;
4439 	norm_sjinfo.syn_righthand = innerrel->relids;
4440 	norm_sjinfo.jointype = JOIN_INNER;
4441 	/* we don't bother trying to make the remaining fields valid */
4442 	norm_sjinfo.lhs_strict = false;
4443 	norm_sjinfo.delay_upper_joins = false;
4444 	norm_sjinfo.semi_can_btree = false;
4445 	norm_sjinfo.semi_can_hash = false;
4446 	norm_sjinfo.semi_operators = NIL;
4447 	norm_sjinfo.semi_rhs_exprs = NIL;
4448 
4449 	nselec = clauselist_selectivity(root,
4450 									joinquals,
4451 									0,
4452 									JOIN_INNER,
4453 									&norm_sjinfo);
4454 
4455 	/* Avoid leaking a lot of ListCells */
4456 	if (IS_OUTER_JOIN(jointype))
4457 		list_free(joinquals);
4458 
4459 	/*
4460 	 * jselec can be interpreted as the fraction of outer-rel rows that have
4461 	 * any matches (this is true for both SEMI and ANTI cases).  And nselec is
4462 	 * the fraction of the Cartesian product that matches.  So, the average
4463 	 * number of matches for each outer-rel row that has at least one match is
4464 	 * nselec * inner_rows / jselec.
4465 	 *
4466 	 * Note: it is correct to use the inner rel's "rows" count here, even
4467 	 * though we might later be considering a parameterized inner path with
4468 	 * fewer rows.  This is because we have included all the join clauses in
4469 	 * the selectivity estimate.
4470 	 */
4471 	if (jselec > 0)				/* protect against zero divide */
4472 	{
4473 		avgmatch = nselec * innerrel->rows / jselec;
4474 		/* Clamp to sane range */
4475 		avgmatch = Max(1.0, avgmatch);
4476 	}
4477 	else
4478 		avgmatch = 1.0;
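	/*
	 * Illustrative example (numbers are hypothetical, not from the source):
	 * with jselec = 0.2, nselec = 0.01 and innerrel->rows = 100, we get
	 * avgmatch = 0.01 * 100 / 0.2 = 5, i.e. each outer row that has any
	 * match is expected to find about five matching inner rows.
	 */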
4479 
4480 	semifactors->outer_match_frac = jselec;
4481 	semifactors->match_count = avgmatch;
4482 }
4483 
4484 /*
4485  * has_indexed_join_quals
4486  *	  Check whether all the joinquals of a nestloop join are used as
4487  *	  inner index quals.
4488  *
4489  * If the inner path of a SEMI/ANTI join is an indexscan (including bitmap
4490  * indexscan) that uses all the joinquals as indexquals, we can assume that an
4491  * unmatched outer tuple is cheap to process, whereas otherwise it's probably
4492  * expensive.
4493  */
4494 static bool
4495 has_indexed_join_quals(NestPath *joinpath)
4496 {
4497 	Relids		joinrelids = joinpath->path.parent->relids;
4498 	Path	   *innerpath = joinpath->innerjoinpath;
4499 	List	   *indexclauses;
4500 	bool		found_one;
4501 	ListCell   *lc;
4502 
4503 	/* If join still has quals to evaluate, it's not fast */
4504 	if (joinpath->joinrestrictinfo != NIL)
4505 		return false;
4506 	/* Nor if the inner path isn't parameterized at all */
4507 	if (innerpath->param_info == NULL)
4508 		return false;
4509 
4510 	/* Find the indexclauses list for the inner scan */
4511 	switch (innerpath->pathtype)
4512 	{
4513 		case T_IndexScan:
4514 		case T_IndexOnlyScan:
4515 			indexclauses = ((IndexPath *) innerpath)->indexclauses;
4516 			break;
4517 		case T_BitmapHeapScan:
4518 			{
4519 				/* Accept only a simple bitmap scan, not AND/OR cases */
4520 				Path	   *bmqual = ((BitmapHeapPath *) innerpath)->bitmapqual;
4521 
4522 				if (IsA(bmqual, IndexPath))
4523 					indexclauses = ((IndexPath *) bmqual)->indexclauses;
4524 				else
4525 					return false;
4526 				break;
4527 			}
4528 		default:
4529 
4530 			/*
4531 			 * If it's not a simple indexscan, it probably doesn't run quickly
4532 			 * for zero rows out, even if it's a parameterized path using all
4533 			 * the joinquals.
4534 			 */
4535 			return false;
4536 	}
4537 
4538 	/*
4539 	 * Examine the inner path's param clauses.  Any that are from the outer
4540 	 * path must be found in the indexclauses list, either exactly or in an
4541 	 * equivalent form generated by equivclass.c.  Also, we must find at least
4542 	 * one such clause, else it's a clauseless join which isn't fast.
4543 	 */
4544 	found_one = false;
4545 	foreach(lc, innerpath->param_info->ppi_clauses)
4546 	{
4547 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
4548 
4549 		if (join_clause_is_movable_into(rinfo,
4550 										innerpath->parent->relids,
4551 										joinrelids))
4552 		{
4553 			if (!is_redundant_with_indexclauses(rinfo, indexclauses))
4554 				return false;
4555 			found_one = true;
4556 		}
4557 	}
4558 	return found_one;
4559 }
4560 
4561 
4562 /*
4563  * approx_tuple_count
4564  *		Quick-and-dirty estimation of the number of join rows passing
4565  *		a set of qual conditions.
4566  *
4567  * The quals can be either an implicitly-ANDed list of boolean expressions,
4568  * or a list of RestrictInfo nodes (typically the latter).
4569  *
4570  * We intentionally compute the selectivity under JOIN_INNER rules, even
4571  * if it's some type of outer join.  This is appropriate because we are
4572  * trying to figure out how many tuples pass the initial merge or hash
4573  * join step.
4574  *
4575  * This is quick-and-dirty because we bypass clauselist_selectivity, and
4576  * simply multiply the independent clause selectivities together.  Now
4577  * clauselist_selectivity often can't do any better than that anyhow, but
4578  * for some situations (such as range constraints) it is smarter.  However,
4579  * we can't effectively cache the results of clauselist_selectivity, whereas
4580  * the individual clause selectivities can be and are cached.
4581  *
4582  * Since we are only using the results to estimate how many potential
4583  * output tuples are generated and passed through qpqual checking, it
4584  * seems OK to live with the approximation.
4585  */
4586 static double
4587 approx_tuple_count(PlannerInfo *root, JoinPath *path, List *quals)
4588 {
4589 	double		tuples;
4590 	double		outer_tuples = path->outerjoinpath->rows;
4591 	double		inner_tuples = path->innerjoinpath->rows;
4592 	SpecialJoinInfo sjinfo;
4593 	Selectivity selec = 1.0;
4594 	ListCell   *l;
4595 
4596 	/*
4597 	 * Make up a SpecialJoinInfo for JOIN_INNER semantics.
4598 	 */
4599 	sjinfo.type = T_SpecialJoinInfo;
4600 	sjinfo.min_lefthand = path->outerjoinpath->parent->relids;
4601 	sjinfo.min_righthand = path->innerjoinpath->parent->relids;
4602 	sjinfo.syn_lefthand = path->outerjoinpath->parent->relids;
4603 	sjinfo.syn_righthand = path->innerjoinpath->parent->relids;
4604 	sjinfo.jointype = JOIN_INNER;
4605 	/* we don't bother trying to make the remaining fields valid */
4606 	sjinfo.lhs_strict = false;
4607 	sjinfo.delay_upper_joins = false;
4608 	sjinfo.semi_can_btree = false;
4609 	sjinfo.semi_can_hash = false;
4610 	sjinfo.semi_operators = NIL;
4611 	sjinfo.semi_rhs_exprs = NIL;
4612 
4613 	/* Get the approximate selectivity */
4614 	foreach(l, quals)
4615 	{
4616 		Node	   *qual = (Node *) lfirst(l);
4617 
4618 		/* Note that clause_selectivity will be able to cache its result */
4619 		selec *= clause_selectivity(root, qual, 0, JOIN_INNER, &sjinfo);
4620 	}
4621 
4622 	/* Apply it to the input relation sizes */
4623 	tuples = selec * outer_tuples * inner_tuples;
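	/*
	 * Hypothetical example: two cached clause selectivities of 0.1 and 0.05
	 * give selec = 0.005; with outer_tuples = 1000 and inner_tuples = 2000
	 * the estimate is 0.005 * 1000 * 2000 = 10000 rows (before clamping).
	 */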
4624 
4625 	return clamp_row_est(tuples);
4626 }
4627 
4628 
4629 /*
4630  * set_baserel_size_estimates
4631  *		Set the size estimates for the given base relation.
4632  *
4633  * The rel's targetlist and restrictinfo list must have been constructed
4634  * already, and rel->tuples must be set.
4635  *
4636  * We set the following fields of the rel node:
4637  *	rows: the estimated number of output tuples (after applying
4638  *		  restriction clauses).
4639  *	width: the estimated average output tuple width in bytes.
4640  *	baserestrictcost: estimated cost of evaluating baserestrictinfo clauses.
4641  */
4642 void
4643 set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel)
4644 {
4645 	double		nrows;
4646 
4647 	/* Should only be applied to base relations */
4648 	Assert(rel->relid > 0);
4649 
4650 	nrows = rel->tuples *
4651 		clauselist_selectivity(root,
4652 							   rel->baserestrictinfo,
4653 							   0,
4654 							   JOIN_INNER,
4655 							   NULL);
4656 
4657 	rel->rows = clamp_row_est(nrows);
4658 
4659 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
4660 
4661 	set_rel_width(root, rel);
4662 }
4663 
4664 /*
4665  * get_parameterized_baserel_size
4666  *		Make a size estimate for a parameterized scan of a base relation.
4667  *
4668  * 'param_clauses' lists the additional join clauses to be used.
4669  *
4670  * set_baserel_size_estimates must have been applied already.
4671  */
4672 double
4673 get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel,
4674 							   List *param_clauses)
4675 {
4676 	List	   *allclauses;
4677 	double		nrows;
4678 
4679 	/*
4680 	 * Estimate the number of rows returned by the parameterized scan, knowing
4681 	 * that it will apply all the extra join clauses as well as the rel's own
4682 	 * restriction clauses.  Note that we force the clauses to be treated as
4683 	 * non-join clauses during selectivity estimation.
4684 	 */
4685 	allclauses = list_concat_copy(param_clauses, rel->baserestrictinfo);
4686 	nrows = rel->tuples *
4687 		clauselist_selectivity(root,
4688 							   allclauses,
4689 							   rel->relid,	/* do not use 0! */
4690 							   JOIN_INNER,
4691 							   NULL);
4692 	nrows = clamp_row_est(nrows);
4693 	/* For safety, make sure result is not more than the base estimate */
4694 	if (nrows > rel->rows)
4695 		nrows = rel->rows;
4696 	return nrows;
4697 }
4698 
4699 /*
4700  * set_joinrel_size_estimates
4701  *		Set the size estimates for the given join relation.
4702  *
4703  * The rel's targetlist must have been constructed already, and a
4704  * restriction clause list that matches the given component rels must
4705  * be provided.
4706  *
4707  * Since there is more than one way to make a joinrel for more than two
4708  * base relations, the results we get here could depend on which component
4709  * rel pair is provided.  In theory we should get the same answers no matter
4710  * which pair is provided; in practice, since the selectivity estimation
4711  * routines don't handle all cases equally well, we might not.  But there's
4712  * not much to be done about it.  (Would it make sense to repeat the
4713  * calculations for each pair of input rels that's encountered, and somehow
4714  * average the results?  Probably way more trouble than it's worth, and
4715  * anyway we must keep the rowcount estimate the same for all paths for the
4716  * joinrel.)
4717  *
4718  * We set only the rows field here.  The reltarget field was already set by
4719  * build_joinrel_tlist, and baserestrictcost is not used for join rels.
4720  */
4721 void
4722 set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel,
4723 						   RelOptInfo *outer_rel,
4724 						   RelOptInfo *inner_rel,
4725 						   SpecialJoinInfo *sjinfo,
4726 						   List *restrictlist)
4727 {
4728 	rel->rows = calc_joinrel_size_estimate(root,
4729 										   rel,
4730 										   outer_rel,
4731 										   inner_rel,
4732 										   outer_rel->rows,
4733 										   inner_rel->rows,
4734 										   sjinfo,
4735 										   restrictlist);
4736 }
4737 
4738 /*
4739  * get_parameterized_joinrel_size
4740  *		Make a size estimate for a parameterized scan of a join relation.
4741  *
4742  * 'rel' is the joinrel under consideration.
4743  * 'outer_path', 'inner_path' are (probably also parameterized) Paths that
4744  *		produce the relations being joined.
4745  * 'sjinfo' is any SpecialJoinInfo relevant to this join.
4746  * 'restrict_clauses' lists the join clauses that need to be applied at the
4747  * join node (including any movable clauses that were moved down to this join,
4748  * and not including any movable clauses that were pushed down into the
4749  * child paths).
4750  *
4751  * set_joinrel_size_estimates must have been applied already.
4752  */
4753 double
4754 get_parameterized_joinrel_size(PlannerInfo *root, RelOptInfo *rel,
4755 							   Path *outer_path,
4756 							   Path *inner_path,
4757 							   SpecialJoinInfo *sjinfo,
4758 							   List *restrict_clauses)
4759 {
4760 	double		nrows;
4761 
4762 	/*
4763 	 * Estimate the number of rows returned by the parameterized join as the
4764 	 * sizes of the input paths times the selectivity of the clauses that have
4765 	 * ended up at this join node.
4766 	 *
4767 	 * As with set_joinrel_size_estimates, the rowcount estimate could depend
4768 	 * on the pair of input paths provided, though ideally we'd get the same
4769 	 * estimate for any pair with the same parameterization.
4770 	 */
4771 	nrows = calc_joinrel_size_estimate(root,
4772 									   rel,
4773 									   outer_path->parent,
4774 									   inner_path->parent,
4775 									   outer_path->rows,
4776 									   inner_path->rows,
4777 									   sjinfo,
4778 									   restrict_clauses);
4779 	/* For safety, make sure result is not more than the base estimate */
4780 	if (nrows > rel->rows)
4781 		nrows = rel->rows;
4782 	return nrows;
4783 }
4784 
4785 /*
4786  * calc_joinrel_size_estimate
4787  *		Workhorse for set_joinrel_size_estimates and
4788  *		get_parameterized_joinrel_size.
4789  *
4790  * outer_rel/inner_rel are the relations being joined, but they should be
4791  * assumed to have sizes outer_rows/inner_rows; those numbers might be less
4792  * than what rel->rows says, when we are considering parameterized paths.
4793  */
4794 static double
4795 calc_joinrel_size_estimate(PlannerInfo *root,
4796 						   RelOptInfo *joinrel,
4797 						   RelOptInfo *outer_rel,
4798 						   RelOptInfo *inner_rel,
4799 						   double outer_rows,
4800 						   double inner_rows,
4801 						   SpecialJoinInfo *sjinfo,
4802 						   List *restrictlist_in)
4803 {
4804 	/* This apparently-useless variable dodges a compiler bug in VS2013: */
4805 	List	   *restrictlist = restrictlist_in;
4806 	JoinType	jointype = sjinfo->jointype;
4807 	Selectivity fkselec;
4808 	Selectivity jselec;
4809 	Selectivity pselec;
4810 	double		nrows;
4811 
4812 	/*
4813 	 * Compute joinclause selectivity.  Note that we are only considering
4814 	 * clauses that become restriction clauses at this join level; we are not
4815 	 * double-counting them because they were not considered in estimating the
4816 	 * sizes of the component rels.
4817 	 *
4818 	 * First, see whether any of the joinclauses can be matched to known FK
4819 	 * constraints.  If so, drop those clauses from the restrictlist, and
4820 	 * instead estimate their selectivity using FK semantics.  (We do this
4821 	 * without regard to whether said clauses are local or "pushed down".
4822 	 * Probably, an FK-matching clause could never be seen as pushed down at
4823 	 * an outer join, since it would be strict and hence would be grounds for
4824 	 * join strength reduction.)  fkselec gets the net selectivity for
4825 	 * FK-matching clauses, or 1.0 if there are none.
4826 	 */
4827 	fkselec = get_foreign_key_join_selectivity(root,
4828 											   outer_rel->relids,
4829 											   inner_rel->relids,
4830 											   sjinfo,
4831 											   &restrictlist);
4832 
4833 	/*
4834 	 * For an outer join, we have to distinguish the selectivity of the join's
4835 	 * own clauses (JOIN/ON conditions) from any clauses that were "pushed
4836 	 * down".  For inner joins we just count them all as joinclauses.
4837 	 */
4838 	if (IS_OUTER_JOIN(jointype))
4839 	{
4840 		List	   *joinquals = NIL;
4841 		List	   *pushedquals = NIL;
4842 		ListCell   *l;
4843 
4844 		/* Grovel through the clauses to separate into two lists */
4845 		foreach(l, restrictlist)
4846 		{
4847 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
4848 
4849 			if (RINFO_IS_PUSHED_DOWN(rinfo, joinrel->relids))
4850 				pushedquals = lappend(pushedquals, rinfo);
4851 			else
4852 				joinquals = lappend(joinquals, rinfo);
4853 		}
4854 
4855 		/* Get the separate selectivities */
4856 		jselec = clauselist_selectivity(root,
4857 										joinquals,
4858 										0,
4859 										jointype,
4860 										sjinfo);
4861 		pselec = clauselist_selectivity(root,
4862 										pushedquals,
4863 										0,
4864 										jointype,
4865 										sjinfo);
4866 
4867 		/* Avoid leaking a lot of ListCells */
4868 		list_free(joinquals);
4869 		list_free(pushedquals);
4870 	}
4871 	else
4872 	{
4873 		jselec = clauselist_selectivity(root,
4874 										restrictlist,
4875 										0,
4876 										jointype,
4877 										sjinfo);
4878 		pselec = 0.0;			/* not used, keep compiler quiet */
4879 	}
4880 
4881 	/*
4882 	 * Basically, we multiply size of Cartesian product by selectivity.
4883 	 *
4884 	 * If we are doing an outer join, take that into account: the joinqual
4885 	 * selectivity has to be clamped using the knowledge that the output must
4886 	 * be at least as large as the non-nullable input.  However, any
4887 	 * pushed-down quals are applied after the outer join, so their
4888 	 * selectivity applies fully.
4889 	 *
4890 	 * For JOIN_SEMI and JOIN_ANTI, the selectivity is defined as the fraction
4891 	 * of LHS rows that have matches, and we apply that straightforwardly.
4892 	 */
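	/*
	 * Worked example (made-up numbers): for a LEFT join with
	 * outer_rows = 1000, inner_rows = 100, fkselec = 1.0 and jselec = 0.0005,
	 * the raw product 1000 * 100 * 0.0005 = 50 is clamped up to the 1000
	 * non-nullable outer rows, and a pushed-down qual with pselec = 0.1 then
	 * brings the final estimate down to 100.
	 */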
4893 	switch (jointype)
4894 	{
4895 		case JOIN_INNER:
4896 			nrows = outer_rows * inner_rows * fkselec * jselec;
4897 			/* pselec not used */
4898 			break;
4899 		case JOIN_LEFT:
4900 			nrows = outer_rows * inner_rows * fkselec * jselec;
4901 			if (nrows < outer_rows)
4902 				nrows = outer_rows;
4903 			nrows *= pselec;
4904 			break;
4905 		case JOIN_FULL:
4906 			nrows = outer_rows * inner_rows * fkselec * jselec;
4907 			if (nrows < outer_rows)
4908 				nrows = outer_rows;
4909 			if (nrows < inner_rows)
4910 				nrows = inner_rows;
4911 			nrows *= pselec;
4912 			break;
4913 		case JOIN_SEMI:
4914 			nrows = outer_rows * fkselec * jselec;
4915 			/* pselec not used */
4916 			break;
4917 		case JOIN_ANTI:
4918 			nrows = outer_rows * (1.0 - fkselec * jselec);
4919 			nrows *= pselec;
4920 			break;
4921 		default:
4922 			/* other values not expected here */
4923 			elog(ERROR, "unrecognized join type: %d", (int) jointype);
4924 			nrows = 0;			/* keep compiler quiet */
4925 			break;
4926 	}
4927 
4928 	return clamp_row_est(nrows);
4929 }
4930 
4931 /*
4932  * get_foreign_key_join_selectivity
4933  *		Estimate join selectivity for foreign-key-related clauses.
4934  *
4935  * Remove any clauses that can be matched to FK constraints from *restrictlist,
4936  * and return a substitute estimate of their selectivity.  1.0 is returned
4937  * when there are no such clauses.
4938  *
4939  * The reason for treating such clauses specially is that we can get better
4940  * estimates this way than by relying on clauselist_selectivity(), especially
4941  * for multi-column FKs where that function's assumption that the clauses are
4942  * independent falls down badly.  But even with single-column FKs, we may be
4943  * able to get a better answer when the pg_statistic stats are missing or out
4944  * of date.
4945  */
4946 static Selectivity
4947 get_foreign_key_join_selectivity(PlannerInfo *root,
4948 								 Relids outer_relids,
4949 								 Relids inner_relids,
4950 								 SpecialJoinInfo *sjinfo,
4951 								 List **restrictlist)
4952 {
4953 	Selectivity fkselec = 1.0;
4954 	JoinType	jointype = sjinfo->jointype;
4955 	List	   *worklist = *restrictlist;
4956 	ListCell   *lc;
4957 
4958 	/* Consider each FK constraint that is known to match the query */
4959 	foreach(lc, root->fkey_list)
4960 	{
4961 		ForeignKeyOptInfo *fkinfo = (ForeignKeyOptInfo *) lfirst(lc);
4962 		bool		ref_is_outer;
4963 		List	   *removedlist;
4964 		ListCell   *cell;
4965 
4966 		/*
4967 		 * This FK is not relevant unless it connects a baserel on one side of
4968 		 * this join to a baserel on the other side.
4969 		 */
4970 		if (bms_is_member(fkinfo->con_relid, outer_relids) &&
4971 			bms_is_member(fkinfo->ref_relid, inner_relids))
4972 			ref_is_outer = false;
4973 		else if (bms_is_member(fkinfo->ref_relid, outer_relids) &&
4974 				 bms_is_member(fkinfo->con_relid, inner_relids))
4975 			ref_is_outer = true;
4976 		else
4977 			continue;
4978 
4979 		/*
4980 		 * If we're dealing with a semi/anti join, and the FK's referenced
4981 		 * relation is on the outside, then knowledge of the FK doesn't help
4982 		 * us figure out what we need to know (which is the fraction of outer
4983 		 * rows that have matches).  On the other hand, if the referenced rel
4984 		 * is on the inside, then all outer rows must have matches in the
4985 		 * referenced table (ignoring nulls).  But any restriction or join
4986 		 * clauses that filter that table will reduce the fraction of matches.
4987 		 * We can account for restriction clauses, but it's too hard to guess
4988 		 * how many table rows would get through a join that's inside the RHS.
4989 		 * Hence, if either case applies, punt and ignore the FK.
4990 		 */
4991 		if ((jointype == JOIN_SEMI || jointype == JOIN_ANTI) &&
4992 			(ref_is_outer || bms_membership(inner_relids) != BMS_SINGLETON))
4993 			continue;
4994 
4995 		/*
4996 		 * Modify the restrictlist by removing clauses that match the FK (and
4997 		 * putting them into removedlist instead).  It seems unsafe to modify
4998 		 * the originally-passed List structure, so we make a shallow copy the
4999 		 * first time through.
5000 		 */
5001 		if (worklist == *restrictlist)
5002 			worklist = list_copy(worklist);
5003 
5004 		removedlist = NIL;
5005 		foreach(cell, worklist)
5006 		{
5007 			RestrictInfo *rinfo = (RestrictInfo *) lfirst(cell);
5008 			bool		remove_it = false;
5009 			int			i;
5010 
5011 			/* Drop this clause if it matches any column of the FK */
5012 			for (i = 0; i < fkinfo->nkeys; i++)
5013 			{
5014 				if (rinfo->parent_ec)
5015 				{
5016 					/*
5017 					 * EC-derived clauses can only match by EC.  It is okay to
5018 					 * consider any clause derived from the same EC as
5019 					 * matching the FK: even if equivclass.c chose to generate
5020 					 * a clause equating some other pair of Vars, it could
5021 					 * have generated one equating the FK's Vars.  So for
5022 					 * purposes of estimation, we can act as though it did so.
5023 					 *
5024 					 * Note: checking parent_ec is a bit of a cheat because
5025 					 * there are EC-derived clauses that don't have parent_ec
5026 					 * set; but such clauses must compare expressions that
5027 					 * aren't just Vars, so they cannot match the FK anyway.
5028 					 */
5029 					if (fkinfo->eclass[i] == rinfo->parent_ec)
5030 					{
5031 						remove_it = true;
5032 						break;
5033 					}
5034 				}
5035 				else
5036 				{
5037 					/*
5038 					 * Otherwise, see if rinfo was previously matched to FK as
5039 					 * a "loose" clause.
5040 					 */
5041 					if (list_member_ptr(fkinfo->rinfos[i], rinfo))
5042 					{
5043 						remove_it = true;
5044 						break;
5045 					}
5046 				}
5047 			}
5048 			if (remove_it)
5049 			{
5050 				worklist = foreach_delete_current(worklist, cell);
5051 				removedlist = lappend(removedlist, rinfo);
5052 			}
5053 		}
5054 
5055 		/*
5056 		 * If we failed to remove all the matching clauses we expected to
5057 		 * find, chicken out and ignore this FK; applying its selectivity
5058 		 * might result in double-counting.  Put any clauses we did manage to
5059 		 * remove back into the worklist.
5060 		 *
5061 		 * Since the matching clauses are known not outerjoin-delayed, they
5062 		 * should certainly have appeared in the initial joinclause list.  If
5063 		 * we didn't find them, they must have been matched to, and removed
5064 		 * by, some other FK in a previous iteration of this loop.  (A likely
5065 		 * case is that two FKs are matched to the same EC; there will be only
5066 		 * one EC-derived clause in the initial list, so the first FK will
5067 		 * consume it.)  Applying both FKs' selectivity independently risks
5068 		 * underestimating the join size; in particular, this would undo one
5069 		 * of the main things that ECs were invented for, namely to avoid
5070 		 * double-counting the selectivity of redundant equality conditions.
5071 		 * Later we might think of a reasonable way to combine the estimates,
5072 		 * but for now, just punt, since this is a fairly uncommon situation.
5073 		 */
5074 		if (list_length(removedlist) !=
5075 			(fkinfo->nmatched_ec + fkinfo->nmatched_ri))
5076 		{
5077 			worklist = list_concat(worklist, removedlist);
5078 			continue;
5079 		}
5080 
5081 		/*
5082 		 * Finally we get to the payoff: estimate selectivity using the
5083 		 * knowledge that each referencing row will match exactly one row in
5084 		 * the referenced table.
5085 		 *
5086 		 * XXX that's not true in the presence of nulls in the referencing
5087 		 * column(s), so in principle we should derate the estimate for those.
5088 		 * However (1) if there are any strict restriction clauses for the
5089 		 * referencing column(s) elsewhere in the query, derating here would
5090 		 * be double-counting the null fraction, and (2) it's not very clear
5091 		 * how to combine null fractions for multiple referencing columns. So
5092 		 * we do nothing for now about correcting for nulls.
5093 		 *
5094 		 * XXX another point here is that if either side of an FK constraint
5095 		 * is an inheritance parent, we estimate as though the constraint
5096 		 * covers all its children as well.  This is not an unreasonable
5097 		 * assumption for a referencing table, ie the user probably applied
5098 		 * identical constraints to all child tables (though perhaps we ought
5099 		 * to check that).  But it's not possible to have done that for a
5100 		 * referenced table.  Fortunately, precisely because that doesn't
5101 		 * work, it is uncommon in practice to have an FK referencing a parent
5102 		 * table.  So, at least for now, disregard inheritance here.
5103 		 */
5104 		if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
5105 		{
5106 			/*
5107 			 * For JOIN_SEMI and JOIN_ANTI, we only get here when the FK's
5108 			 * referenced table is exactly the inside of the join.  The join
5109 			 * selectivity is defined as the fraction of LHS rows that have
5110 			 * matches.  The FK implies that every LHS row has a match *in the
5111 			 * referenced table*; but any restriction clauses on it will
5112 			 * reduce the number of matches.  Hence we take the join
5113 			 * selectivity as equal to the selectivity of the table's
5114 			 * restriction clauses, which is rows / tuples; but we must guard
5115 			 * against tuples == 0.
5116 			 */
5117 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5118 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
5119 
5120 			fkselec *= ref_rel->rows / ref_tuples;
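			/*
			 * For instance (hypothetical numbers), if the referenced table
			 * has 10000 tuples and its restriction clauses are expected to
			 * pass 1000 rows, this contributes a factor of 0.1 to fkselec.
			 */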
5121 		}
5122 		else
5123 		{
5124 			/*
5125 			 * Otherwise, selectivity is exactly 1/referenced-table-size; but
5126 			 * guard against tuples == 0.  Note we should use the raw table
5127 			 * tuple count, not any estimate of its filtered or joined size.
5128 			 */
5129 			RelOptInfo *ref_rel = find_base_rel(root, fkinfo->ref_relid);
5130 			double		ref_tuples = Max(ref_rel->tuples, 1.0);
5131 
5132 			fkselec *= 1.0 / ref_tuples;
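			/*
			 * Hypothetical example: joining against a referenced table of
			 * 10000 tuples contributes 1/10000 = 0.0001 to fkselec, so the
			 * join is estimated to produce about one row per referencing
			 * row, as the FK guarantees.
			 */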
5133 		}
5134 	}
5135 
5136 	*restrictlist = worklist;
5137 	return fkselec;
5138 }
5139 
5140 /*
5141  * set_subquery_size_estimates
5142  *		Set the size estimates for a base relation that is a subquery.
5143  *
5144  * The rel's targetlist and restrictinfo list must have been constructed
5145  * already, and the Paths for the subquery must have been completed.
5146  * We look at the subquery's PlannerInfo to extract data.
5147  *
5148  * We set the same fields as set_baserel_size_estimates.
5149  */
5150 void
5151 set_subquery_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5152 {
5153 	PlannerInfo *subroot = rel->subroot;
5154 	RelOptInfo *sub_final_rel;
5155 	ListCell   *lc;
5156 
5157 	/* Should only be applied to base relations that are subqueries */
5158 	Assert(rel->relid > 0);
5159 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_SUBQUERY);
5160 
5161 	/*
5162 	 * Copy raw number of output rows from subquery.  All of its paths should
5163 	 * have the same output rowcount, so just look at cheapest-total.
5164 	 */
5165 	sub_final_rel = fetch_upper_rel(subroot, UPPERREL_FINAL, NULL);
5166 	rel->tuples = sub_final_rel->cheapest_total_path->rows;
5167 
5168 	/*
5169 	 * Compute per-output-column width estimates by examining the subquery's
5170 	 * targetlist.  For any output that is a plain Var, get the width estimate
5171 	 * that was made while planning the subquery.  Otherwise, we leave it to
5172 	 * set_rel_width to fill in a datatype-based default estimate.
5173 	 */
5174 	foreach(lc, subroot->parse->targetList)
5175 	{
5176 		TargetEntry *te = lfirst_node(TargetEntry, lc);
5177 		Node	   *texpr = (Node *) te->expr;
5178 		int32		item_width = 0;
5179 
5180 		/* junk columns aren't visible to upper query */
5181 		if (te->resjunk)
5182 			continue;
5183 
5184 		/*
5185 		 * The subquery could be an expansion of a view that's had columns
5186 		 * added to it since the current query was parsed, so that there are
5187 		 * non-junk tlist columns in it that don't correspond to any column
5188 		 * visible at our query level.  Ignore such columns.
5189 		 */
5190 		if (te->resno < rel->min_attr || te->resno > rel->max_attr)
5191 			continue;
5192 
5193 		/*
5194 		 * XXX This currently doesn't work for subqueries containing set
5195 		 * operations, because the Vars in their tlists are bogus references
5196 		 * to the first leaf subquery, which wouldn't give the right answer
5197 		 * even if we could still get to its PlannerInfo.
5198 		 *
5199 		 * Also, the subquery could be an appendrel for which all branches are
5200 		 * known empty due to constraint exclusion, in which case
5201 		 * set_append_rel_pathlist will have left the attr_widths set to zero.
5202 		 *
5203 		 * In either case, we just leave the width estimate zero until
5204 		 * set_rel_width fixes it.
5205 		 */
5206 		if (IsA(texpr, Var) &&
5207 			subroot->parse->setOperations == NULL)
5208 		{
5209 			Var		   *var = (Var *) texpr;
5210 			RelOptInfo *subrel = find_base_rel(subroot, var->varno);
5211 
5212 			item_width = subrel->attr_widths[var->varattno - subrel->min_attr];
5213 		}
5214 		rel->attr_widths[te->resno - rel->min_attr] = item_width;
5215 	}
5216 
5217 	/* Now estimate number of output rows, etc */
5218 	set_baserel_size_estimates(root, rel);
5219 }
5220 
5221 /*
5222  * set_function_size_estimates
5223  *		Set the size estimates for a base relation that is a function call.
5224  *
5225  * The rel's targetlist and restrictinfo list must have been constructed
5226  * already.
5227  *
5228  * We set the same fields as set_baserel_size_estimates.
5229  */
5230 void
5231 set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5232 {
5233 	RangeTblEntry *rte;
5234 	ListCell   *lc;
5235 
5236 	/* Should only be applied to base relations that are functions */
5237 	Assert(rel->relid > 0);
5238 	rte = planner_rt_fetch(rel->relid, root);
5239 	Assert(rte->rtekind == RTE_FUNCTION);
5240 
5241 	/*
5242 	 * Estimate number of rows the functions will return. The rowcount of the
5243 	 * node is that of the largest function result.
5244 	 */
5245 	rel->tuples = 0;
5246 	foreach(lc, rte->functions)
5247 	{
5248 		RangeTblFunction *rtfunc = (RangeTblFunction *) lfirst(lc);
5249 		double		ntup = expression_returns_set_rows(root, rtfunc->funcexpr);
5250 
5251 		if (ntup > rel->tuples)
5252 			rel->tuples = ntup;
5253 	}
5254 
5255 	/* Now estimate number of output rows, etc */
5256 	set_baserel_size_estimates(root, rel);
5257 }
5258 
5259 /*
5260  * set_tablefunc_size_estimates
5261  *		Set the size estimates for a base relation that is a table function.
5262  *
5263  * The rel's targetlist and restrictinfo list must have been constructed
5264  * already.
5265  *
5266  * We set the same fields as set_baserel_size_estimates.
5267  */
5268 void
5269 set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5270 {
5271 	/* Should only be applied to base relations that are functions */
5272 	Assert(rel->relid > 0);
5273 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_TABLEFUNC);
5274 
5275 	rel->tuples = 100;
5276 
5277 	/* Now estimate number of output rows, etc */
5278 	set_baserel_size_estimates(root, rel);
5279 }
5280 
5281 /*
5282  * set_values_size_estimates
5283  *		Set the size estimates for a base relation that is a values list.
5284  *
5285  * The rel's targetlist and restrictinfo list must have been constructed
5286  * already.
5287  *
5288  * We set the same fields as set_baserel_size_estimates.
5289  */
5290 void
5291 set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5292 {
5293 	RangeTblEntry *rte;
5294 
5295 	/* Should only be applied to base relations that are values lists */
5296 	Assert(rel->relid > 0);
5297 	rte = planner_rt_fetch(rel->relid, root);
5298 	Assert(rte->rtekind == RTE_VALUES);
5299 
5300 	/*
5301 	 * Estimate number of rows the values list will return. We know this
5302 	 * precisely based on the list length (well, barring set-returning
5303 	 * functions in list items, but that's a refinement not catered for
5304 	 * anywhere else either).
5305 	 */
5306 	rel->tuples = list_length(rte->values_lists);
5307 
5308 	/* Now estimate number of output rows, etc */
5309 	set_baserel_size_estimates(root, rel);
5310 }
5311 
5312 /*
5313  * set_cte_size_estimates
5314  *		Set the size estimates for a base relation that is a CTE reference.
5315  *
5316  * The rel's targetlist and restrictinfo list must have been constructed
5317  * already, and we need an estimate of the number of rows returned by the CTE
5318  * (if a regular CTE) or the non-recursive term (if a self-reference).
5319  *
5320  * We set the same fields as set_baserel_size_estimates.
5321  */
5322 void
5323 set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows)
5324 {
5325 	RangeTblEntry *rte;
5326 
5327 	/* Should only be applied to base relations that are CTE references */
5328 	Assert(rel->relid > 0);
5329 	rte = planner_rt_fetch(rel->relid, root);
5330 	Assert(rte->rtekind == RTE_CTE);
5331 
5332 	if (rte->self_reference)
5333 	{
5334 		/*
5335 		 * In a self-reference, arbitrarily assume the average worktable size
5336 		 * is about 10 times the nonrecursive term's size.
5337 		 */
5338 		rel->tuples = 10 * cte_rows;
5339 	}
5340 	else
5341 	{
5342 		/* Otherwise just believe the CTE's rowcount estimate */
5343 		rel->tuples = cte_rows;
5344 	}
5345 
5346 	/* Now estimate number of output rows, etc */
5347 	set_baserel_size_estimates(root, rel);
5348 }
5349 
5350 /*
5351  * set_namedtuplestore_size_estimates
5352  *		Set the size estimates for a base relation that is a tuplestore reference.
5353  *
5354  * The rel's targetlist and restrictinfo list must have been constructed
5355  * already.
5356  *
5357  * We set the same fields as set_baserel_size_estimates.
5358  */
5359 void
5360 set_namedtuplestore_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5361 {
5362 	RangeTblEntry *rte;
5363 
5364 	/* Should only be applied to base relations that are tuplestore references */
5365 	Assert(rel->relid > 0);
5366 	rte = planner_rt_fetch(rel->relid, root);
5367 	Assert(rte->rtekind == RTE_NAMEDTUPLESTORE);
5368 
5369 	/*
5370 	 * Use the estimate provided by the code which is generating the named
5371 	 * tuplestore.  In some cases, the actual number might be available; in
5372 	 * others the same plan will be re-used, so a "typical" value might be
5373 	 * estimated and used.
5374 	 */
5375 	rel->tuples = rte->enrtuples;
5376 	if (rel->tuples < 0)
5377 		rel->tuples = 1000;
5378 
5379 	/* Now estimate number of output rows, etc */
5380 	set_baserel_size_estimates(root, rel);
5381 }
5382 
5383 /*
5384  * set_result_size_estimates
5385  *		Set the size estimates for an RTE_RESULT base relation
5386  *
5387  * The rel's targetlist and restrictinfo list must have been constructed
5388  * already.
5389  *
5390  * We set the same fields as set_baserel_size_estimates.
5391  */
5392 void
5393 set_result_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5394 {
5395 	/* Should only be applied to RTE_RESULT base relations */
5396 	Assert(rel->relid > 0);
5397 	Assert(planner_rt_fetch(rel->relid, root)->rtekind == RTE_RESULT);
5398 
5399 	/* RTE_RESULT always generates a single row, natively */
5400 	rel->tuples = 1;
5401 
5402 	/* Now estimate number of output rows, etc */
5403 	set_baserel_size_estimates(root, rel);
5404 }
5405 
5406 /*
5407  * set_foreign_size_estimates
5408  *		Set the size estimates for a base relation that is a foreign table.
5409  *
5410  * There is not a whole lot that we can do here; the foreign-data wrapper
5411  * is responsible for producing useful estimates.  We can do a decent job
5412  * of estimating baserestrictcost, so we set that, and we also set up width
5413  * using what will be purely datatype-driven estimates from the targetlist.
5414  * There is no way to do anything sane with the rows value, so we just put
5415  * a default estimate and hope that the wrapper can improve on it.  The
5416  * wrapper's GetForeignRelSize function will be called momentarily.
5417  *
5418  * The rel's targetlist and restrictinfo list must have been constructed
5419  * already.
5420  */
5421 void
5422 set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel)
5423 {
5424 	/* Should only be applied to base relations */
5425 	Assert(rel->relid > 0);
5426 
5427 	rel->rows = 1000;			/* entirely bogus default estimate */
5428 
5429 	cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root);
5430 
5431 	set_rel_width(root, rel);
5432 }
5433 
5434 
5435 /*
5436  * set_rel_width
5437  *		Set the estimated output width of a base relation.
5438  *
5439  * The estimated output width is the sum of the per-attribute width estimates
5440  * for the actually-referenced columns, plus any PHVs or other expressions
5441  * that have to be calculated at this relation.  This is the amount of data
5442  * we'd need to pass upwards in case of a sort, hash, etc.
5443  *
5444  * This function also sets reltarget->cost, so it's a bit misnamed now.
5445  *
5446  * NB: this works best on plain relations because it prefers to look at
5447  * real Vars.  For subqueries, set_subquery_size_estimates will already have
5448  * copied up whatever per-column estimates were made within the subquery,
5449  * and for other types of rels there isn't much we can do anyway.  We fall
5450  * back on (fairly stupid) datatype-based width estimates if we can't get
5451  * any better number.
5452  *
5453  * The per-attribute width estimates are cached for possible re-use while
5454  * building join relations or post-scan/join pathtargets.
5455  */
5456 static void
5457 set_rel_width(PlannerInfo *root, RelOptInfo *rel)
5458 {
5459 	Oid			reloid = planner_rt_fetch(rel->relid, root)->relid;
5460 	int32		tuple_width = 0;
5461 	bool		have_wholerow_var = false;
5462 	ListCell   *lc;
5463 
5464 	/* Vars are assumed to have cost zero, but other exprs do not */
5465 	rel->reltarget->cost.startup = 0;
5466 	rel->reltarget->cost.per_tuple = 0;
5467 
5468 	foreach(lc, rel->reltarget->exprs)
5469 	{
5470 		Node	   *node = (Node *) lfirst(lc);
5471 
5472 		/*
5473 		 * Ordinarily, a Var in a rel's targetlist must belong to that rel;
5474 		 * but there are corner cases involving LATERAL references where that
5475 		 * isn't so.  If the Var has the wrong varno, fall through to the
5476 		 * generic case (it doesn't seem worth the trouble to be any smarter).
5477 		 */
5478 		if (IsA(node, Var) &&
5479 			((Var *) node)->varno == rel->relid)
5480 		{
5481 			Var		   *var = (Var *) node;
5482 			int			ndx;
5483 			int32		item_width;
5484 
5485 			Assert(var->varattno >= rel->min_attr);
5486 			Assert(var->varattno <= rel->max_attr);
5487 
5488 			ndx = var->varattno - rel->min_attr;
5489 
5490 			/*
5491 			 * If it's a whole-row Var, we'll deal with it below after we have
5492 			 * already cached as many attr widths as possible.
5493 			 */
5494 			if (var->varattno == 0)
5495 			{
5496 				have_wholerow_var = true;
5497 				continue;
5498 			}
5499 
5500 			/*
5501 			 * The width may have been cached already (especially if it's a
5502 			 * subquery), so don't duplicate effort.
5503 			 */
5504 			if (rel->attr_widths[ndx] > 0)
5505 			{
5506 				tuple_width += rel->attr_widths[ndx];
5507 				continue;
5508 			}
5509 
5510 			/* Try to get column width from statistics */
5511 			if (reloid != InvalidOid && var->varattno > 0)
5512 			{
5513 				item_width = get_attavgwidth(reloid, var->varattno);
5514 				if (item_width > 0)
5515 				{
5516 					rel->attr_widths[ndx] = item_width;
5517 					tuple_width += item_width;
5518 					continue;
5519 				}
5520 			}
5521 
5522 			/*
5523 			 * Not a plain relation, or can't find statistics for it. Estimate
5524 			 * using just the type info.
5525 			 */
5526 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5527 			Assert(item_width > 0);
5528 			rel->attr_widths[ndx] = item_width;
5529 			tuple_width += item_width;
5530 		}
5531 		else if (IsA(node, PlaceHolderVar))
5532 		{
5533 			/*
5534 			 * We will need to evaluate the PHV's contained expression while
5535 			 * scanning this rel, so be sure to include it in reltarget->cost.
5536 			 */
5537 			PlaceHolderVar *phv = (PlaceHolderVar *) node;
5538 			PlaceHolderInfo *phinfo = find_placeholder_info(root, phv, false);
5539 			QualCost	cost;
5540 
5541 			tuple_width += phinfo->ph_width;
5542 			cost_qual_eval_node(&cost, (Node *) phv->phexpr, root);
5543 			rel->reltarget->cost.startup += cost.startup;
5544 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5545 		}
5546 		else
5547 		{
5548 			/*
5549 			 * We could be looking at an expression pulled up from a subquery,
5550 			 * or a ROW() representing a whole-row child Var, etc.  Do what we
5551 			 * can using the expression type information.
5552 			 */
5553 			int32		item_width;
5554 			QualCost	cost;
5555 
5556 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5557 			Assert(item_width > 0);
5558 			tuple_width += item_width;
5559 			/* Not entirely clear if we need to account for cost, but do so */
5560 			cost_qual_eval_node(&cost, node, root);
5561 			rel->reltarget->cost.startup += cost.startup;
5562 			rel->reltarget->cost.per_tuple += cost.per_tuple;
5563 		}
5564 	}
5565 
5566 	/*
5567 	 * If we have a whole-row reference, estimate its width as the sum of
5568 	 * per-column widths plus heap tuple header overhead.
5569 	 */
5570 	if (have_wholerow_var)
5571 	{
5572 		int32		wholerow_width = MAXALIGN(SizeofHeapTupleHeader);
5573 
5574 		if (reloid != InvalidOid)
5575 		{
5576 			/* Real relation, so estimate true tuple width */
5577 			wholerow_width += get_relation_data_width(reloid,
5578 													  rel->attr_widths - rel->min_attr);
5579 		}
5580 		else
5581 		{
5582 			/* Do what we can with info for a phony rel */
5583 			AttrNumber	i;
5584 
5585 			for (i = 1; i <= rel->max_attr; i++)
5586 				wholerow_width += rel->attr_widths[i - rel->min_attr];
5587 		}
5588 
5589 		rel->attr_widths[0 - rel->min_attr] = wholerow_width;
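		/*
		 * Rough illustration (hypothetical widths, default 8-byte MAXALIGN):
		 * with cached attr_widths of 4, 8 and 32 summed as above,
		 * wholerow_width = MAXALIGN(23) + 44 = 24 + 44 = 68 bytes.
		 */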
5590 
5591 		/*
5592 		 * Include the whole-row Var as part of the output tuple.  Yes, that
5593 		 * really is what happens at runtime.
5594 		 */
5595 		tuple_width += wholerow_width;
5596 	}
5597 
5598 	Assert(tuple_width >= 0);
5599 	rel->reltarget->width = tuple_width;
5600 }
5601 
5602 /*
5603  * set_pathtarget_cost_width
5604  *		Set the estimated eval cost and output width of a PathTarget tlist.
5605  *
5606  * As a notational convenience, returns the same PathTarget pointer passed in.
5607  *
5608  * Most, though not quite all, uses of this function occur after we've run
5609  * set_rel_width() for base relations; so we can usually obtain cached width
5610  * estimates for Vars.  If we can't, fall back on datatype-based width
5611  * estimates.  Present early-planning uses of PathTargets don't need accurate
5612  * widths badly enough to justify going to the catalogs for better data.
5613  */
5614 PathTarget *
5615 set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target)
5616 {
5617 	int32		tuple_width = 0;
5618 	ListCell   *lc;
5619 
5620 	/* Vars are assumed to have cost zero, but other exprs do not */
5621 	target->cost.startup = 0;
5622 	target->cost.per_tuple = 0;
5623 
5624 	foreach(lc, target->exprs)
5625 	{
5626 		Node	   *node = (Node *) lfirst(lc);
5627 
5628 		if (IsA(node, Var))
5629 		{
5630 			Var		   *var = (Var *) node;
5631 			int32		item_width;
5632 
5633 			/* We should not see any upper-level Vars here */
5634 			Assert(var->varlevelsup == 0);
5635 
5636 			/* Try to get data from RelOptInfo cache */
5637 			if (var->varno < root->simple_rel_array_size)
5638 			{
5639 				RelOptInfo *rel = root->simple_rel_array[var->varno];
5640 
5641 				if (rel != NULL &&
5642 					var->varattno >= rel->min_attr &&
5643 					var->varattno <= rel->max_attr)
5644 				{
5645 					int			ndx = var->varattno - rel->min_attr;
5646 
5647 					if (rel->attr_widths[ndx] > 0)
5648 					{
5649 						tuple_width += rel->attr_widths[ndx];
5650 						continue;
5651 					}
5652 				}
5653 			}
5654 
5655 			/*
5656 			 * No cached data available, so estimate using just the type info.
5657 			 */
5658 			item_width = get_typavgwidth(var->vartype, var->vartypmod);
5659 			Assert(item_width > 0);
5660 			tuple_width += item_width;
5661 		}
5662 		else
5663 		{
5664 			/*
5665 			 * Handle general expressions using type info.
5666 			 */
5667 			int32		item_width;
5668 			QualCost	cost;
5669 
5670 			item_width = get_typavgwidth(exprType(node), exprTypmod(node));
5671 			Assert(item_width > 0);
5672 			tuple_width += item_width;
5673 
5674 			/* Account for cost, too */
5675 			cost_qual_eval_node(&cost, node, root);
5676 			target->cost.startup += cost.startup;
5677 			target->cost.per_tuple += cost.per_tuple;
5678 		}
5679 	}
5680 
5681 	Assert(tuple_width >= 0);
5682 	target->width = tuple_width;
5683 
5684 	return target;
5685 }
5686 
5687 /*
5688  * relation_byte_size
5689  *	  Estimate the storage space in bytes for a given number of tuples
5690  *	  of a given width (size in bytes).
5691  */
5692 static double
5693 relation_byte_size(double tuples, int width)
5694 {
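	/*
	 * For example, with the default 8-byte MAXALIGN and a 23-byte heap tuple
	 * header, 1000 tuples of width 100 are estimated at
	 * 1000 * (104 + 24) = 128000 bytes.
	 */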
5695 	return tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
5696 }
5697 
5698 /*
5699  * page_size
5700  *	  Returns an estimate of the number of pages covered by a given
5701  *	  number of tuples of a given width (size in bytes).
5702  */
5703 static double
5704 page_size(double tuples, int width)
5705 {
5706 	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
5707 }
5708 
5709 /*
5710  * Estimate the fraction of the work that each worker will do given the
5711  * number of workers budgeted for the path.
5712  */
5713 static double
5714 get_parallel_divisor(Path *path)
5715 {
5716 	double		parallel_divisor = path->parallel_workers;
5717 
5718 	/*
5719 	 * Early experience with parallel query suggests that when there is only
5720 	 * one worker, the leader often makes a very substantial contribution to
5721 	 * executing the parallel portion of the plan, but as more workers are
5722 	 * added, it does less and less, because it's busy reading tuples from the
5723 	 * workers and doing whatever non-parallel post-processing is needed.  By
5724 	 * the time we reach 4 workers, the leader no longer makes a meaningful
5725 	 * contribution.  Thus, for now, estimate that the leader spends 30% of
5726 	 * its time servicing each worker, and the remainder executing the
5727 	 * parallel plan.
5728 	 */
5729 	if (parallel_leader_participation)
5730 	{
5731 		double		leader_contribution;
5732 
5733 		leader_contribution = 1.0 - (0.3 * path->parallel_workers);
5734 		if (leader_contribution > 0)
5735 			parallel_divisor += leader_contribution;
5736 	}
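	/*
	 * Example (default settings): with 2 planned workers and a participating
	 * leader, parallel_divisor = 2 + (1.0 - 0.3 * 2) = 2.4, so each worker is
	 * assumed to handle about 1/2.4 of the total work.
	 */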
5737 
5738 	return parallel_divisor;
5739 }
5740 
5741 /*
5742  * compute_bitmap_pages
5743  *
5744  * compute number of pages fetched from heap in bitmap heap scan.
5745  */
5746 double
5747 compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
5748 					 int loop_count, Cost *cost, double *tuple)
5749 {
5750 	Cost		indexTotalCost;
5751 	Selectivity indexSelectivity;
5752 	double		T;
5753 	double		pages_fetched;
5754 	double		tuples_fetched;
5755 	double		heap_pages;
5756 	long		maxentries;
5757 
5758 	/*
5759 	 * Fetch total cost of obtaining the bitmap, as well as its total
5760 	 * selectivity.
5761 	 */
5762 	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
5763 
5764 	/*
5765 	 * Estimate number of main-table pages fetched.
5766 	 */
5767 	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
5768 
5769 	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
5770 
5771 	/*
5772 	 * For a single scan, the number of heap pages that need to be fetched is
5773 	 * the same as the Mackert and Lohman formula for the case T <= b (ie, no
5774 	 * re-reads needed).
5775 	 */
5776 	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
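	/*
	 * For instance (hypothetical numbers), with T = 1000 pages and
	 * tuples_fetched = 500 this gives 2*1000*500 / (2*1000 + 500) = 400
	 * pages fetched.
	 */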
5777 
5778 	/*
5779 	 * Calculate the number of pages fetched from the heap.  Then based on
5780 	 * current work_mem estimate get the estimated maxentries in the bitmap.
5781 	 * (Note that we always do this calculation based on the number of pages
5782 	 * that would be fetched in a single iteration, even if loop_count > 1.
5783 	 * That's correct, because only that number of entries will be stored in
5784 	 * the bitmap at one time.)
5785 	 */
5786 	heap_pages = Min(pages_fetched, baserel->pages);
5787 	maxentries = tbm_calculate_entries(work_mem * 1024L);
5788 
5789 	if (loop_count > 1)
5790 	{
5791 		/*
5792 		 * For repeated bitmap scans, scale up the number of tuples fetched in
5793 		 * the Mackert and Lohman formula by the number of scans, so that we
5794 		 * estimate the number of pages fetched by all the scans. Then
5795 		 * pro-rate for one scan.
5796 		 */
5797 		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
5798 											baserel->pages,
5799 											get_indexpath_pages(bitmapqual),
5800 											root);
5801 		pages_fetched /= loop_count;
5802 	}
5803 
5804 	if (pages_fetched >= T)
5805 		pages_fetched = T;
5806 	else
5807 		pages_fetched = ceil(pages_fetched);
5808 
5809 	if (maxentries < heap_pages)
5810 	{
5811 		double		exact_pages;
5812 		double		lossy_pages;
5813 
5814 		/*
5815 		 * Crude approximation of the number of lossy pages.  Because of the
5816 		 * way tbm_lossify() is coded, the number of lossy pages increases
5817 		 * very sharply as soon as we run short of memory; this formula has
5818 		 * that property and seems to perform adequately in testing, but it's
5819 		 * possible we could do better somehow.
5820 		 */
5821 		lossy_pages = Max(0, heap_pages - maxentries / 2);
5822 		exact_pages = heap_pages - lossy_pages;
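		/*
		 * Hypothetical example: with heap_pages = 1000 and maxentries = 800,
		 * lossy_pages = 1000 - 800/2 = 600 and exact_pages = 400.
		 */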
5823 
5824 		/*
5825 		 * If there are lossy pages then recompute the number of tuples
5826 		 * processed by the bitmap heap node.  We assume here that the chance
5827 		 * of a given tuple coming from an exact page is the same as the
5828 		 * chance that a given page is exact.  This might not be true, but
5829 		 * it's not clear how we can do any better.
5830 		 */
5831 		if (lossy_pages > 0)
5832 			tuples_fetched =
5833 				clamp_row_est(indexSelectivity *
5834 							  (exact_pages / heap_pages) * baserel->tuples +
5835 							  (lossy_pages / heap_pages) * baserel->tuples);
5836 	}
5837 
5838 	if (cost)
5839 		*cost = indexTotalCost;
5840 	if (tuple)
5841 		*tuple = tuples_fetched;
5842 
5843 	return pages_fetched;
5844 }
5845