1 /*-------------------------------------------------------------------------
2  *
3  * analyzejoins.c
4  *	  Routines for simplifying joins after initial query analysis
5  *
6  * While we do a great deal of join simplification in prep/prepjointree.c,
7  * certain optimizations cannot be performed at that stage for lack of
8  * detailed information about the query.  The routines here are invoked
9  * after initsplan.c has done its work, and can do additional join removal
10  * and simplification steps based on the information extracted.  The penalty
11  * is that we have to work harder to clean up after ourselves when we modify
12  * the query, since the derived data structures have to be updated too.
13  *
14  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
15  * Portions Copyright (c) 1994, Regents of the University of California
16  *
17  *
18  * IDENTIFICATION
19  *	  src/backend/optimizer/plan/analyzejoins.c
20  *
21  *-------------------------------------------------------------------------
22  */
23 #include "postgres.h"
24 
25 #include "nodes/nodeFuncs.h"
26 #include "optimizer/clauses.h"
27 #include "optimizer/joininfo.h"
28 #include "optimizer/pathnode.h"
29 #include "optimizer/paths.h"
30 #include "optimizer/planmain.h"
31 #include "optimizer/tlist.h"
32 #include "optimizer/var.h"
33 #include "utils/lsyscache.h"
34 
35 /* local functions */
36 static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
37 static void remove_rel_from_query(PlannerInfo *root, int relid,
38 					  Relids joinrelids);
39 static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
40 static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel);
41 static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel,
42 					List *clause_list);
43 static Oid	distinct_col_search(int colno, List *colnos, List *opids);
44 static bool is_innerrel_unique_for(PlannerInfo *root,
45 					   Relids outerrelids,
46 					   RelOptInfo *innerrel,
47 					   JoinType jointype,
48 					   List *restrictlist);
49 
50 
51 /*
52  * remove_useless_joins
53  *		Check for relations that don't actually need to be joined at all,
54  *		and remove them from the query.
55  *
56  * We are passed the current joinlist and return the updated list.  Other
57  * data structures that have to be updated are accessible via "root".
58  */
59 List *
remove_useless_joins(PlannerInfo * root,List * joinlist)60 remove_useless_joins(PlannerInfo *root, List *joinlist)
61 {
62 	ListCell   *lc;
63 
64 	/*
65 	 * We are only interested in relations that are left-joined to, so we can
66 	 * scan the join_info_list to find them easily.
67 	 */
68 restart:
69 	foreach(lc, root->join_info_list)
70 	{
71 		SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
72 		int			innerrelid;
73 		int			nremoved;
74 
75 		/* Skip if not removable */
76 		if (!join_is_removable(root, sjinfo))
77 			continue;
78 
79 		/*
80 		 * Currently, join_is_removable can only succeed when the sjinfo's
81 		 * righthand is a single baserel.  Remove that rel from the query and
82 		 * joinlist.
83 		 */
84 		innerrelid = bms_singleton_member(sjinfo->min_righthand);
85 
86 		remove_rel_from_query(root, innerrelid,
87 							  bms_union(sjinfo->min_lefthand,
88 										sjinfo->min_righthand));
89 
90 		/* We verify that exactly one reference gets removed from joinlist */
91 		nremoved = 0;
92 		joinlist = remove_rel_from_joinlist(joinlist, innerrelid, &nremoved);
93 		if (nremoved != 1)
94 			elog(ERROR, "failed to find relation %d in joinlist", innerrelid);
95 
96 		/*
97 		 * We can delete this SpecialJoinInfo from the list too, since it's no
98 		 * longer of interest.
99 		 */
100 		root->join_info_list = list_delete_ptr(root->join_info_list, sjinfo);
101 
102 		/*
103 		 * Restart the scan.  This is necessary to ensure we find all
104 		 * removable joins independently of ordering of the join_info_list
105 		 * (note that removal of attr_needed bits may make a join appear
106 		 * removable that did not before).  Also, since we just deleted the
107 		 * current list cell, we'd have to have some kluge to continue the
108 		 * list scan anyway.
109 		 */
110 		goto restart;
111 	}
112 
113 	return joinlist;
114 }
115 
116 /*
117  * clause_sides_match_join
118  *	  Determine whether a join clause is of the right form to use in this join.
119  *
120  * We already know that the clause is a binary opclause referencing only the
121  * rels in the current join.  The point here is to check whether it has the
122  * form "outerrel_expr op innerrel_expr" or "innerrel_expr op outerrel_expr",
123  * rather than mixing outer and inner vars on either side.  If it matches,
124  * we set the transient flag outer_is_left to identify which side is which.
125  */
126 static inline bool
clause_sides_match_join(RestrictInfo * rinfo,Relids outerrelids,Relids innerrelids)127 clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids,
128 						Relids innerrelids)
129 {
130 	if (bms_is_subset(rinfo->left_relids, outerrelids) &&
131 		bms_is_subset(rinfo->right_relids, innerrelids))
132 	{
133 		/* lefthand side is outer */
134 		rinfo->outer_is_left = true;
135 		return true;
136 	}
137 	else if (bms_is_subset(rinfo->left_relids, innerrelids) &&
138 			 bms_is_subset(rinfo->right_relids, outerrelids))
139 	{
140 		/* righthand side is outer */
141 		rinfo->outer_is_left = false;
142 		return true;
143 	}
144 	return false;				/* no good for these input relations */
145 }
146 
147 /*
148  * join_is_removable
149  *	  Check whether we need not perform this special join at all, because
150  *	  it will just duplicate its left input.
151  *
152  * This is true for a left join for which the join condition cannot match
153  * more than one inner-side row.  (There are other possibly interesting
154  * cases, but we don't have the infrastructure to prove them.)  We also
155  * have to check that the inner side doesn't generate any variables needed
156  * above the join.
157  */
158 static bool
join_is_removable(PlannerInfo * root,SpecialJoinInfo * sjinfo)159 join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
160 {
161 	int			innerrelid;
162 	RelOptInfo *innerrel;
163 	Relids		joinrelids;
164 	List	   *clause_list = NIL;
165 	ListCell   *l;
166 	int			attroff;
167 
168 	/*
169 	 * Must be a non-delaying left join to a single baserel, else we aren't
170 	 * going to be able to do anything with it.
171 	 */
172 	if (sjinfo->jointype != JOIN_LEFT ||
173 		sjinfo->delay_upper_joins)
174 		return false;
175 
176 	if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
177 		return false;
178 
179 	innerrel = find_base_rel(root, innerrelid);
180 
181 	/*
182 	 * Before we go to the effort of checking whether any innerrel variables
183 	 * are needed above the join, make a quick check to eliminate cases in
184 	 * which we will surely be unable to prove uniqueness of the innerrel.
185 	 */
186 	if (!rel_supports_distinctness(root, innerrel))
187 		return false;
188 
189 	/* Compute the relid set for the join we are considering */
190 	joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
191 
192 	/*
193 	 * We can't remove the join if any inner-rel attributes are used above the
194 	 * join.
195 	 *
196 	 * Note that this test only detects use of inner-rel attributes in higher
197 	 * join conditions and the target list.  There might be such attributes in
198 	 * pushed-down conditions at this join, too.  We check that case below.
199 	 *
200 	 * As a micro-optimization, it seems better to start with max_attr and
201 	 * count down rather than starting with min_attr and counting up, on the
202 	 * theory that the system attributes are somewhat less likely to be wanted
203 	 * and should be tested last.
204 	 */
205 	for (attroff = innerrel->max_attr - innerrel->min_attr;
206 		 attroff >= 0;
207 		 attroff--)
208 	{
209 		if (!bms_is_subset(innerrel->attr_needed[attroff], joinrelids))
210 			return false;
211 	}
212 
213 	/*
214 	 * Similarly check that the inner rel isn't needed by any PlaceHolderVars
215 	 * that will be used above the join.  We only need to fail if such a PHV
216 	 * actually references some inner-rel attributes; but the correct check
217 	 * for that is relatively expensive, so we first check against ph_eval_at,
218 	 * which must mention the inner rel if the PHV uses any inner-rel attrs as
219 	 * non-lateral references.  Note that if the PHV's syntactic scope is just
220 	 * the inner rel, we can't drop the rel even if the PHV is variable-free.
221 	 */
222 	foreach(l, root->placeholder_list)
223 	{
224 		PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
225 
226 		if (bms_overlap(phinfo->ph_lateral, innerrel->relids))
227 			return false;		/* it references innerrel laterally */
228 		if (bms_is_subset(phinfo->ph_needed, joinrelids))
229 			continue;			/* PHV is not used above the join */
230 		if (!bms_overlap(phinfo->ph_eval_at, innerrel->relids))
231 			continue;			/* it definitely doesn't reference innerrel */
232 		if (bms_is_subset(phinfo->ph_eval_at, innerrel->relids))
233 			return false;		/* there isn't any other place to eval PHV */
234 		if (bms_overlap(pull_varnos((Node *) phinfo->ph_var->phexpr),
235 						innerrel->relids))
236 			return false;		/* it does reference innerrel */
237 	}
238 
239 	/*
240 	 * Search for mergejoinable clauses that constrain the inner rel against
241 	 * either the outer rel or a pseudoconstant.  If an operator is
242 	 * mergejoinable then it behaves like equality for some btree opclass, so
243 	 * it's what we want.  The mergejoinability test also eliminates clauses
244 	 * containing volatile functions, which we couldn't depend on.
245 	 */
246 	foreach(l, innerrel->joininfo)
247 	{
248 		RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l);
249 
250 		/*
251 		 * If it's not a join clause for this outer join, we can't use it.
252 		 * Note that if the clause is pushed-down, then it is logically from
253 		 * above the outer join, even if it references no other rels (it might
254 		 * be from WHERE, for example).
255 		 */
256 		if (RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
257 		{
258 			/*
259 			 * If such a clause actually references the inner rel then join
260 			 * removal has to be disallowed.  We have to check this despite
261 			 * the previous attr_needed checks because of the possibility of
262 			 * pushed-down clauses referencing the rel.
263 			 */
264 			if (bms_is_member(innerrelid, restrictinfo->clause_relids))
265 				return false;
266 			continue;			/* else, ignore; not useful here */
267 		}
268 
269 		/* Ignore if it's not a mergejoinable clause */
270 		if (!restrictinfo->can_join ||
271 			restrictinfo->mergeopfamilies == NIL)
272 			continue;			/* not mergejoinable */
273 
274 		/*
275 		 * Check if clause has the form "outer op inner" or "inner op outer",
276 		 * and if so mark which side is inner.
277 		 */
278 		if (!clause_sides_match_join(restrictinfo, sjinfo->min_lefthand,
279 									 innerrel->relids))
280 			continue;			/* no good for these input relations */
281 
282 		/* OK, add to list */
283 		clause_list = lappend(clause_list, restrictinfo);
284 	}
285 
286 	/*
287 	 * Now that we have the relevant equality join clauses, try to prove the
288 	 * innerrel distinct.
289 	 */
290 	if (rel_is_distinct_for(root, innerrel, clause_list))
291 		return true;
292 
293 	/*
294 	 * Some day it would be nice to check for other methods of establishing
295 	 * distinctness.
296 	 */
297 	return false;
298 }
299 
300 
301 /*
302  * Remove the target relid from the planner's data structures, having
303  * determined that there is no need to include it in the query.
304  *
305  * We are not terribly thorough here.  We must make sure that the rel is
306  * no longer treated as a baserel, and that attributes of other baserels
307  * are no longer marked as being needed at joins involving this rel.
308  * Also, join quals involving the rel have to be removed from the joininfo
309  * lists, but only if they belong to the outer join identified by joinrelids.
310  */
311 static void
remove_rel_from_query(PlannerInfo * root,int relid,Relids joinrelids)312 remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids)
313 {
314 	RelOptInfo *rel = find_base_rel(root, relid);
315 	List	   *joininfos;
316 	Index		rti;
317 	ListCell   *l;
318 	ListCell   *nextl;
319 
320 	/*
321 	 * Mark the rel as "dead" to show it is no longer part of the join tree.
322 	 * (Removing it from the baserel array altogether seems too risky.)
323 	 */
324 	rel->reloptkind = RELOPT_DEADREL;
325 
326 	/*
327 	 * Remove references to the rel from other baserels' attr_needed arrays.
328 	 */
329 	for (rti = 1; rti < root->simple_rel_array_size; rti++)
330 	{
331 		RelOptInfo *otherrel = root->simple_rel_array[rti];
332 		int			attroff;
333 
334 		/* there may be empty slots corresponding to non-baserel RTEs */
335 		if (otherrel == NULL)
336 			continue;
337 
338 		Assert(otherrel->relid == rti); /* sanity check on array */
339 
340 		/* no point in processing target rel itself */
341 		if (otherrel == rel)
342 			continue;
343 
344 		for (attroff = otherrel->max_attr - otherrel->min_attr;
345 			 attroff >= 0;
346 			 attroff--)
347 		{
348 			otherrel->attr_needed[attroff] =
349 				bms_del_member(otherrel->attr_needed[attroff], relid);
350 		}
351 	}
352 
353 	/*
354 	 * Likewise remove references from SpecialJoinInfo data structures.
355 	 *
356 	 * This is relevant in case the outer join we're deleting is nested inside
357 	 * other outer joins: the upper joins' relid sets have to be adjusted. The
358 	 * RHS of the target outer join will be made empty here, but that's OK
359 	 * since caller will delete that SpecialJoinInfo entirely.
360 	 */
361 	foreach(l, root->join_info_list)
362 	{
363 		SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l);
364 
365 		sjinfo->min_lefthand = bms_del_member(sjinfo->min_lefthand, relid);
366 		sjinfo->min_righthand = bms_del_member(sjinfo->min_righthand, relid);
367 		sjinfo->syn_lefthand = bms_del_member(sjinfo->syn_lefthand, relid);
368 		sjinfo->syn_righthand = bms_del_member(sjinfo->syn_righthand, relid);
369 	}
370 
371 	/*
372 	 * Likewise remove references from PlaceHolderVar data structures,
373 	 * removing any no-longer-needed placeholders entirely.
374 	 *
375 	 * Removal is a bit tricker than it might seem: we can remove PHVs that
376 	 * are used at the target rel and/or in the join qual, but not those that
377 	 * are used at join partner rels or above the join.  It's not that easy to
378 	 * distinguish PHVs used at partner rels from those used in the join qual,
379 	 * since they will both have ph_needed sets that are subsets of
380 	 * joinrelids.  However, a PHV used at a partner rel could not have the
381 	 * target rel in ph_eval_at, so we check that while deciding whether to
382 	 * remove or just update the PHV.  There is no corresponding test in
383 	 * join_is_removable because it doesn't need to distinguish those cases.
384 	 */
385 	for (l = list_head(root->placeholder_list); l != NULL; l = nextl)
386 	{
387 		PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l);
388 
389 		nextl = lnext(l);
390 		Assert(!bms_is_member(relid, phinfo->ph_lateral));
391 		if (bms_is_subset(phinfo->ph_needed, joinrelids) &&
392 			bms_is_member(relid, phinfo->ph_eval_at))
393 			root->placeholder_list = list_delete_ptr(root->placeholder_list,
394 													 phinfo);
395 		else
396 		{
397 			phinfo->ph_eval_at = bms_del_member(phinfo->ph_eval_at, relid);
398 			Assert(!bms_is_empty(phinfo->ph_eval_at));
399 			phinfo->ph_needed = bms_del_member(phinfo->ph_needed, relid);
400 		}
401 	}
402 
403 	/*
404 	 * Remove any joinquals referencing the rel from the joininfo lists.
405 	 *
406 	 * In some cases, a joinqual has to be put back after deleting its
407 	 * reference to the target rel.  This can occur for pseudoconstant and
408 	 * outerjoin-delayed quals, which can get marked as requiring the rel in
409 	 * order to force them to be evaluated at or above the join.  We can't
410 	 * just discard them, though.  Only quals that logically belonged to the
411 	 * outer join being discarded should be removed from the query.
412 	 *
413 	 * We must make a copy of the rel's old joininfo list before starting the
414 	 * loop, because otherwise remove_join_clause_from_rels would destroy the
415 	 * list while we're scanning it.
416 	 */
417 	joininfos = list_copy(rel->joininfo);
418 	foreach(l, joininfos)
419 	{
420 		RestrictInfo *rinfo = (RestrictInfo *) lfirst(l);
421 
422 		remove_join_clause_from_rels(root, rinfo, rinfo->required_relids);
423 
424 		if (RINFO_IS_PUSHED_DOWN(rinfo, joinrelids))
425 		{
426 			/* Recheck that qual doesn't actually reference the target rel */
427 			Assert(!bms_is_member(relid, rinfo->clause_relids));
428 
429 			/*
430 			 * The required_relids probably aren't shared with anything else,
431 			 * but let's copy them just to be sure.
432 			 */
433 			rinfo->required_relids = bms_copy(rinfo->required_relids);
434 			rinfo->required_relids = bms_del_member(rinfo->required_relids,
435 													relid);
436 			distribute_restrictinfo_to_rels(root, rinfo);
437 		}
438 	}
439 
440 	/*
441 	 * There may be references to the rel in root->fkey_list, but if so,
442 	 * match_foreign_keys_to_quals() will get rid of them.
443 	 */
444 }
445 
446 /*
447  * Remove any occurrences of the target relid from a joinlist structure.
448  *
449  * It's easiest to build a whole new list structure, so we handle it that
450  * way.  Efficiency is not a big deal here.
451  *
452  * *nremoved is incremented by the number of occurrences removed (there
453  * should be exactly one, but the caller checks that).
454  */
455 static List *
remove_rel_from_joinlist(List * joinlist,int relid,int * nremoved)456 remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved)
457 {
458 	List	   *result = NIL;
459 	ListCell   *jl;
460 
461 	foreach(jl, joinlist)
462 	{
463 		Node	   *jlnode = (Node *) lfirst(jl);
464 
465 		if (IsA(jlnode, RangeTblRef))
466 		{
467 			int			varno = ((RangeTblRef *) jlnode)->rtindex;
468 
469 			if (varno == relid)
470 				(*nremoved)++;
471 			else
472 				result = lappend(result, jlnode);
473 		}
474 		else if (IsA(jlnode, List))
475 		{
476 			/* Recurse to handle subproblem */
477 			List	   *sublist;
478 
479 			sublist = remove_rel_from_joinlist((List *) jlnode,
480 											   relid, nremoved);
481 			/* Avoid including empty sub-lists in the result */
482 			if (sublist)
483 				result = lappend(result, sublist);
484 		}
485 		else
486 		{
487 			elog(ERROR, "unrecognized joinlist node type: %d",
488 				 (int) nodeTag(jlnode));
489 		}
490 	}
491 
492 	return result;
493 }
494 
495 
496 /*
497  * reduce_unique_semijoins
498  *		Check for semijoins that can be simplified to plain inner joins
499  *		because the inner relation is provably unique for the join clauses.
500  *
501  * Ideally this would happen during reduce_outer_joins, but we don't have
502  * enough information at that point.
503  *
504  * To perform the strength reduction when applicable, we need only delete
505  * the semijoin's SpecialJoinInfo from root->join_info_list.  (We don't
506  * bother fixing the join type attributed to it in the query jointree,
507  * since that won't be consulted again.)
508  */
509 void
reduce_unique_semijoins(PlannerInfo * root)510 reduce_unique_semijoins(PlannerInfo *root)
511 {
512 	ListCell   *lc;
513 	ListCell   *next;
514 
515 	/*
516 	 * Scan the join_info_list to find semijoins.  We can't use foreach
517 	 * because we may delete the current cell.
518 	 */
519 	for (lc = list_head(root->join_info_list); lc != NULL; lc = next)
520 	{
521 		SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(lc);
522 		int			innerrelid;
523 		RelOptInfo *innerrel;
524 		Relids		joinrelids;
525 		List	   *restrictlist;
526 
527 		next = lnext(lc);
528 
529 		/*
530 		 * Must be a non-delaying semijoin to a single baserel, else we aren't
531 		 * going to be able to do anything with it.  (It's probably not
532 		 * possible for delay_upper_joins to be set on a semijoin, but we
533 		 * might as well check.)
534 		 */
535 		if (sjinfo->jointype != JOIN_SEMI ||
536 			sjinfo->delay_upper_joins)
537 			continue;
538 
539 		if (!bms_get_singleton_member(sjinfo->min_righthand, &innerrelid))
540 			continue;
541 
542 		innerrel = find_base_rel(root, innerrelid);
543 
544 		/*
545 		 * Before we trouble to run generate_join_implied_equalities, make a
546 		 * quick check to eliminate cases in which we will surely be unable to
547 		 * prove uniqueness of the innerrel.
548 		 */
549 		if (!rel_supports_distinctness(root, innerrel))
550 			continue;
551 
552 		/* Compute the relid set for the join we are considering */
553 		joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
554 
555 		/*
556 		 * Since we're only considering a single-rel RHS, any join clauses it
557 		 * has must be clauses linking it to the semijoin's min_lefthand.  We
558 		 * can also consider EC-derived join clauses.
559 		 */
560 		restrictlist =
561 			list_concat(generate_join_implied_equalities(root,
562 														 joinrelids,
563 														 sjinfo->min_lefthand,
564 														 innerrel),
565 						innerrel->joininfo);
566 
567 		/* Test whether the innerrel is unique for those clauses. */
568 		if (!innerrel_is_unique(root, sjinfo->min_lefthand, innerrel,
569 								JOIN_SEMI, restrictlist, true))
570 			continue;
571 
572 		/* OK, remove the SpecialJoinInfo from the list. */
573 		root->join_info_list = list_delete_ptr(root->join_info_list, sjinfo);
574 	}
575 }
576 
577 
578 /*
579  * rel_supports_distinctness
580  *		Could the relation possibly be proven distinct on some set of columns?
581  *
582  * This is effectively a pre-checking function for rel_is_distinct_for().
583  * It must return TRUE if rel_is_distinct_for() could possibly return TRUE
584  * with this rel, but it should not expend a lot of cycles.  The idea is
585  * that callers can avoid doing possibly-expensive processing to compute
586  * rel_is_distinct_for()'s argument lists if the call could not possibly
587  * succeed.
588  */
589 static bool
rel_supports_distinctness(PlannerInfo * root,RelOptInfo * rel)590 rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
591 {
592 	/* We only know about baserels ... */
593 	if (rel->reloptkind != RELOPT_BASEREL)
594 		return false;
595 	if (rel->rtekind == RTE_RELATION)
596 	{
597 		/*
598 		 * For a plain relation, we only know how to prove uniqueness by
599 		 * reference to unique indexes.  Make sure there's at least one
600 		 * suitable unique index.  It must be immediately enforced, and if
601 		 * it's a partial index, it must match the query.  (Keep these
602 		 * conditions in sync with relation_has_unique_index_for!)
603 		 */
604 		ListCell   *lc;
605 
606 		foreach(lc, rel->indexlist)
607 		{
608 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
609 
610 			if (ind->unique && ind->immediate &&
611 				(ind->indpred == NIL || ind->predOK))
612 				return true;
613 		}
614 	}
615 	else if (rel->rtekind == RTE_SUBQUERY)
616 	{
617 		Query	   *subquery = root->simple_rte_array[rel->relid]->subquery;
618 
619 		/* Check if the subquery has any qualities that support distinctness */
620 		if (query_supports_distinctness(subquery))
621 			return true;
622 	}
623 	/* We have no proof rules for any other rtekinds. */
624 	return false;
625 }
626 
627 /*
628  * rel_is_distinct_for
629  *		Does the relation return only distinct rows according to clause_list?
630  *
631  * clause_list is a list of join restriction clauses involving this rel and
632  * some other one.  Return true if no two rows emitted by this rel could
633  * possibly join to the same row of the other rel.
634  *
635  * The caller must have already determined that each condition is a
636  * mergejoinable equality with an expression in this relation on one side, and
637  * an expression not involving this relation on the other.  The transient
638  * outer_is_left flag is used to identify which side references this relation:
639  * left side if outer_is_left is false, right side if it is true.
640  *
641  * Note that the passed-in clause_list may be destructively modified!  This
642  * is OK for current uses, because the clause_list is built by the caller for
643  * the sole purpose of passing to this function.
644  */
645 static bool
rel_is_distinct_for(PlannerInfo * root,RelOptInfo * rel,List * clause_list)646 rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list)
647 {
648 	/*
649 	 * We could skip a couple of tests here if we assume all callers checked
650 	 * rel_supports_distinctness first, but it doesn't seem worth taking any
651 	 * risk for.
652 	 */
653 	if (rel->reloptkind != RELOPT_BASEREL)
654 		return false;
655 	if (rel->rtekind == RTE_RELATION)
656 	{
657 		/*
658 		 * Examine the indexes to see if we have a matching unique index.
659 		 * relation_has_unique_index_for automatically adds any usable
660 		 * restriction clauses for the rel, so we needn't do that here.
661 		 */
662 		if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL))
663 			return true;
664 	}
665 	else if (rel->rtekind == RTE_SUBQUERY)
666 	{
667 		Index		relid = rel->relid;
668 		Query	   *subquery = root->simple_rte_array[relid]->subquery;
669 		List	   *colnos = NIL;
670 		List	   *opids = NIL;
671 		ListCell   *l;
672 
673 		/*
674 		 * Build the argument lists for query_is_distinct_for: a list of
675 		 * output column numbers that the query needs to be distinct over, and
676 		 * a list of equality operators that the output columns need to be
677 		 * distinct according to.
678 		 *
679 		 * (XXX we are not considering restriction clauses attached to the
680 		 * subquery; is that worth doing?)
681 		 */
682 		foreach(l, clause_list)
683 		{
684 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, l);
685 			Oid			op;
686 			Var		   *var;
687 
688 			/*
689 			 * Get the equality operator we need uniqueness according to.
690 			 * (This might be a cross-type operator and thus not exactly the
691 			 * same operator the subquery would consider; that's all right
692 			 * since query_is_distinct_for can resolve such cases.)  The
693 			 * caller's mergejoinability test should have selected only
694 			 * OpExprs.
695 			 */
696 			op = castNode(OpExpr, rinfo->clause)->opno;
697 
698 			/* caller identified the inner side for us */
699 			if (rinfo->outer_is_left)
700 				var = (Var *) get_rightop(rinfo->clause);
701 			else
702 				var = (Var *) get_leftop(rinfo->clause);
703 
704 			/*
705 			 * We may ignore any RelabelType node above the operand.  (There
706 			 * won't be more than one, since eval_const_expressions() has been
707 			 * applied already.)
708 			 */
709 			if (var && IsA(var, RelabelType))
710 				var = (Var *) ((RelabelType *) var)->arg;
711 
712 			/*
713 			 * If inner side isn't a Var referencing a subquery output column,
714 			 * this clause doesn't help us.
715 			 */
716 			if (!var || !IsA(var, Var) ||
717 				var->varno != relid || var->varlevelsup != 0)
718 				continue;
719 
720 			colnos = lappend_int(colnos, var->varattno);
721 			opids = lappend_oid(opids, op);
722 		}
723 
724 		if (query_is_distinct_for(subquery, colnos, opids))
725 			return true;
726 	}
727 	return false;
728 }
729 
730 
731 /*
732  * query_supports_distinctness - could the query possibly be proven distinct
733  *		on some set of output columns?
734  *
735  * This is effectively a pre-checking function for query_is_distinct_for().
736  * It must return TRUE if query_is_distinct_for() could possibly return TRUE
737  * with this query, but it should not expend a lot of cycles.  The idea is
738  * that callers can avoid doing possibly-expensive processing to compute
739  * query_is_distinct_for()'s argument lists if the call could not possibly
740  * succeed.
741  */
742 bool
query_supports_distinctness(Query * query)743 query_supports_distinctness(Query *query)
744 {
745 	/* SRFs break distinctness except with DISTINCT, see below */
746 	if (query->hasTargetSRFs && query->distinctClause == NIL)
747 		return false;
748 
749 	/* check for features we can prove distinctness with */
750 	if (query->distinctClause != NIL ||
751 		query->groupClause != NIL ||
752 		query->groupingSets != NIL ||
753 		query->hasAggs ||
754 		query->havingQual ||
755 		query->setOperations)
756 		return true;
757 
758 	return false;
759 }
760 
761 /*
762  * query_is_distinct_for - does query never return duplicates of the
763  *		specified columns?
764  *
765  * query is a not-yet-planned subquery (in current usage, it's always from
766  * a subquery RTE, which the planner avoids scribbling on).
767  *
768  * colnos is an integer list of output column numbers (resno's).  We are
769  * interested in whether rows consisting of just these columns are certain
770  * to be distinct.  "Distinctness" is defined according to whether the
771  * corresponding upper-level equality operators listed in opids would think
772  * the values are distinct.  (Note: the opids entries could be cross-type
773  * operators, and thus not exactly the equality operators that the subquery
774  * would use itself.  We use equality_ops_are_compatible() to check
775  * compatibility.  That looks at btree or hash opfamily membership, and so
776  * should give trustworthy answers for all operators that we might need
777  * to deal with here.)
778  */
779 bool
query_is_distinct_for(Query * query,List * colnos,List * opids)780 query_is_distinct_for(Query *query, List *colnos, List *opids)
781 {
782 	ListCell   *l;
783 	Oid			opid;
784 
785 	Assert(list_length(colnos) == list_length(opids));
786 
787 	/*
788 	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
789 	 * columns in the DISTINCT clause appear in colnos and operator semantics
790 	 * match.  This is true even if there are SRFs in the DISTINCT columns or
791 	 * elsewhere in the tlist.
792 	 */
793 	if (query->distinctClause)
794 	{
795 		foreach(l, query->distinctClause)
796 		{
797 			SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
798 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
799 													   query->targetList);
800 
801 			opid = distinct_col_search(tle->resno, colnos, opids);
802 			if (!OidIsValid(opid) ||
803 				!equality_ops_are_compatible(opid, sgc->eqop))
804 				break;			/* exit early if no match */
805 		}
806 		if (l == NULL)			/* had matches for all? */
807 			return true;
808 	}
809 
810 	/*
811 	 * Otherwise, a set-returning function in the query's targetlist can
812 	 * result in returning duplicate rows, despite any grouping that might
813 	 * occur before tlist evaluation.  (If all tlist SRFs are within GROUP BY
814 	 * columns, it would be safe because they'd be expanded before grouping.
815 	 * But it doesn't currently seem worth the effort to check for that.)
816 	 */
817 	if (query->hasTargetSRFs)
818 		return false;
819 
820 	/*
821 	 * Similarly, GROUP BY without GROUPING SETS guarantees uniqueness if all
822 	 * the grouped columns appear in colnos and operator semantics match.
823 	 */
824 	if (query->groupClause && !query->groupingSets)
825 	{
826 		foreach(l, query->groupClause)
827 		{
828 			SortGroupClause *sgc = (SortGroupClause *) lfirst(l);
829 			TargetEntry *tle = get_sortgroupclause_tle(sgc,
830 													   query->targetList);
831 
832 			opid = distinct_col_search(tle->resno, colnos, opids);
833 			if (!OidIsValid(opid) ||
834 				!equality_ops_are_compatible(opid, sgc->eqop))
835 				break;			/* exit early if no match */
836 		}
837 		if (l == NULL)			/* had matches for all? */
838 			return true;
839 	}
840 	else if (query->groupingSets)
841 	{
842 		/*
843 		 * If we have grouping sets with expressions, we probably don't have
844 		 * uniqueness and analysis would be hard. Punt.
845 		 */
846 		if (query->groupClause)
847 			return false;
848 
849 		/*
850 		 * If we have no groupClause (therefore no grouping expressions), we
851 		 * might have one or many empty grouping sets. If there's just one,
852 		 * then we're returning only one row and are certainly unique. But
853 		 * otherwise, we know we're certainly not unique.
854 		 */
855 		if (list_length(query->groupingSets) == 1 &&
856 			((GroupingSet *) linitial(query->groupingSets))->kind == GROUPING_SET_EMPTY)
857 			return true;
858 		else
859 			return false;
860 	}
861 	else
862 	{
863 		/*
864 		 * If we have no GROUP BY, but do have aggregates or HAVING, then the
865 		 * result is at most one row so it's surely unique, for any operators.
866 		 */
867 		if (query->hasAggs || query->havingQual)
868 			return true;
869 	}
870 
871 	/*
872 	 * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
873 	 * except with ALL.
874 	 */
875 	if (query->setOperations)
876 	{
877 		SetOperationStmt *topop = castNode(SetOperationStmt, query->setOperations);
878 
879 		Assert(topop->op != SETOP_NONE);
880 
881 		if (!topop->all)
882 		{
883 			ListCell   *lg;
884 
885 			/* We're good if all the nonjunk output columns are in colnos */
886 			lg = list_head(topop->groupClauses);
887 			foreach(l, query->targetList)
888 			{
889 				TargetEntry *tle = (TargetEntry *) lfirst(l);
890 				SortGroupClause *sgc;
891 
892 				if (tle->resjunk)
893 					continue;	/* ignore resjunk columns */
894 
895 				/* non-resjunk columns should have grouping clauses */
896 				Assert(lg != NULL);
897 				sgc = (SortGroupClause *) lfirst(lg);
898 				lg = lnext(lg);
899 
900 				opid = distinct_col_search(tle->resno, colnos, opids);
901 				if (!OidIsValid(opid) ||
902 					!equality_ops_are_compatible(opid, sgc->eqop))
903 					break;		/* exit early if no match */
904 			}
905 			if (l == NULL)		/* had matches for all? */
906 				return true;
907 		}
908 	}
909 
910 	/*
911 	 * XXX Are there any other cases in which we can easily see the result
912 	 * must be distinct?
913 	 *
914 	 * If you do add more smarts to this function, be sure to update
915 	 * query_supports_distinctness() to match.
916 	 */
917 
918 	return false;
919 }
920 
921 /*
922  * distinct_col_search - subroutine for query_is_distinct_for
923  *
924  * If colno is in colnos, return the corresponding element of opids,
925  * else return InvalidOid.  (Ordinarily colnos would not contain duplicates,
926  * but if it does, we arbitrarily select the first match.)
927  */
928 static Oid
distinct_col_search(int colno,List * colnos,List * opids)929 distinct_col_search(int colno, List *colnos, List *opids)
930 {
931 	ListCell   *lc1,
932 			   *lc2;
933 
934 	forboth(lc1, colnos, lc2, opids)
935 	{
936 		if (colno == lfirst_int(lc1))
937 			return lfirst_oid(lc2);
938 	}
939 	return InvalidOid;
940 }
941 
942 
943 /*
944  * innerrel_is_unique
945  *	  Check if the innerrel provably contains at most one tuple matching any
946  *	  tuple from the outerrel, based on join clauses in the 'restrictlist'.
947  *
948  * We need an actual RelOptInfo for the innerrel, but it's sufficient to
949  * identify the outerrel by its Relids.  This asymmetry supports use of this
950  * function before joinrels have been built.
951  *
952  * The proof must be made based only on clauses that will be "joinquals"
953  * rather than "otherquals" at execution.  For an inner join there's no
954  * difference; but if the join is outer, we must ignore pushed-down quals,
955  * as those will become "otherquals".  Note that this means the answer might
956  * vary depending on whether IS_OUTER_JOIN(jointype); since we cache the
957  * answer without regard to that, callers must take care not to call this
958  * with jointypes that would be classified differently by IS_OUTER_JOIN().
959  *
960  * The actual proof is undertaken by is_innerrel_unique_for(); this function
961  * is a frontend that is mainly concerned with caching the answers.
962  * In particular, the force_cache argument allows overriding the internal
963  * heuristic about whether to cache negative answers; it should be "true"
964  * if making an inquiry that is not part of the normal bottom-up join search
965  * sequence.
966  */
967 bool
innerrel_is_unique(PlannerInfo * root,Relids outerrelids,RelOptInfo * innerrel,JoinType jointype,List * restrictlist,bool force_cache)968 innerrel_is_unique(PlannerInfo *root,
969 				   Relids outerrelids,
970 				   RelOptInfo *innerrel,
971 				   JoinType jointype,
972 				   List *restrictlist,
973 				   bool force_cache)
974 {
975 	MemoryContext old_context;
976 	ListCell   *lc;
977 
978 	/* Certainly can't prove uniqueness when there are no joinclauses */
979 	if (restrictlist == NIL)
980 		return false;
981 
982 	/*
983 	 * Make a quick check to eliminate cases in which we will surely be unable
984 	 * to prove uniqueness of the innerrel.
985 	 */
986 	if (!rel_supports_distinctness(root, innerrel))
987 		return false;
988 
989 	/*
990 	 * Query the cache to see if we've managed to prove that innerrel is
991 	 * unique for any subset of this outerrel.  We don't need an exact match,
992 	 * as extra outerrels can't make the innerrel any less unique (or more
993 	 * formally, the restrictlist for a join to a superset outerrel must be a
994 	 * superset of the conditions we successfully used before).
995 	 */
996 	foreach(lc, innerrel->unique_for_rels)
997 	{
998 		Relids		unique_for_rels = (Relids) lfirst(lc);
999 
1000 		if (bms_is_subset(unique_for_rels, outerrelids))
1001 			return true;		/* Success! */
1002 	}
1003 
1004 	/*
1005 	 * Conversely, we may have already determined that this outerrel, or some
1006 	 * superset thereof, cannot prove this innerrel to be unique.
1007 	 */
1008 	foreach(lc, innerrel->non_unique_for_rels)
1009 	{
1010 		Relids		unique_for_rels = (Relids) lfirst(lc);
1011 
1012 		if (bms_is_subset(outerrelids, unique_for_rels))
1013 			return false;
1014 	}
1015 
1016 	/* No cached information, so try to make the proof. */
1017 	if (is_innerrel_unique_for(root, outerrelids, innerrel,
1018 							   jointype, restrictlist))
1019 	{
1020 		/*
1021 		 * Cache the positive result for future probes, being sure to keep it
1022 		 * in the planner_cxt even if we are working in GEQO.
1023 		 *
1024 		 * Note: one might consider trying to isolate the minimal subset of
1025 		 * the outerrels that proved the innerrel unique.  But it's not worth
1026 		 * the trouble, because the planner builds up joinrels incrementally
1027 		 * and so we'll see the minimally sufficient outerrels before any
1028 		 * supersets of them anyway.
1029 		 */
1030 		old_context = MemoryContextSwitchTo(root->planner_cxt);
1031 		innerrel->unique_for_rels = lappend(innerrel->unique_for_rels,
1032 											bms_copy(outerrelids));
1033 		MemoryContextSwitchTo(old_context);
1034 
1035 		return true;			/* Success! */
1036 	}
1037 	else
1038 	{
1039 		/*
1040 		 * None of the join conditions for outerrel proved innerrel unique, so
1041 		 * we can safely reject this outerrel or any subset of it in future
1042 		 * checks.
1043 		 *
1044 		 * However, in normal planning mode, caching this knowledge is totally
1045 		 * pointless; it won't be queried again, because we build up joinrels
1046 		 * from smaller to larger.  It is useful in GEQO mode, where the
1047 		 * knowledge can be carried across successive planning attempts; and
1048 		 * it's likely to be useful when using join-search plugins, too. Hence
1049 		 * cache when join_search_private is non-NULL.  (Yeah, that's a hack,
1050 		 * but it seems reasonable.)
1051 		 *
1052 		 * Also, allow callers to override that heuristic and force caching;
1053 		 * that's useful for reduce_unique_semijoins, which calls here before
1054 		 * the normal join search starts.
1055 		 */
1056 		if (force_cache || root->join_search_private)
1057 		{
1058 			old_context = MemoryContextSwitchTo(root->planner_cxt);
1059 			innerrel->non_unique_for_rels =
1060 				lappend(innerrel->non_unique_for_rels,
1061 						bms_copy(outerrelids));
1062 			MemoryContextSwitchTo(old_context);
1063 		}
1064 
1065 		return false;
1066 	}
1067 }
1068 
1069 /*
1070  * is_innerrel_unique_for
1071  *	  Check if the innerrel provably contains at most one tuple matching any
1072  *	  tuple from the outerrel, based on join clauses in the 'restrictlist'.
1073  */
1074 static bool
is_innerrel_unique_for(PlannerInfo * root,Relids outerrelids,RelOptInfo * innerrel,JoinType jointype,List * restrictlist)1075 is_innerrel_unique_for(PlannerInfo *root,
1076 					   Relids outerrelids,
1077 					   RelOptInfo *innerrel,
1078 					   JoinType jointype,
1079 					   List *restrictlist)
1080 {
1081 	Relids		joinrelids = bms_union(outerrelids, innerrel->relids);
1082 	List	   *clause_list = NIL;
1083 	ListCell   *lc;
1084 
1085 	/*
1086 	 * Search for mergejoinable clauses that constrain the inner rel against
1087 	 * the outer rel.  If an operator is mergejoinable then it behaves like
1088 	 * equality for some btree opclass, so it's what we want.  The
1089 	 * mergejoinability test also eliminates clauses containing volatile
1090 	 * functions, which we couldn't depend on.
1091 	 */
1092 	foreach(lc, restrictlist)
1093 	{
1094 		RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(lc);
1095 
1096 		/*
1097 		 * As noted above, if it's a pushed-down clause and we're at an outer
1098 		 * join, we can't use it.
1099 		 */
1100 		if (IS_OUTER_JOIN(jointype) &&
1101 			RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids))
1102 			continue;
1103 
1104 		/* Ignore if it's not a mergejoinable clause */
1105 		if (!restrictinfo->can_join ||
1106 			restrictinfo->mergeopfamilies == NIL)
1107 			continue;			/* not mergejoinable */
1108 
1109 		/*
1110 		 * Check if clause has the form "outer op inner" or "inner op outer",
1111 		 * and if so mark which side is inner.
1112 		 */
1113 		if (!clause_sides_match_join(restrictinfo, outerrelids,
1114 									 innerrel->relids))
1115 			continue;			/* no good for these input relations */
1116 
1117 		/* OK, add to list */
1118 		clause_list = lappend(clause_list, restrictinfo);
1119 	}
1120 
1121 	/* Let rel_is_distinct_for() do the hard work */
1122 	return rel_is_distinct_for(root, innerrel, clause_list);
1123 }
1124