1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *	  Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/executor/nodeHashjoin.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/htup_details.h"
19 #include "executor/executor.h"
20 #include "executor/hashjoin.h"
21 #include "executor/nodeHash.h"
22 #include "executor/nodeHashjoin.h"
23 #include "miscadmin.h"
24 #include "utils/memutils.h"
25 
26 
27 /*
28  * States of the ExecHashJoin state machine
29  */
30 #define HJ_BUILD_HASHTABLE		1
31 #define HJ_NEED_NEW_OUTER		2
32 #define HJ_SCAN_BUCKET			3
33 #define HJ_FILL_OUTER_TUPLE		4
34 #define HJ_FILL_INNER_TUPLES	5
35 #define HJ_NEED_NEW_BATCH		6
36 
37 /* Returns true if doing null-fill on outer relation */
38 #define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
39 /* Returns true if doing null-fill on inner relation */
40 #define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
41 
42 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
43 						  HashJoinState *hjstate,
44 						  uint32 *hashvalue);
45 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
46 						  BufFile *file,
47 						  uint32 *hashvalue,
48 						  TupleTableSlot *tupleSlot);
49 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
50 
51 
52 /* ----------------------------------------------------------------
53  *		ExecHashJoin
54  *
55  *		This function implements the Hybrid Hashjoin algorithm.
56  *
57  *		Note: the relation we build hash table on is the "inner"
58  *			  the other one is "outer".
59  * ----------------------------------------------------------------
60  */
61 TupleTableSlot *				/* return: a tuple or NULL */
ExecHashJoin(HashJoinState * node)62 ExecHashJoin(HashJoinState *node)
63 {
64 	PlanState  *outerNode;
65 	HashState  *hashNode;
66 	List	   *joinqual;
67 	List	   *otherqual;
68 	ExprContext *econtext;
69 	ExprDoneCond isDone;
70 	HashJoinTable hashtable;
71 	TupleTableSlot *outerTupleSlot;
72 	uint32		hashvalue;
73 	int			batchno;
74 
75 	/*
76 	 * get information from HashJoin node
77 	 */
78 	joinqual = node->js.joinqual;
79 	otherqual = node->js.ps.qual;
80 	hashNode = (HashState *) innerPlanState(node);
81 	outerNode = outerPlanState(node);
82 	hashtable = node->hj_HashTable;
83 	econtext = node->js.ps.ps_ExprContext;
84 
85 	/*
86 	 * Check to see if we're still projecting out tuples from a previous join
87 	 * tuple (because there is a function-returning-set in the projection
88 	 * expressions).  If so, try to project another one.
89 	 */
90 	if (node->js.ps.ps_TupFromTlist)
91 	{
92 		TupleTableSlot *result;
93 
94 		result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
95 		if (isDone == ExprMultipleResult)
96 			return result;
97 		/* Done with that source tuple... */
98 		node->js.ps.ps_TupFromTlist = false;
99 	}
100 
101 	/*
102 	 * Reset per-tuple memory context to free any expression evaluation
103 	 * storage allocated in the previous tuple cycle.  Note this can't happen
104 	 * until we're done projecting out tuples from a join tuple.
105 	 */
106 	ResetExprContext(econtext);
107 
108 	/*
109 	 * run the hash join state machine
110 	 */
111 	for (;;)
112 	{
113 		switch (node->hj_JoinState)
114 		{
115 			case HJ_BUILD_HASHTABLE:
116 
117 				/*
118 				 * First time through: build hash table for inner relation.
119 				 */
120 				Assert(hashtable == NULL);
121 
122 				/*
123 				 * If the outer relation is completely empty, and it's not
124 				 * right/full join, we can quit without building the hash
125 				 * table.  However, for an inner join it is only a win to
126 				 * check this when the outer relation's startup cost is less
127 				 * than the projected cost of building the hash table.
128 				 * Otherwise it's best to build the hash table first and see
129 				 * if the inner relation is empty.  (When it's a left join, we
130 				 * should always make this check, since we aren't going to be
131 				 * able to skip the join on the strength of an empty inner
132 				 * relation anyway.)
133 				 *
134 				 * If we are rescanning the join, we make use of information
135 				 * gained on the previous scan: don't bother to try the
136 				 * prefetch if the previous scan found the outer relation
137 				 * nonempty. This is not 100% reliable since with new
138 				 * parameters the outer relation might yield different
139 				 * results, but it's a good heuristic.
140 				 *
141 				 * The only way to make the check is to try to fetch a tuple
142 				 * from the outer plan node.  If we succeed, we have to stash
143 				 * it away for later consumption by ExecHashJoinOuterGetTuple.
144 				 */
145 				if (HJ_FILL_INNER(node))
146 				{
147 					/* no chance to not build the hash table */
148 					node->hj_FirstOuterTupleSlot = NULL;
149 				}
150 				else if (HJ_FILL_OUTER(node) ||
151 						 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
152 						  !node->hj_OuterNotEmpty))
153 				{
154 					node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
155 					if (TupIsNull(node->hj_FirstOuterTupleSlot))
156 					{
157 						node->hj_OuterNotEmpty = false;
158 						return NULL;
159 					}
160 					else
161 						node->hj_OuterNotEmpty = true;
162 				}
163 				else
164 					node->hj_FirstOuterTupleSlot = NULL;
165 
166 				/*
167 				 * create the hash table
168 				 */
169 				hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
170 												node->hj_HashOperators,
171 												HJ_FILL_INNER(node));
172 				node->hj_HashTable = hashtable;
173 
174 				/*
175 				 * execute the Hash node, to build the hash table
176 				 */
177 				hashNode->hashtable = hashtable;
178 				(void) MultiExecProcNode((PlanState *) hashNode);
179 
180 				/*
181 				 * If the inner relation is completely empty, and we're not
182 				 * doing a left outer join, we can quit without scanning the
183 				 * outer relation.
184 				 */
185 				if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
186 					return NULL;
187 
188 				/*
189 				 * need to remember whether nbatch has increased since we
190 				 * began scanning the outer relation
191 				 */
192 				hashtable->nbatch_outstart = hashtable->nbatch;
193 
194 				/*
195 				 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
196 				 * tuple above, because ExecHashJoinOuterGetTuple will
197 				 * immediately set it again.)
198 				 */
199 				node->hj_OuterNotEmpty = false;
200 
201 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
202 
203 				/* FALL THRU */
204 
205 			case HJ_NEED_NEW_OUTER:
206 
207 				/*
208 				 * We don't have an outer tuple, try to get the next one
209 				 */
210 				outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
211 														   node,
212 														   &hashvalue);
213 				if (TupIsNull(outerTupleSlot))
214 				{
215 					/* end of batch, or maybe whole join */
216 					if (HJ_FILL_INNER(node))
217 					{
218 						/* set up to scan for unmatched inner tuples */
219 						ExecPrepHashTableForUnmatched(node);
220 						node->hj_JoinState = HJ_FILL_INNER_TUPLES;
221 					}
222 					else
223 						node->hj_JoinState = HJ_NEED_NEW_BATCH;
224 					continue;
225 				}
226 
227 				econtext->ecxt_outertuple = outerTupleSlot;
228 				node->hj_MatchedOuter = false;
229 
230 				/*
231 				 * Find the corresponding bucket for this tuple in the main
232 				 * hash table or skew hash table.
233 				 */
234 				node->hj_CurHashValue = hashvalue;
235 				ExecHashGetBucketAndBatch(hashtable, hashvalue,
236 										  &node->hj_CurBucketNo, &batchno);
237 				node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
238 																 hashvalue);
239 				node->hj_CurTuple = NULL;
240 
241 				/*
242 				 * The tuple might not belong to the current batch (where
243 				 * "current batch" includes the skew buckets if any).
244 				 */
245 				if (batchno != hashtable->curbatch &&
246 					node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
247 				{
248 					/*
249 					 * Need to postpone this outer tuple to a later batch.
250 					 * Save it in the corresponding outer-batch file.
251 					 */
252 					Assert(batchno > hashtable->curbatch);
253 					ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
254 										  hashvalue,
255 										&hashtable->outerBatchFile[batchno]);
256 					/* Loop around, staying in HJ_NEED_NEW_OUTER state */
257 					continue;
258 				}
259 
260 				/* OK, let's scan the bucket for matches */
261 				node->hj_JoinState = HJ_SCAN_BUCKET;
262 
263 				/* FALL THRU */
264 
265 			case HJ_SCAN_BUCKET:
266 
267 				/*
268 				 * We check for interrupts here because this corresponds to
269 				 * where we'd fetch a row from a child plan node in other join
270 				 * types.
271 				 */
272 				CHECK_FOR_INTERRUPTS();
273 
274 				/*
275 				 * Scan the selected hash bucket for matches to current outer
276 				 */
277 				if (!ExecScanHashBucket(node, econtext))
278 				{
279 					/* out of matches; check for possible outer-join fill */
280 					node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
281 					continue;
282 				}
283 
284 				/*
285 				 * We've got a match, but still need to test non-hashed quals.
286 				 * ExecScanHashBucket already set up all the state needed to
287 				 * call ExecQual.
288 				 *
289 				 * If we pass the qual, then save state for next call and have
290 				 * ExecProject form the projection, store it in the tuple
291 				 * table, and return the slot.
292 				 *
293 				 * Only the joinquals determine tuple match status, but all
294 				 * quals must pass to actually return the tuple.
295 				 */
296 				if (joinqual == NIL || ExecQual(joinqual, econtext, false))
297 				{
298 					node->hj_MatchedOuter = true;
299 					HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
300 
301 					/* In an antijoin, we never return a matched tuple */
302 					if (node->js.jointype == JOIN_ANTI)
303 					{
304 						node->hj_JoinState = HJ_NEED_NEW_OUTER;
305 						continue;
306 					}
307 
308 					/*
309 					 * In a semijoin, we'll consider returning the first
310 					 * match, but after that we're done with this outer tuple.
311 					 */
312 					if (node->js.jointype == JOIN_SEMI)
313 						node->hj_JoinState = HJ_NEED_NEW_OUTER;
314 
315 					if (otherqual == NIL ||
316 						ExecQual(otherqual, econtext, false))
317 					{
318 						TupleTableSlot *result;
319 
320 						result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
321 
322 						if (isDone != ExprEndResult)
323 						{
324 							node->js.ps.ps_TupFromTlist =
325 								(isDone == ExprMultipleResult);
326 							return result;
327 						}
328 					}
329 					else
330 						InstrCountFiltered2(node, 1);
331 				}
332 				else
333 					InstrCountFiltered1(node, 1);
334 				break;
335 
336 			case HJ_FILL_OUTER_TUPLE:
337 
338 				/*
339 				 * The current outer tuple has run out of matches, so check
340 				 * whether to emit a dummy outer-join tuple.  Whether we emit
341 				 * one or not, the next state is NEED_NEW_OUTER.
342 				 */
343 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
344 
345 				if (!node->hj_MatchedOuter &&
346 					HJ_FILL_OUTER(node))
347 				{
348 					/*
349 					 * Generate a fake join tuple with nulls for the inner
350 					 * tuple, and return it if it passes the non-join quals.
351 					 */
352 					econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
353 
354 					if (otherqual == NIL ||
355 						ExecQual(otherqual, econtext, false))
356 					{
357 						TupleTableSlot *result;
358 
359 						result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
360 
361 						if (isDone != ExprEndResult)
362 						{
363 							node->js.ps.ps_TupFromTlist =
364 								(isDone == ExprMultipleResult);
365 							return result;
366 						}
367 					}
368 					else
369 						InstrCountFiltered2(node, 1);
370 				}
371 				break;
372 
373 			case HJ_FILL_INNER_TUPLES:
374 
375 				/*
376 				 * We have finished a batch, but we are doing right/full join,
377 				 * so any unmatched inner tuples in the hashtable have to be
378 				 * emitted before we continue to the next batch.
379 				 */
380 				if (!ExecScanHashTableForUnmatched(node, econtext))
381 				{
382 					/* no more unmatched tuples */
383 					node->hj_JoinState = HJ_NEED_NEW_BATCH;
384 					continue;
385 				}
386 
387 				/*
388 				 * Generate a fake join tuple with nulls for the outer tuple,
389 				 * and return it if it passes the non-join quals.
390 				 */
391 				econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
392 
393 				if (otherqual == NIL ||
394 					ExecQual(otherqual, econtext, false))
395 				{
396 					TupleTableSlot *result;
397 
398 					result = ExecProject(node->js.ps.ps_ProjInfo, &isDone);
399 
400 					if (isDone != ExprEndResult)
401 					{
402 						node->js.ps.ps_TupFromTlist =
403 							(isDone == ExprMultipleResult);
404 						return result;
405 					}
406 				}
407 				else
408 					InstrCountFiltered2(node, 1);
409 				break;
410 
411 			case HJ_NEED_NEW_BATCH:
412 
413 				/*
414 				 * Try to advance to next batch.  Done if there are no more.
415 				 */
416 				if (!ExecHashJoinNewBatch(node))
417 					return NULL;	/* end of join */
418 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
419 				break;
420 
421 			default:
422 				elog(ERROR, "unrecognized hashjoin state: %d",
423 					 (int) node->hj_JoinState);
424 		}
425 	}
426 }
427 
428 /* ----------------------------------------------------------------
429  *		ExecInitHashJoin
430  *
431  *		Init routine for HashJoin node.
432  * ----------------------------------------------------------------
433  */
434 HashJoinState *
ExecInitHashJoin(HashJoin * node,EState * estate,int eflags)435 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
436 {
437 	HashJoinState *hjstate;
438 	Plan	   *outerNode;
439 	Hash	   *hashNode;
440 	List	   *lclauses;
441 	List	   *rclauses;
442 	List	   *hoperators;
443 	ListCell   *l;
444 
445 	/* check for unsupported flags */
446 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
447 
448 	/*
449 	 * create state structure
450 	 */
451 	hjstate = makeNode(HashJoinState);
452 	hjstate->js.ps.plan = (Plan *) node;
453 	hjstate->js.ps.state = estate;
454 
455 	/*
456 	 * Miscellaneous initialization
457 	 *
458 	 * create expression context for node
459 	 */
460 	ExecAssignExprContext(estate, &hjstate->js.ps);
461 
462 	/*
463 	 * initialize child expressions
464 	 */
465 	hjstate->js.ps.targetlist = (List *)
466 		ExecInitExpr((Expr *) node->join.plan.targetlist,
467 					 (PlanState *) hjstate);
468 	hjstate->js.ps.qual = (List *)
469 		ExecInitExpr((Expr *) node->join.plan.qual,
470 					 (PlanState *) hjstate);
471 	hjstate->js.jointype = node->join.jointype;
472 	hjstate->js.joinqual = (List *)
473 		ExecInitExpr((Expr *) node->join.joinqual,
474 					 (PlanState *) hjstate);
475 	hjstate->hashclauses = (List *)
476 		ExecInitExpr((Expr *) node->hashclauses,
477 					 (PlanState *) hjstate);
478 
479 	/*
480 	 * initialize child nodes
481 	 *
482 	 * Note: we could suppress the REWIND flag for the inner input, which
483 	 * would amount to betting that the hash will be a single batch.  Not
484 	 * clear if this would be a win or not.
485 	 */
486 	outerNode = outerPlan(node);
487 	hashNode = (Hash *) innerPlan(node);
488 
489 	outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
490 	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
491 
492 	/*
493 	 * tuple table initialization
494 	 */
495 	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
496 	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
497 
498 	/* set up null tuples for outer joins, if needed */
499 	switch (node->join.jointype)
500 	{
501 		case JOIN_INNER:
502 		case JOIN_SEMI:
503 			break;
504 		case JOIN_LEFT:
505 		case JOIN_ANTI:
506 			hjstate->hj_NullInnerTupleSlot =
507 				ExecInitNullTupleSlot(estate,
508 								 ExecGetResultType(innerPlanState(hjstate)));
509 			break;
510 		case JOIN_RIGHT:
511 			hjstate->hj_NullOuterTupleSlot =
512 				ExecInitNullTupleSlot(estate,
513 								 ExecGetResultType(outerPlanState(hjstate)));
514 			break;
515 		case JOIN_FULL:
516 			hjstate->hj_NullOuterTupleSlot =
517 				ExecInitNullTupleSlot(estate,
518 								 ExecGetResultType(outerPlanState(hjstate)));
519 			hjstate->hj_NullInnerTupleSlot =
520 				ExecInitNullTupleSlot(estate,
521 								 ExecGetResultType(innerPlanState(hjstate)));
522 			break;
523 		default:
524 			elog(ERROR, "unrecognized join type: %d",
525 				 (int) node->join.jointype);
526 	}
527 
528 	/*
529 	 * now for some voodoo.  our temporary tuple slot is actually the result
530 	 * tuple slot of the Hash node (which is our inner plan).  we can do this
531 	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
532 	 * the hash join node uses ExecScanHashBucket() to get at the contents of
533 	 * the hash table.  -cim 6/9/91
534 	 */
535 	{
536 		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
537 		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
538 
539 		hjstate->hj_HashTupleSlot = slot;
540 	}
541 
542 	/*
543 	 * initialize tuple type and projection info
544 	 */
545 	ExecAssignResultTypeFromTL(&hjstate->js.ps);
546 	ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
547 
548 	ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
549 						  ExecGetResultType(outerPlanState(hjstate)));
550 
551 	/*
552 	 * initialize hash-specific info
553 	 */
554 	hjstate->hj_HashTable = NULL;
555 	hjstate->hj_FirstOuterTupleSlot = NULL;
556 
557 	hjstate->hj_CurHashValue = 0;
558 	hjstate->hj_CurBucketNo = 0;
559 	hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
560 	hjstate->hj_CurTuple = NULL;
561 
562 	/*
563 	 * Deconstruct the hash clauses into outer and inner argument values, so
564 	 * that we can evaluate those subexpressions separately.  Also make a list
565 	 * of the hash operator OIDs, in preparation for looking up the hash
566 	 * functions to use.
567 	 */
568 	lclauses = NIL;
569 	rclauses = NIL;
570 	hoperators = NIL;
571 	foreach(l, hjstate->hashclauses)
572 	{
573 		FuncExprState *fstate = (FuncExprState *) lfirst(l);
574 		OpExpr	   *hclause;
575 
576 		Assert(IsA(fstate, FuncExprState));
577 		hclause = (OpExpr *) fstate->xprstate.expr;
578 		Assert(IsA(hclause, OpExpr));
579 		lclauses = lappend(lclauses, linitial(fstate->args));
580 		rclauses = lappend(rclauses, lsecond(fstate->args));
581 		hoperators = lappend_oid(hoperators, hclause->opno);
582 	}
583 	hjstate->hj_OuterHashKeys = lclauses;
584 	hjstate->hj_InnerHashKeys = rclauses;
585 	hjstate->hj_HashOperators = hoperators;
586 	/* child Hash node needs to evaluate inner hash keys, too */
587 	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
588 
589 	hjstate->js.ps.ps_TupFromTlist = false;
590 	hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
591 	hjstate->hj_MatchedOuter = false;
592 	hjstate->hj_OuterNotEmpty = false;
593 
594 	return hjstate;
595 }
596 
597 /* ----------------------------------------------------------------
598  *		ExecEndHashJoin
599  *
600  *		clean up routine for HashJoin node
601  * ----------------------------------------------------------------
602  */
603 void
ExecEndHashJoin(HashJoinState * node)604 ExecEndHashJoin(HashJoinState *node)
605 {
606 	/*
607 	 * Free hash table
608 	 */
609 	if (node->hj_HashTable)
610 	{
611 		ExecHashTableDestroy(node->hj_HashTable);
612 		node->hj_HashTable = NULL;
613 	}
614 
615 	/*
616 	 * Free the exprcontext
617 	 */
618 	ExecFreeExprContext(&node->js.ps);
619 
620 	/*
621 	 * clean out the tuple table
622 	 */
623 	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
624 	ExecClearTuple(node->hj_OuterTupleSlot);
625 	ExecClearTuple(node->hj_HashTupleSlot);
626 
627 	/*
628 	 * clean up subtrees
629 	 */
630 	ExecEndNode(outerPlanState(node));
631 	ExecEndNode(innerPlanState(node));
632 }
633 
634 /*
635  * ExecHashJoinOuterGetTuple
636  *
637  *		get the next outer tuple for hashjoin: either by
638  *		executing the outer plan node in the first pass, or from
639  *		the temp files for the hashjoin batches.
640  *
641  * Returns a null slot if no more outer tuples (within the current batch).
642  *
643  * On success, the tuple's hash value is stored at *hashvalue --- this is
644  * either originally computed, or re-read from the temp file.
645  */
646 static TupleTableSlot *
ExecHashJoinOuterGetTuple(PlanState * outerNode,HashJoinState * hjstate,uint32 * hashvalue)647 ExecHashJoinOuterGetTuple(PlanState *outerNode,
648 						  HashJoinState *hjstate,
649 						  uint32 *hashvalue)
650 {
651 	HashJoinTable hashtable = hjstate->hj_HashTable;
652 	int			curbatch = hashtable->curbatch;
653 	TupleTableSlot *slot;
654 
655 	if (curbatch == 0)			/* if it is the first pass */
656 	{
657 		/*
658 		 * Check to see if first outer tuple was already fetched by
659 		 * ExecHashJoin() and not used yet.
660 		 */
661 		slot = hjstate->hj_FirstOuterTupleSlot;
662 		if (!TupIsNull(slot))
663 			hjstate->hj_FirstOuterTupleSlot = NULL;
664 		else
665 			slot = ExecProcNode(outerNode);
666 
667 		while (!TupIsNull(slot))
668 		{
669 			/*
670 			 * We have to compute the tuple's hash value.
671 			 */
672 			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
673 
674 			econtext->ecxt_outertuple = slot;
675 			if (ExecHashGetHashValue(hashtable, econtext,
676 									 hjstate->hj_OuterHashKeys,
677 									 true,		/* outer tuple */
678 									 HJ_FILL_OUTER(hjstate),
679 									 hashvalue))
680 			{
681 				/* remember outer relation is not empty for possible rescan */
682 				hjstate->hj_OuterNotEmpty = true;
683 
684 				return slot;
685 			}
686 
687 			/*
688 			 * That tuple couldn't match because of a NULL, so discard it and
689 			 * continue with the next one.
690 			 */
691 			slot = ExecProcNode(outerNode);
692 		}
693 	}
694 	else if (curbatch < hashtable->nbatch)
695 	{
696 		BufFile    *file = hashtable->outerBatchFile[curbatch];
697 
698 		/*
699 		 * In outer-join cases, we could get here even though the batch file
700 		 * is empty.
701 		 */
702 		if (file == NULL)
703 			return NULL;
704 
705 		slot = ExecHashJoinGetSavedTuple(hjstate,
706 										 file,
707 										 hashvalue,
708 										 hjstate->hj_OuterTupleSlot);
709 		if (!TupIsNull(slot))
710 			return slot;
711 	}
712 
713 	/* End of this batch */
714 	return NULL;
715 }
716 
717 /*
718  * ExecHashJoinNewBatch
719  *		switch to a new hashjoin batch
720  *
721  * Returns true if successful, false if there are no more batches.
722  */
723 static bool
ExecHashJoinNewBatch(HashJoinState * hjstate)724 ExecHashJoinNewBatch(HashJoinState *hjstate)
725 {
726 	HashJoinTable hashtable = hjstate->hj_HashTable;
727 	int			nbatch;
728 	int			curbatch;
729 	BufFile    *innerFile;
730 	TupleTableSlot *slot;
731 	uint32		hashvalue;
732 
733 	nbatch = hashtable->nbatch;
734 	curbatch = hashtable->curbatch;
735 
736 	if (curbatch > 0)
737 	{
738 		/*
739 		 * We no longer need the previous outer batch file; close it right
740 		 * away to free disk space.
741 		 */
742 		if (hashtable->outerBatchFile[curbatch])
743 			BufFileClose(hashtable->outerBatchFile[curbatch]);
744 		hashtable->outerBatchFile[curbatch] = NULL;
745 	}
746 	else	/* we just finished the first batch */
747 	{
748 		/*
749 		 * Reset some of the skew optimization state variables, since we no
750 		 * longer need to consider skew tuples after the first batch. The
751 		 * memory context reset we are about to do will release the skew
752 		 * hashtable itself.
753 		 */
754 		hashtable->skewEnabled = false;
755 		hashtable->skewBucket = NULL;
756 		hashtable->skewBucketNums = NULL;
757 		hashtable->nSkewBuckets = 0;
758 		hashtable->spaceUsedSkew = 0;
759 	}
760 
761 	/*
762 	 * We can always skip over any batches that are completely empty on both
763 	 * sides.  We can sometimes skip over batches that are empty on only one
764 	 * side, but there are exceptions:
765 	 *
766 	 * 1. In a left/full outer join, we have to process outer batches even if
767 	 * the inner batch is empty.  Similarly, in a right/full outer join, we
768 	 * have to process inner batches even if the outer batch is empty.
769 	 *
770 	 * 2. If we have increased nbatch since the initial estimate, we have to
771 	 * scan inner batches since they might contain tuples that need to be
772 	 * reassigned to later inner batches.
773 	 *
774 	 * 3. Similarly, if we have increased nbatch since starting the outer
775 	 * scan, we have to rescan outer batches in case they contain tuples that
776 	 * need to be reassigned.
777 	 */
778 	curbatch++;
779 	while (curbatch < nbatch &&
780 		   (hashtable->outerBatchFile[curbatch] == NULL ||
781 			hashtable->innerBatchFile[curbatch] == NULL))
782 	{
783 		if (hashtable->outerBatchFile[curbatch] &&
784 			HJ_FILL_OUTER(hjstate))
785 			break;				/* must process due to rule 1 */
786 		if (hashtable->innerBatchFile[curbatch] &&
787 			HJ_FILL_INNER(hjstate))
788 			break;				/* must process due to rule 1 */
789 		if (hashtable->innerBatchFile[curbatch] &&
790 			nbatch != hashtable->nbatch_original)
791 			break;				/* must process due to rule 2 */
792 		if (hashtable->outerBatchFile[curbatch] &&
793 			nbatch != hashtable->nbatch_outstart)
794 			break;				/* must process due to rule 3 */
795 		/* We can ignore this batch. */
796 		/* Release associated temp files right away. */
797 		if (hashtable->innerBatchFile[curbatch])
798 			BufFileClose(hashtable->innerBatchFile[curbatch]);
799 		hashtable->innerBatchFile[curbatch] = NULL;
800 		if (hashtable->outerBatchFile[curbatch])
801 			BufFileClose(hashtable->outerBatchFile[curbatch]);
802 		hashtable->outerBatchFile[curbatch] = NULL;
803 		curbatch++;
804 	}
805 
806 	if (curbatch >= nbatch)
807 		return false;			/* no more batches */
808 
809 	hashtable->curbatch = curbatch;
810 
811 	/*
812 	 * Reload the hash table with the new inner batch (which could be empty)
813 	 */
814 	ExecHashTableReset(hashtable);
815 
816 	innerFile = hashtable->innerBatchFile[curbatch];
817 
818 	if (innerFile != NULL)
819 	{
820 		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
821 			ereport(ERROR,
822 					(errcode_for_file_access(),
823 				   errmsg("could not rewind hash-join temporary file")));
824 
825 		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
826 												 innerFile,
827 												 &hashvalue,
828 												 hjstate->hj_HashTupleSlot)))
829 		{
830 			/*
831 			 * NOTE: some tuples may be sent to future batches.  Also, it is
832 			 * possible for hashtable->nbatch to be increased here!
833 			 */
834 			ExecHashTableInsert(hashtable, slot, hashvalue);
835 		}
836 
837 		/*
838 		 * after we build the hash table, the inner batch file is no longer
839 		 * needed
840 		 */
841 		BufFileClose(innerFile);
842 		hashtable->innerBatchFile[curbatch] = NULL;
843 	}
844 
845 	/*
846 	 * Rewind outer batch file (if present), so that we can start reading it.
847 	 */
848 	if (hashtable->outerBatchFile[curbatch] != NULL)
849 	{
850 		if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
851 			ereport(ERROR,
852 					(errcode_for_file_access(),
853 				   errmsg("could not rewind hash-join temporary file")));
854 	}
855 
856 	return true;
857 }
858 
859 /*
860  * ExecHashJoinSaveTuple
861  *		save a tuple to a batch file.
862  *
863  * The data recorded in the file for each tuple is its hash value,
864  * then the tuple in MinimalTuple format.
865  *
866  * Note: it is important always to call this in the regular executor
867  * context, not in a shorter-lived context; else the temp file buffers
868  * will get messed up.
869  */
870 void
ExecHashJoinSaveTuple(MinimalTuple tuple,uint32 hashvalue,BufFile ** fileptr)871 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
872 					  BufFile **fileptr)
873 {
874 	BufFile    *file = *fileptr;
875 
876 	if (file == NULL)
877 	{
878 		/* First write to this batch file, so open it. */
879 		file = BufFileCreateTemp(false);
880 		*fileptr = file;
881 	}
882 
883 	BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
884 	BufFileWrite(file, (void *) tuple, tuple->t_len);
885 }
886 
887 /*
888  * ExecHashJoinGetSavedTuple
889  *		read the next tuple from a batch file.  Return NULL if no more.
890  *
891  * On success, *hashvalue is set to the tuple's hash value, and the tuple
892  * itself is stored in the given slot.
893  */
894 static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState * hjstate,BufFile * file,uint32 * hashvalue,TupleTableSlot * tupleSlot)895 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
896 						  BufFile *file,
897 						  uint32 *hashvalue,
898 						  TupleTableSlot *tupleSlot)
899 {
900 	uint32		header[2];
901 	size_t		nread;
902 	MinimalTuple tuple;
903 
904 	/*
905 	 * We check for interrupts here because this is typically taken as an
906 	 * alternative code path to an ExecProcNode() call, which would include
907 	 * such a check.
908 	 */
909 	CHECK_FOR_INTERRUPTS();
910 
911 	/*
912 	 * Since both the hash value and the MinimalTuple length word are uint32,
913 	 * we can read them both in one BufFileRead() call without any type
914 	 * cheating.
915 	 */
916 	nread = BufFileRead(file, (void *) header, sizeof(header));
917 	if (nread == 0)				/* end of file */
918 	{
919 		ExecClearTuple(tupleSlot);
920 		return NULL;
921 	}
922 	if (nread != sizeof(header))
923 		ereport(ERROR,
924 				(errcode_for_file_access(),
925 				 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
926 						nread, sizeof(header))));
927 	*hashvalue = header[0];
928 	tuple = (MinimalTuple) palloc(header[1]);
929 	tuple->t_len = header[1];
930 	nread = BufFileRead(file,
931 						(void *) ((char *) tuple + sizeof(uint32)),
932 						header[1] - sizeof(uint32));
933 	if (nread != header[1] - sizeof(uint32))
934 		ereport(ERROR,
935 				(errcode_for_file_access(),
936 				 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
937 						nread, header[1] - sizeof(uint32))));
938 	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
939 }
940 
941 
942 void
ExecReScanHashJoin(HashJoinState * node)943 ExecReScanHashJoin(HashJoinState *node)
944 {
945 	/*
946 	 * In a multi-batch join, we currently have to do rescans the hard way,
947 	 * primarily because batch temp files may have already been released. But
948 	 * if it's a single-batch join, and there is no parameter change for the
949 	 * inner subnode, then we can just re-use the existing hash table without
950 	 * rebuilding it.
951 	 */
952 	if (node->hj_HashTable != NULL)
953 	{
954 		if (node->hj_HashTable->nbatch == 1 &&
955 			node->js.ps.righttree->chgParam == NULL)
956 		{
957 			/*
958 			 * Okay to reuse the hash table; needn't rescan inner, either.
959 			 *
960 			 * However, if it's a right/full join, we'd better reset the
961 			 * inner-tuple match flags contained in the table.
962 			 */
963 			if (HJ_FILL_INNER(node))
964 				ExecHashTableResetMatchFlags(node->hj_HashTable);
965 
966 			/*
967 			 * Also, we need to reset our state about the emptiness of the
968 			 * outer relation, so that the new scan of the outer will update
969 			 * it correctly if it turns out to be empty this time. (There's no
970 			 * harm in clearing it now because ExecHashJoin won't need the
971 			 * info.  In the other cases, where the hash table doesn't exist
972 			 * or we are destroying it, we leave this state alone because
973 			 * ExecHashJoin will need it the first time through.)
974 			 */
975 			node->hj_OuterNotEmpty = false;
976 
977 			/* ExecHashJoin can skip the BUILD_HASHTABLE step */
978 			node->hj_JoinState = HJ_NEED_NEW_OUTER;
979 		}
980 		else
981 		{
982 			/* must destroy and rebuild hash table */
983 			HashState  *hashNode = castNode(HashState, innerPlanState(node));
984 
985 			/* for safety, be sure to clear child plan node's pointer too */
986 			Assert(hashNode->hashtable == node->hj_HashTable);
987 			hashNode->hashtable = NULL;
988 
989 			ExecHashTableDestroy(node->hj_HashTable);
990 			node->hj_HashTable = NULL;
991 			node->hj_JoinState = HJ_BUILD_HASHTABLE;
992 
993 			/*
994 			 * if chgParam of subnode is not null then plan will be re-scanned
995 			 * by first ExecProcNode.
996 			 */
997 			if (node->js.ps.righttree->chgParam == NULL)
998 				ExecReScan(node->js.ps.righttree);
999 		}
1000 	}
1001 
1002 	/* Always reset intra-tuple state */
1003 	node->hj_CurHashValue = 0;
1004 	node->hj_CurBucketNo = 0;
1005 	node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
1006 	node->hj_CurTuple = NULL;
1007 
1008 	node->js.ps.ps_TupFromTlist = false;
1009 	node->hj_MatchedOuter = false;
1010 	node->hj_FirstOuterTupleSlot = NULL;
1011 
1012 	/*
1013 	 * if chgParam of subnode is not null then plan will be re-scanned by
1014 	 * first ExecProcNode.
1015 	 */
1016 	if (node->js.ps.lefttree->chgParam == NULL)
1017 		ExecReScan(node->js.ps.lefttree);
1018 }
1019