1 /*-------------------------------------------------------------------------
2  *
3  * nodeHashjoin.c
4  *	  Routines to handle hash join nodes
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *	  src/backend/executor/nodeHashjoin.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 
16 #include "postgres.h"
17 
18 #include "access/htup_details.h"
19 #include "executor/executor.h"
20 #include "executor/hashjoin.h"
21 #include "executor/nodeHash.h"
22 #include "executor/nodeHashjoin.h"
23 #include "miscadmin.h"
24 #include "utils/memutils.h"
25 
26 
27 /*
28  * States of the ExecHashJoin state machine
29  */
30 #define HJ_BUILD_HASHTABLE		1
31 #define HJ_NEED_NEW_OUTER		2
32 #define HJ_SCAN_BUCKET			3
33 #define HJ_FILL_OUTER_TUPLE		4
34 #define HJ_FILL_INNER_TUPLES	5
35 #define HJ_NEED_NEW_BATCH		6
36 
37 /* Returns true if doing null-fill on outer relation */
38 #define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
39 /* Returns true if doing null-fill on inner relation */
40 #define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
41 
42 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
43 						  HashJoinState *hjstate,
44 						  uint32 *hashvalue);
45 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
46 						  BufFile *file,
47 						  uint32 *hashvalue,
48 						  TupleTableSlot *tupleSlot);
49 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
50 
51 
52 /* ----------------------------------------------------------------
53  *		ExecHashJoin
54  *
55  *		This function implements the Hybrid Hashjoin algorithm.
56  *
57  *		Note: the relation we build hash table on is the "inner"
58  *			  the other one is "outer".
59  * ----------------------------------------------------------------
60  */
61 static TupleTableSlot *			/* return: a tuple or NULL */
ExecHashJoin(PlanState * pstate)62 ExecHashJoin(PlanState *pstate)
63 {
64 	HashJoinState *node = castNode(HashJoinState, pstate);
65 	PlanState  *outerNode;
66 	HashState  *hashNode;
67 	ExprState  *joinqual;
68 	ExprState  *otherqual;
69 	ExprContext *econtext;
70 	HashJoinTable hashtable;
71 	TupleTableSlot *outerTupleSlot;
72 	uint32		hashvalue;
73 	int			batchno;
74 
75 	/*
76 	 * get information from HashJoin node
77 	 */
78 	joinqual = node->js.joinqual;
79 	otherqual = node->js.ps.qual;
80 	hashNode = (HashState *) innerPlanState(node);
81 	outerNode = outerPlanState(node);
82 	hashtable = node->hj_HashTable;
83 	econtext = node->js.ps.ps_ExprContext;
84 
85 	/*
86 	 * Reset per-tuple memory context to free any expression evaluation
87 	 * storage allocated in the previous tuple cycle.
88 	 */
89 	ResetExprContext(econtext);
90 
91 	/*
92 	 * run the hash join state machine
93 	 */
94 	for (;;)
95 	{
96 		/*
97 		 * It's possible to iterate this loop many times before returning a
98 		 * tuple, in some pathological cases such as needing to move much of
99 		 * the current batch to a later batch.  So let's check for interrupts
100 		 * each time through.
101 		 */
102 		CHECK_FOR_INTERRUPTS();
103 
104 		switch (node->hj_JoinState)
105 		{
106 			case HJ_BUILD_HASHTABLE:
107 
108 				/*
109 				 * First time through: build hash table for inner relation.
110 				 */
111 				Assert(hashtable == NULL);
112 
113 				/*
114 				 * If the outer relation is completely empty, and it's not
115 				 * right/full join, we can quit without building the hash
116 				 * table.  However, for an inner join it is only a win to
117 				 * check this when the outer relation's startup cost is less
118 				 * than the projected cost of building the hash table.
119 				 * Otherwise it's best to build the hash table first and see
120 				 * if the inner relation is empty.  (When it's a left join, we
121 				 * should always make this check, since we aren't going to be
122 				 * able to skip the join on the strength of an empty inner
123 				 * relation anyway.)
124 				 *
125 				 * If we are rescanning the join, we make use of information
126 				 * gained on the previous scan: don't bother to try the
127 				 * prefetch if the previous scan found the outer relation
128 				 * nonempty. This is not 100% reliable since with new
129 				 * parameters the outer relation might yield different
130 				 * results, but it's a good heuristic.
131 				 *
132 				 * The only way to make the check is to try to fetch a tuple
133 				 * from the outer plan node.  If we succeed, we have to stash
134 				 * it away for later consumption by ExecHashJoinOuterGetTuple.
135 				 */
136 				if (HJ_FILL_INNER(node))
137 				{
138 					/* no chance to not build the hash table */
139 					node->hj_FirstOuterTupleSlot = NULL;
140 				}
141 				else if (HJ_FILL_OUTER(node) ||
142 						 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
143 						  !node->hj_OuterNotEmpty))
144 				{
145 					node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
146 					if (TupIsNull(node->hj_FirstOuterTupleSlot))
147 					{
148 						node->hj_OuterNotEmpty = false;
149 						return NULL;
150 					}
151 					else
152 						node->hj_OuterNotEmpty = true;
153 				}
154 				else
155 					node->hj_FirstOuterTupleSlot = NULL;
156 
157 				/*
158 				 * create the hash table
159 				 */
160 				hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
161 												node->hj_HashOperators,
162 												HJ_FILL_INNER(node));
163 				node->hj_HashTable = hashtable;
164 
165 				/*
166 				 * execute the Hash node, to build the hash table
167 				 */
168 				hashNode->hashtable = hashtable;
169 				(void) MultiExecProcNode((PlanState *) hashNode);
170 
171 				/*
172 				 * If the inner relation is completely empty, and we're not
173 				 * doing a left outer join, we can quit without scanning the
174 				 * outer relation.
175 				 */
176 				if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
177 					return NULL;
178 
179 				/*
180 				 * need to remember whether nbatch has increased since we
181 				 * began scanning the outer relation
182 				 */
183 				hashtable->nbatch_outstart = hashtable->nbatch;
184 
185 				/*
186 				 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
187 				 * tuple above, because ExecHashJoinOuterGetTuple will
188 				 * immediately set it again.)
189 				 */
190 				node->hj_OuterNotEmpty = false;
191 
192 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
193 
194 				/* FALL THRU */
195 
196 			case HJ_NEED_NEW_OUTER:
197 
198 				/*
199 				 * We don't have an outer tuple, try to get the next one
200 				 */
201 				outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
202 														   node,
203 														   &hashvalue);
204 				if (TupIsNull(outerTupleSlot))
205 				{
206 					/* end of batch, or maybe whole join */
207 					if (HJ_FILL_INNER(node))
208 					{
209 						/* set up to scan for unmatched inner tuples */
210 						ExecPrepHashTableForUnmatched(node);
211 						node->hj_JoinState = HJ_FILL_INNER_TUPLES;
212 					}
213 					else
214 						node->hj_JoinState = HJ_NEED_NEW_BATCH;
215 					continue;
216 				}
217 
218 				econtext->ecxt_outertuple = outerTupleSlot;
219 				node->hj_MatchedOuter = false;
220 
221 				/*
222 				 * Find the corresponding bucket for this tuple in the main
223 				 * hash table or skew hash table.
224 				 */
225 				node->hj_CurHashValue = hashvalue;
226 				ExecHashGetBucketAndBatch(hashtable, hashvalue,
227 										  &node->hj_CurBucketNo, &batchno);
228 				node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
229 																 hashvalue);
230 				node->hj_CurTuple = NULL;
231 
232 				/*
233 				 * The tuple might not belong to the current batch (where
234 				 * "current batch" includes the skew buckets if any).
235 				 */
236 				if (batchno != hashtable->curbatch &&
237 					node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
238 				{
239 					/*
240 					 * Need to postpone this outer tuple to a later batch.
241 					 * Save it in the corresponding outer-batch file.
242 					 */
243 					Assert(batchno > hashtable->curbatch);
244 					ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
245 										  hashvalue,
246 										  &hashtable->outerBatchFile[batchno]);
247 					/* Loop around, staying in HJ_NEED_NEW_OUTER state */
248 					continue;
249 				}
250 
251 				/* OK, let's scan the bucket for matches */
252 				node->hj_JoinState = HJ_SCAN_BUCKET;
253 
254 				/* FALL THRU */
255 
256 			case HJ_SCAN_BUCKET:
257 
258 				/*
259 				 * Scan the selected hash bucket for matches to current outer
260 				 */
261 				if (!ExecScanHashBucket(node, econtext))
262 				{
263 					/* out of matches; check for possible outer-join fill */
264 					node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
265 					continue;
266 				}
267 
268 				/*
269 				 * We've got a match, but still need to test non-hashed quals.
270 				 * ExecScanHashBucket already set up all the state needed to
271 				 * call ExecQual.
272 				 *
273 				 * If we pass the qual, then save state for next call and have
274 				 * ExecProject form the projection, store it in the tuple
275 				 * table, and return the slot.
276 				 *
277 				 * Only the joinquals determine tuple match status, but all
278 				 * quals must pass to actually return the tuple.
279 				 */
280 				if (joinqual == NULL || ExecQual(joinqual, econtext))
281 				{
282 					node->hj_MatchedOuter = true;
283 					HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
284 
285 					/* In an antijoin, we never return a matched tuple */
286 					if (node->js.jointype == JOIN_ANTI)
287 					{
288 						node->hj_JoinState = HJ_NEED_NEW_OUTER;
289 						continue;
290 					}
291 
292 					/*
293 					 * If we only need to join to the first matching inner
294 					 * tuple, then consider returning this one, but after that
295 					 * continue with next outer tuple.
296 					 */
297 					if (node->js.single_match)
298 						node->hj_JoinState = HJ_NEED_NEW_OUTER;
299 
300 					if (otherqual == NULL || ExecQual(otherqual, econtext))
301 						return ExecProject(node->js.ps.ps_ProjInfo);
302 					else
303 						InstrCountFiltered2(node, 1);
304 				}
305 				else
306 					InstrCountFiltered1(node, 1);
307 				break;
308 
309 			case HJ_FILL_OUTER_TUPLE:
310 
311 				/*
312 				 * The current outer tuple has run out of matches, so check
313 				 * whether to emit a dummy outer-join tuple.  Whether we emit
314 				 * one or not, the next state is NEED_NEW_OUTER.
315 				 */
316 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
317 
318 				if (!node->hj_MatchedOuter &&
319 					HJ_FILL_OUTER(node))
320 				{
321 					/*
322 					 * Generate a fake join tuple with nulls for the inner
323 					 * tuple, and return it if it passes the non-join quals.
324 					 */
325 					econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
326 
327 					if (otherqual == NULL || ExecQual(otherqual, econtext))
328 						return ExecProject(node->js.ps.ps_ProjInfo);
329 					else
330 						InstrCountFiltered2(node, 1);
331 				}
332 				break;
333 
334 			case HJ_FILL_INNER_TUPLES:
335 
336 				/*
337 				 * We have finished a batch, but we are doing right/full join,
338 				 * so any unmatched inner tuples in the hashtable have to be
339 				 * emitted before we continue to the next batch.
340 				 */
341 				if (!ExecScanHashTableForUnmatched(node, econtext))
342 				{
343 					/* no more unmatched tuples */
344 					node->hj_JoinState = HJ_NEED_NEW_BATCH;
345 					continue;
346 				}
347 
348 				/*
349 				 * Generate a fake join tuple with nulls for the outer tuple,
350 				 * and return it if it passes the non-join quals.
351 				 */
352 				econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
353 
354 				if (otherqual == NULL || ExecQual(otherqual, econtext))
355 					return ExecProject(node->js.ps.ps_ProjInfo);
356 				else
357 					InstrCountFiltered2(node, 1);
358 				break;
359 
360 			case HJ_NEED_NEW_BATCH:
361 
362 				/*
363 				 * Try to advance to next batch.  Done if there are no more.
364 				 */
365 				if (!ExecHashJoinNewBatch(node))
366 					return NULL;	/* end of join */
367 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
368 				break;
369 
370 			default:
371 				elog(ERROR, "unrecognized hashjoin state: %d",
372 					 (int) node->hj_JoinState);
373 		}
374 	}
375 }
376 
377 /* ----------------------------------------------------------------
378  *		ExecInitHashJoin
379  *
380  *		Init routine for HashJoin node.
381  * ----------------------------------------------------------------
382  */
383 HashJoinState *
ExecInitHashJoin(HashJoin * node,EState * estate,int eflags)384 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
385 {
386 	HashJoinState *hjstate;
387 	Plan	   *outerNode;
388 	Hash	   *hashNode;
389 	List	   *lclauses;
390 	List	   *rclauses;
391 	List	   *hoperators;
392 	ListCell   *l;
393 
394 	/* check for unsupported flags */
395 	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
396 
397 	/*
398 	 * create state structure
399 	 */
400 	hjstate = makeNode(HashJoinState);
401 	hjstate->js.ps.plan = (Plan *) node;
402 	hjstate->js.ps.state = estate;
403 	hjstate->js.ps.ExecProcNode = ExecHashJoin;
404 
405 	/*
406 	 * Miscellaneous initialization
407 	 *
408 	 * create expression context for node
409 	 */
410 	ExecAssignExprContext(estate, &hjstate->js.ps);
411 
412 	/*
413 	 * initialize child expressions
414 	 */
415 	hjstate->js.ps.qual =
416 		ExecInitQual(node->join.plan.qual, (PlanState *) hjstate);
417 	hjstate->js.jointype = node->join.jointype;
418 	hjstate->js.joinqual =
419 		ExecInitQual(node->join.joinqual, (PlanState *) hjstate);
420 	hjstate->hashclauses =
421 		ExecInitQual(node->hashclauses, (PlanState *) hjstate);
422 
423 	/*
424 	 * initialize child nodes
425 	 *
426 	 * Note: we could suppress the REWIND flag for the inner input, which
427 	 * would amount to betting that the hash will be a single batch.  Not
428 	 * clear if this would be a win or not.
429 	 */
430 	outerNode = outerPlan(node);
431 	hashNode = (Hash *) innerPlan(node);
432 
433 	outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
434 	innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
435 
436 	/*
437 	 * tuple table initialization
438 	 */
439 	ExecInitResultTupleSlot(estate, &hjstate->js.ps);
440 	hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
441 
442 	/*
443 	 * detect whether we need only consider the first matching inner tuple
444 	 */
445 	hjstate->js.single_match = (node->join.inner_unique ||
446 								node->join.jointype == JOIN_SEMI);
447 
448 	/* set up null tuples for outer joins, if needed */
449 	switch (node->join.jointype)
450 	{
451 		case JOIN_INNER:
452 		case JOIN_SEMI:
453 			break;
454 		case JOIN_LEFT:
455 		case JOIN_ANTI:
456 			hjstate->hj_NullInnerTupleSlot =
457 				ExecInitNullTupleSlot(estate,
458 									  ExecGetResultType(innerPlanState(hjstate)));
459 			break;
460 		case JOIN_RIGHT:
461 			hjstate->hj_NullOuterTupleSlot =
462 				ExecInitNullTupleSlot(estate,
463 									  ExecGetResultType(outerPlanState(hjstate)));
464 			break;
465 		case JOIN_FULL:
466 			hjstate->hj_NullOuterTupleSlot =
467 				ExecInitNullTupleSlot(estate,
468 									  ExecGetResultType(outerPlanState(hjstate)));
469 			hjstate->hj_NullInnerTupleSlot =
470 				ExecInitNullTupleSlot(estate,
471 									  ExecGetResultType(innerPlanState(hjstate)));
472 			break;
473 		default:
474 			elog(ERROR, "unrecognized join type: %d",
475 				 (int) node->join.jointype);
476 	}
477 
478 	/*
479 	 * now for some voodoo.  our temporary tuple slot is actually the result
480 	 * tuple slot of the Hash node (which is our inner plan).  we can do this
481 	 * because Hash nodes don't return tuples via ExecProcNode() -- instead
482 	 * the hash join node uses ExecScanHashBucket() to get at the contents of
483 	 * the hash table.  -cim 6/9/91
484 	 */
485 	{
486 		HashState  *hashstate = (HashState *) innerPlanState(hjstate);
487 		TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
488 
489 		hjstate->hj_HashTupleSlot = slot;
490 	}
491 
492 	/*
493 	 * initialize tuple type and projection info
494 	 */
495 	ExecAssignResultTypeFromTL(&hjstate->js.ps);
496 	ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
497 
498 	ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
499 						  ExecGetResultType(outerPlanState(hjstate)));
500 
501 	/*
502 	 * initialize hash-specific info
503 	 */
504 	hjstate->hj_HashTable = NULL;
505 	hjstate->hj_FirstOuterTupleSlot = NULL;
506 
507 	hjstate->hj_CurHashValue = 0;
508 	hjstate->hj_CurBucketNo = 0;
509 	hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
510 	hjstate->hj_CurTuple = NULL;
511 
512 	/*
513 	 * Deconstruct the hash clauses into outer and inner argument values, so
514 	 * that we can evaluate those subexpressions separately.  Also make a list
515 	 * of the hash operator OIDs, in preparation for looking up the hash
516 	 * functions to use.
517 	 */
518 	lclauses = NIL;
519 	rclauses = NIL;
520 	hoperators = NIL;
521 	foreach(l, node->hashclauses)
522 	{
523 		OpExpr	   *hclause = lfirst_node(OpExpr, l);
524 
525 		lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args),
526 												  (PlanState *) hjstate));
527 		rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args),
528 												  (PlanState *) hjstate));
529 		hoperators = lappend_oid(hoperators, hclause->opno);
530 	}
531 	hjstate->hj_OuterHashKeys = lclauses;
532 	hjstate->hj_InnerHashKeys = rclauses;
533 	hjstate->hj_HashOperators = hoperators;
534 	/* child Hash node needs to evaluate inner hash keys, too */
535 	((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
536 
537 	hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
538 	hjstate->hj_MatchedOuter = false;
539 	hjstate->hj_OuterNotEmpty = false;
540 
541 	return hjstate;
542 }
543 
544 /* ----------------------------------------------------------------
545  *		ExecEndHashJoin
546  *
547  *		clean up routine for HashJoin node
548  * ----------------------------------------------------------------
549  */
550 void
ExecEndHashJoin(HashJoinState * node)551 ExecEndHashJoin(HashJoinState *node)
552 {
553 	/*
554 	 * Free hash table
555 	 */
556 	if (node->hj_HashTable)
557 	{
558 		ExecHashTableDestroy(node->hj_HashTable);
559 		node->hj_HashTable = NULL;
560 	}
561 
562 	/*
563 	 * Free the exprcontext
564 	 */
565 	ExecFreeExprContext(&node->js.ps);
566 
567 	/*
568 	 * clean out the tuple table
569 	 */
570 	ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
571 	ExecClearTuple(node->hj_OuterTupleSlot);
572 	ExecClearTuple(node->hj_HashTupleSlot);
573 
574 	/*
575 	 * clean up subtrees
576 	 */
577 	ExecEndNode(outerPlanState(node));
578 	ExecEndNode(innerPlanState(node));
579 }
580 
581 /*
582  * ExecHashJoinOuterGetTuple
583  *
584  *		get the next outer tuple for hashjoin: either by
585  *		executing the outer plan node in the first pass, or from
586  *		the temp files for the hashjoin batches.
587  *
588  * Returns a null slot if no more outer tuples (within the current batch).
589  *
590  * On success, the tuple's hash value is stored at *hashvalue --- this is
591  * either originally computed, or re-read from the temp file.
592  */
593 static TupleTableSlot *
ExecHashJoinOuterGetTuple(PlanState * outerNode,HashJoinState * hjstate,uint32 * hashvalue)594 ExecHashJoinOuterGetTuple(PlanState *outerNode,
595 						  HashJoinState *hjstate,
596 						  uint32 *hashvalue)
597 {
598 	HashJoinTable hashtable = hjstate->hj_HashTable;
599 	int			curbatch = hashtable->curbatch;
600 	TupleTableSlot *slot;
601 
602 	if (curbatch == 0)			/* if it is the first pass */
603 	{
604 		/*
605 		 * Check to see if first outer tuple was already fetched by
606 		 * ExecHashJoin() and not used yet.
607 		 */
608 		slot = hjstate->hj_FirstOuterTupleSlot;
609 		if (!TupIsNull(slot))
610 			hjstate->hj_FirstOuterTupleSlot = NULL;
611 		else
612 			slot = ExecProcNode(outerNode);
613 
614 		while (!TupIsNull(slot))
615 		{
616 			/*
617 			 * We have to compute the tuple's hash value.
618 			 */
619 			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
620 
621 			econtext->ecxt_outertuple = slot;
622 			if (ExecHashGetHashValue(hashtable, econtext,
623 									 hjstate->hj_OuterHashKeys,
624 									 true,	/* outer tuple */
625 									 HJ_FILL_OUTER(hjstate),
626 									 hashvalue))
627 			{
628 				/* remember outer relation is not empty for possible rescan */
629 				hjstate->hj_OuterNotEmpty = true;
630 
631 				return slot;
632 			}
633 
634 			/*
635 			 * That tuple couldn't match because of a NULL, so discard it and
636 			 * continue with the next one.
637 			 */
638 			slot = ExecProcNode(outerNode);
639 		}
640 	}
641 	else if (curbatch < hashtable->nbatch)
642 	{
643 		BufFile    *file = hashtable->outerBatchFile[curbatch];
644 
645 		/*
646 		 * In outer-join cases, we could get here even though the batch file
647 		 * is empty.
648 		 */
649 		if (file == NULL)
650 			return NULL;
651 
652 		slot = ExecHashJoinGetSavedTuple(hjstate,
653 										 file,
654 										 hashvalue,
655 										 hjstate->hj_OuterTupleSlot);
656 		if (!TupIsNull(slot))
657 			return slot;
658 	}
659 
660 	/* End of this batch */
661 	return NULL;
662 }
663 
664 /*
665  * ExecHashJoinNewBatch
666  *		switch to a new hashjoin batch
667  *
668  * Returns true if successful, false if there are no more batches.
669  */
670 static bool
ExecHashJoinNewBatch(HashJoinState * hjstate)671 ExecHashJoinNewBatch(HashJoinState *hjstate)
672 {
673 	HashJoinTable hashtable = hjstate->hj_HashTable;
674 	int			nbatch;
675 	int			curbatch;
676 	BufFile    *innerFile;
677 	TupleTableSlot *slot;
678 	uint32		hashvalue;
679 
680 	nbatch = hashtable->nbatch;
681 	curbatch = hashtable->curbatch;
682 
683 	if (curbatch > 0)
684 	{
685 		/*
686 		 * We no longer need the previous outer batch file; close it right
687 		 * away to free disk space.
688 		 */
689 		if (hashtable->outerBatchFile[curbatch])
690 			BufFileClose(hashtable->outerBatchFile[curbatch]);
691 		hashtable->outerBatchFile[curbatch] = NULL;
692 	}
693 	else						/* we just finished the first batch */
694 	{
695 		/*
696 		 * Reset some of the skew optimization state variables, since we no
697 		 * longer need to consider skew tuples after the first batch. The
698 		 * memory context reset we are about to do will release the skew
699 		 * hashtable itself.
700 		 */
701 		hashtable->skewEnabled = false;
702 		hashtable->skewBucket = NULL;
703 		hashtable->skewBucketNums = NULL;
704 		hashtable->nSkewBuckets = 0;
705 		hashtable->spaceUsedSkew = 0;
706 	}
707 
708 	/*
709 	 * We can always skip over any batches that are completely empty on both
710 	 * sides.  We can sometimes skip over batches that are empty on only one
711 	 * side, but there are exceptions:
712 	 *
713 	 * 1. In a left/full outer join, we have to process outer batches even if
714 	 * the inner batch is empty.  Similarly, in a right/full outer join, we
715 	 * have to process inner batches even if the outer batch is empty.
716 	 *
717 	 * 2. If we have increased nbatch since the initial estimate, we have to
718 	 * scan inner batches since they might contain tuples that need to be
719 	 * reassigned to later inner batches.
720 	 *
721 	 * 3. Similarly, if we have increased nbatch since starting the outer
722 	 * scan, we have to rescan outer batches in case they contain tuples that
723 	 * need to be reassigned.
724 	 */
725 	curbatch++;
726 	while (curbatch < nbatch &&
727 		   (hashtable->outerBatchFile[curbatch] == NULL ||
728 			hashtable->innerBatchFile[curbatch] == NULL))
729 	{
730 		if (hashtable->outerBatchFile[curbatch] &&
731 			HJ_FILL_OUTER(hjstate))
732 			break;				/* must process due to rule 1 */
733 		if (hashtable->innerBatchFile[curbatch] &&
734 			HJ_FILL_INNER(hjstate))
735 			break;				/* must process due to rule 1 */
736 		if (hashtable->innerBatchFile[curbatch] &&
737 			nbatch != hashtable->nbatch_original)
738 			break;				/* must process due to rule 2 */
739 		if (hashtable->outerBatchFile[curbatch] &&
740 			nbatch != hashtable->nbatch_outstart)
741 			break;				/* must process due to rule 3 */
742 		/* We can ignore this batch. */
743 		/* Release associated temp files right away. */
744 		if (hashtable->innerBatchFile[curbatch])
745 			BufFileClose(hashtable->innerBatchFile[curbatch]);
746 		hashtable->innerBatchFile[curbatch] = NULL;
747 		if (hashtable->outerBatchFile[curbatch])
748 			BufFileClose(hashtable->outerBatchFile[curbatch]);
749 		hashtable->outerBatchFile[curbatch] = NULL;
750 		curbatch++;
751 	}
752 
753 	if (curbatch >= nbatch)
754 		return false;			/* no more batches */
755 
756 	hashtable->curbatch = curbatch;
757 
758 	/*
759 	 * Reload the hash table with the new inner batch (which could be empty)
760 	 */
761 	ExecHashTableReset(hashtable);
762 
763 	innerFile = hashtable->innerBatchFile[curbatch];
764 
765 	if (innerFile != NULL)
766 	{
767 		if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
768 			ereport(ERROR,
769 					(errcode_for_file_access(),
770 					 errmsg("could not rewind hash-join temporary file")));
771 
772 		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
773 												 innerFile,
774 												 &hashvalue,
775 												 hjstate->hj_HashTupleSlot)))
776 		{
777 			/*
778 			 * NOTE: some tuples may be sent to future batches.  Also, it is
779 			 * possible for hashtable->nbatch to be increased here!
780 			 */
781 			ExecHashTableInsert(hashtable, slot, hashvalue);
782 		}
783 
784 		/*
785 		 * after we build the hash table, the inner batch file is no longer
786 		 * needed
787 		 */
788 		BufFileClose(innerFile);
789 		hashtable->innerBatchFile[curbatch] = NULL;
790 	}
791 
792 	/*
793 	 * Rewind outer batch file (if present), so that we can start reading it.
794 	 */
795 	if (hashtable->outerBatchFile[curbatch] != NULL)
796 	{
797 		if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
798 			ereport(ERROR,
799 					(errcode_for_file_access(),
800 					 errmsg("could not rewind hash-join temporary file")));
801 	}
802 
803 	return true;
804 }
805 
806 /*
807  * ExecHashJoinSaveTuple
808  *		save a tuple to a batch file.
809  *
810  * The data recorded in the file for each tuple is its hash value,
811  * then the tuple in MinimalTuple format.
812  *
813  * Note: it is important always to call this in the regular executor
814  * context, not in a shorter-lived context; else the temp file buffers
815  * will get messed up.
816  */
817 void
ExecHashJoinSaveTuple(MinimalTuple tuple,uint32 hashvalue,BufFile ** fileptr)818 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
819 					  BufFile **fileptr)
820 {
821 	BufFile    *file = *fileptr;
822 
823 	if (file == NULL)
824 	{
825 		/* First write to this batch file, so open it. */
826 		file = BufFileCreateTemp(false);
827 		*fileptr = file;
828 	}
829 
830 	BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
831 	BufFileWrite(file, (void *) tuple, tuple->t_len);
832 }
833 
834 /*
835  * ExecHashJoinGetSavedTuple
836  *		read the next tuple from a batch file.  Return NULL if no more.
837  *
838  * On success, *hashvalue is set to the tuple's hash value, and the tuple
839  * itself is stored in the given slot.
840  */
841 static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState * hjstate,BufFile * file,uint32 * hashvalue,TupleTableSlot * tupleSlot)842 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
843 						  BufFile *file,
844 						  uint32 *hashvalue,
845 						  TupleTableSlot *tupleSlot)
846 {
847 	uint32		header[2];
848 	size_t		nread;
849 	MinimalTuple tuple;
850 
851 	/*
852 	 * We check for interrupts here because this is typically taken as an
853 	 * alternative code path to an ExecProcNode() call, which would include
854 	 * such a check.
855 	 */
856 	CHECK_FOR_INTERRUPTS();
857 
858 	/*
859 	 * Since both the hash value and the MinimalTuple length word are uint32,
860 	 * we can read them both in one BufFileRead() call without any type
861 	 * cheating.
862 	 */
863 	nread = BufFileRead(file, (void *) header, sizeof(header));
864 	if (nread == 0)				/* end of file */
865 	{
866 		ExecClearTuple(tupleSlot);
867 		return NULL;
868 	}
869 	if (nread != sizeof(header))
870 		ereport(ERROR,
871 				(errcode_for_file_access(),
872 				 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
873 						nread, sizeof(header))));
874 	*hashvalue = header[0];
875 	tuple = (MinimalTuple) palloc(header[1]);
876 	tuple->t_len = header[1];
877 	nread = BufFileRead(file,
878 						(void *) ((char *) tuple + sizeof(uint32)),
879 						header[1] - sizeof(uint32));
880 	if (nread != header[1] - sizeof(uint32))
881 		ereport(ERROR,
882 				(errcode_for_file_access(),
883 				 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
884 						nread, header[1] - sizeof(uint32))));
885 	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
886 }
887 
888 
889 void
ExecReScanHashJoin(HashJoinState * node)890 ExecReScanHashJoin(HashJoinState *node)
891 {
892 	/*
893 	 * In a multi-batch join, we currently have to do rescans the hard way,
894 	 * primarily because batch temp files may have already been released. But
895 	 * if it's a single-batch join, and there is no parameter change for the
896 	 * inner subnode, then we can just re-use the existing hash table without
897 	 * rebuilding it.
898 	 */
899 	if (node->hj_HashTable != NULL)
900 	{
901 		if (node->hj_HashTable->nbatch == 1 &&
902 			node->js.ps.righttree->chgParam == NULL)
903 		{
904 			/*
905 			 * Okay to reuse the hash table; needn't rescan inner, either.
906 			 *
907 			 * However, if it's a right/full join, we'd better reset the
908 			 * inner-tuple match flags contained in the table.
909 			 */
910 			if (HJ_FILL_INNER(node))
911 				ExecHashTableResetMatchFlags(node->hj_HashTable);
912 
913 			/*
914 			 * Also, we need to reset our state about the emptiness of the
915 			 * outer relation, so that the new scan of the outer will update
916 			 * it correctly if it turns out to be empty this time. (There's no
917 			 * harm in clearing it now because ExecHashJoin won't need the
918 			 * info.  In the other cases, where the hash table doesn't exist
919 			 * or we are destroying it, we leave this state alone because
920 			 * ExecHashJoin will need it the first time through.)
921 			 */
922 			node->hj_OuterNotEmpty = false;
923 
924 			/* ExecHashJoin can skip the BUILD_HASHTABLE step */
925 			node->hj_JoinState = HJ_NEED_NEW_OUTER;
926 		}
927 		else
928 		{
929 			/* must destroy and rebuild hash table */
930 			HashState  *hashNode = castNode(HashState, innerPlanState(node));
931 
932 			/* for safety, be sure to clear child plan node's pointer too */
933 			Assert(hashNode->hashtable == node->hj_HashTable);
934 			hashNode->hashtable = NULL;
935 
936 			ExecHashTableDestroy(node->hj_HashTable);
937 			node->hj_HashTable = NULL;
938 			node->hj_JoinState = HJ_BUILD_HASHTABLE;
939 
940 			/*
941 			 * if chgParam of subnode is not null then plan will be re-scanned
942 			 * by first ExecProcNode.
943 			 */
944 			if (node->js.ps.righttree->chgParam == NULL)
945 				ExecReScan(node->js.ps.righttree);
946 		}
947 	}
948 
949 	/* Always reset intra-tuple state */
950 	node->hj_CurHashValue = 0;
951 	node->hj_CurBucketNo = 0;
952 	node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
953 	node->hj_CurTuple = NULL;
954 
955 	node->hj_MatchedOuter = false;
956 	node->hj_FirstOuterTupleSlot = NULL;
957 
958 	/*
959 	 * if chgParam of subnode is not null then plan will be re-scanned by
960 	 * first ExecProcNode.
961 	 */
962 	if (node->js.ps.lefttree->chgParam == NULL)
963 		ExecReScan(node->js.ps.lefttree);
964 }
965