1 /*-------------------------------------------------------------------------
2 *
3 * nodeHashjoin.c
4 * Routines to handle hash join nodes
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/executor/nodeHashjoin.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16 #include "postgres.h"
17
18 #include "access/htup_details.h"
19 #include "executor/executor.h"
20 #include "executor/hashjoin.h"
21 #include "executor/nodeHash.h"
22 #include "executor/nodeHashjoin.h"
23 #include "miscadmin.h"
24 #include "utils/memutils.h"
25
26
27 /*
28 * States of the ExecHashJoin state machine
29 */
30 #define HJ_BUILD_HASHTABLE 1
31 #define HJ_NEED_NEW_OUTER 2
32 #define HJ_SCAN_BUCKET 3
33 #define HJ_FILL_OUTER_TUPLE 4
34 #define HJ_FILL_INNER_TUPLES 5
35 #define HJ_NEED_NEW_BATCH 6
36
37 /* Returns true if doing null-fill on outer relation */
38 #define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL)
39 /* Returns true if doing null-fill on inner relation */
40 #define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL)
41
42 static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
43 HashJoinState *hjstate,
44 uint32 *hashvalue);
45 static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
46 BufFile *file,
47 uint32 *hashvalue,
48 TupleTableSlot *tupleSlot);
49 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
50
51
52 /* ----------------------------------------------------------------
53 * ExecHashJoin
54 *
55 * This function implements the Hybrid Hashjoin algorithm.
56 *
57 * Note: the relation we build hash table on is the "inner"
58 * the other one is "outer".
59 * ----------------------------------------------------------------
60 */
61 static TupleTableSlot * /* return: a tuple or NULL */
ExecHashJoin(PlanState * pstate)62 ExecHashJoin(PlanState *pstate)
63 {
64 HashJoinState *node = castNode(HashJoinState, pstate);
65 PlanState *outerNode;
66 HashState *hashNode;
67 ExprState *joinqual;
68 ExprState *otherqual;
69 ExprContext *econtext;
70 HashJoinTable hashtable;
71 TupleTableSlot *outerTupleSlot;
72 uint32 hashvalue;
73 int batchno;
74
75 /*
76 * get information from HashJoin node
77 */
78 joinqual = node->js.joinqual;
79 otherqual = node->js.ps.qual;
80 hashNode = (HashState *) innerPlanState(node);
81 outerNode = outerPlanState(node);
82 hashtable = node->hj_HashTable;
83 econtext = node->js.ps.ps_ExprContext;
84
85 /*
86 * Reset per-tuple memory context to free any expression evaluation
87 * storage allocated in the previous tuple cycle.
88 */
89 ResetExprContext(econtext);
90
91 /*
92 * run the hash join state machine
93 */
94 for (;;)
95 {
96 /*
97 * It's possible to iterate this loop many times before returning a
98 * tuple, in some pathological cases such as needing to move much of
99 * the current batch to a later batch. So let's check for interrupts
100 * each time through.
101 */
102 CHECK_FOR_INTERRUPTS();
103
104 switch (node->hj_JoinState)
105 {
106 case HJ_BUILD_HASHTABLE:
107
108 /*
109 * First time through: build hash table for inner relation.
110 */
111 Assert(hashtable == NULL);
112
113 /*
114 * If the outer relation is completely empty, and it's not
115 * right/full join, we can quit without building the hash
116 * table. However, for an inner join it is only a win to
117 * check this when the outer relation's startup cost is less
118 * than the projected cost of building the hash table.
119 * Otherwise it's best to build the hash table first and see
120 * if the inner relation is empty. (When it's a left join, we
121 * should always make this check, since we aren't going to be
122 * able to skip the join on the strength of an empty inner
123 * relation anyway.)
124 *
125 * If we are rescanning the join, we make use of information
126 * gained on the previous scan: don't bother to try the
127 * prefetch if the previous scan found the outer relation
128 * nonempty. This is not 100% reliable since with new
129 * parameters the outer relation might yield different
130 * results, but it's a good heuristic.
131 *
132 * The only way to make the check is to try to fetch a tuple
133 * from the outer plan node. If we succeed, we have to stash
134 * it away for later consumption by ExecHashJoinOuterGetTuple.
135 */
136 if (HJ_FILL_INNER(node))
137 {
138 /* no chance to not build the hash table */
139 node->hj_FirstOuterTupleSlot = NULL;
140 }
141 else if (HJ_FILL_OUTER(node) ||
142 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
143 !node->hj_OuterNotEmpty))
144 {
145 node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
146 if (TupIsNull(node->hj_FirstOuterTupleSlot))
147 {
148 node->hj_OuterNotEmpty = false;
149 return NULL;
150 }
151 else
152 node->hj_OuterNotEmpty = true;
153 }
154 else
155 node->hj_FirstOuterTupleSlot = NULL;
156
157 /*
158 * create the hash table
159 */
160 hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
161 node->hj_HashOperators,
162 HJ_FILL_INNER(node));
163 node->hj_HashTable = hashtable;
164
165 /*
166 * execute the Hash node, to build the hash table
167 */
168 hashNode->hashtable = hashtable;
169 (void) MultiExecProcNode((PlanState *) hashNode);
170
171 /*
172 * If the inner relation is completely empty, and we're not
173 * doing a left outer join, we can quit without scanning the
174 * outer relation.
175 */
176 if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
177 return NULL;
178
179 /*
180 * need to remember whether nbatch has increased since we
181 * began scanning the outer relation
182 */
183 hashtable->nbatch_outstart = hashtable->nbatch;
184
185 /*
186 * Reset OuterNotEmpty for scan. (It's OK if we fetched a
187 * tuple above, because ExecHashJoinOuterGetTuple will
188 * immediately set it again.)
189 */
190 node->hj_OuterNotEmpty = false;
191
192 node->hj_JoinState = HJ_NEED_NEW_OUTER;
193
194 /* FALL THRU */
195
196 case HJ_NEED_NEW_OUTER:
197
198 /*
199 * We don't have an outer tuple, try to get the next one
200 */
201 outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
202 node,
203 &hashvalue);
204 if (TupIsNull(outerTupleSlot))
205 {
206 /* end of batch, or maybe whole join */
207 if (HJ_FILL_INNER(node))
208 {
209 /* set up to scan for unmatched inner tuples */
210 ExecPrepHashTableForUnmatched(node);
211 node->hj_JoinState = HJ_FILL_INNER_TUPLES;
212 }
213 else
214 node->hj_JoinState = HJ_NEED_NEW_BATCH;
215 continue;
216 }
217
218 econtext->ecxt_outertuple = outerTupleSlot;
219 node->hj_MatchedOuter = false;
220
221 /*
222 * Find the corresponding bucket for this tuple in the main
223 * hash table or skew hash table.
224 */
225 node->hj_CurHashValue = hashvalue;
226 ExecHashGetBucketAndBatch(hashtable, hashvalue,
227 &node->hj_CurBucketNo, &batchno);
228 node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable,
229 hashvalue);
230 node->hj_CurTuple = NULL;
231
232 /*
233 * The tuple might not belong to the current batch (where
234 * "current batch" includes the skew buckets if any).
235 */
236 if (batchno != hashtable->curbatch &&
237 node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO)
238 {
239 /*
240 * Need to postpone this outer tuple to a later batch.
241 * Save it in the corresponding outer-batch file.
242 */
243 Assert(batchno > hashtable->curbatch);
244 ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
245 hashvalue,
246 &hashtable->outerBatchFile[batchno]);
247 /* Loop around, staying in HJ_NEED_NEW_OUTER state */
248 continue;
249 }
250
251 /* OK, let's scan the bucket for matches */
252 node->hj_JoinState = HJ_SCAN_BUCKET;
253
254 /* FALL THRU */
255
256 case HJ_SCAN_BUCKET:
257
258 /*
259 * Scan the selected hash bucket for matches to current outer
260 */
261 if (!ExecScanHashBucket(node, econtext))
262 {
263 /* out of matches; check for possible outer-join fill */
264 node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
265 continue;
266 }
267
268 /*
269 * We've got a match, but still need to test non-hashed quals.
270 * ExecScanHashBucket already set up all the state needed to
271 * call ExecQual.
272 *
273 * If we pass the qual, then save state for next call and have
274 * ExecProject form the projection, store it in the tuple
275 * table, and return the slot.
276 *
277 * Only the joinquals determine tuple match status, but all
278 * quals must pass to actually return the tuple.
279 */
280 if (joinqual == NULL || ExecQual(joinqual, econtext))
281 {
282 node->hj_MatchedOuter = true;
283 HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
284
285 /* In an antijoin, we never return a matched tuple */
286 if (node->js.jointype == JOIN_ANTI)
287 {
288 node->hj_JoinState = HJ_NEED_NEW_OUTER;
289 continue;
290 }
291
292 /*
293 * If we only need to join to the first matching inner
294 * tuple, then consider returning this one, but after that
295 * continue with next outer tuple.
296 */
297 if (node->js.single_match)
298 node->hj_JoinState = HJ_NEED_NEW_OUTER;
299
300 if (otherqual == NULL || ExecQual(otherqual, econtext))
301 return ExecProject(node->js.ps.ps_ProjInfo);
302 else
303 InstrCountFiltered2(node, 1);
304 }
305 else
306 InstrCountFiltered1(node, 1);
307 break;
308
309 case HJ_FILL_OUTER_TUPLE:
310
311 /*
312 * The current outer tuple has run out of matches, so check
313 * whether to emit a dummy outer-join tuple. Whether we emit
314 * one or not, the next state is NEED_NEW_OUTER.
315 */
316 node->hj_JoinState = HJ_NEED_NEW_OUTER;
317
318 if (!node->hj_MatchedOuter &&
319 HJ_FILL_OUTER(node))
320 {
321 /*
322 * Generate a fake join tuple with nulls for the inner
323 * tuple, and return it if it passes the non-join quals.
324 */
325 econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;
326
327 if (otherqual == NULL || ExecQual(otherqual, econtext))
328 return ExecProject(node->js.ps.ps_ProjInfo);
329 else
330 InstrCountFiltered2(node, 1);
331 }
332 break;
333
334 case HJ_FILL_INNER_TUPLES:
335
336 /*
337 * We have finished a batch, but we are doing right/full join,
338 * so any unmatched inner tuples in the hashtable have to be
339 * emitted before we continue to the next batch.
340 */
341 if (!ExecScanHashTableForUnmatched(node, econtext))
342 {
343 /* no more unmatched tuples */
344 node->hj_JoinState = HJ_NEED_NEW_BATCH;
345 continue;
346 }
347
348 /*
349 * Generate a fake join tuple with nulls for the outer tuple,
350 * and return it if it passes the non-join quals.
351 */
352 econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
353
354 if (otherqual == NULL || ExecQual(otherqual, econtext))
355 return ExecProject(node->js.ps.ps_ProjInfo);
356 else
357 InstrCountFiltered2(node, 1);
358 break;
359
360 case HJ_NEED_NEW_BATCH:
361
362 /*
363 * Try to advance to next batch. Done if there are no more.
364 */
365 if (!ExecHashJoinNewBatch(node))
366 return NULL; /* end of join */
367 node->hj_JoinState = HJ_NEED_NEW_OUTER;
368 break;
369
370 default:
371 elog(ERROR, "unrecognized hashjoin state: %d",
372 (int) node->hj_JoinState);
373 }
374 }
375 }
376
377 /* ----------------------------------------------------------------
378 * ExecInitHashJoin
379 *
380 * Init routine for HashJoin node.
381 * ----------------------------------------------------------------
382 */
383 HashJoinState *
ExecInitHashJoin(HashJoin * node,EState * estate,int eflags)384 ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
385 {
386 HashJoinState *hjstate;
387 Plan *outerNode;
388 Hash *hashNode;
389 List *lclauses;
390 List *rclauses;
391 List *hoperators;
392 ListCell *l;
393
394 /* check for unsupported flags */
395 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
396
397 /*
398 * create state structure
399 */
400 hjstate = makeNode(HashJoinState);
401 hjstate->js.ps.plan = (Plan *) node;
402 hjstate->js.ps.state = estate;
403 hjstate->js.ps.ExecProcNode = ExecHashJoin;
404
405 /*
406 * Miscellaneous initialization
407 *
408 * create expression context for node
409 */
410 ExecAssignExprContext(estate, &hjstate->js.ps);
411
412 /*
413 * initialize child expressions
414 */
415 hjstate->js.ps.qual =
416 ExecInitQual(node->join.plan.qual, (PlanState *) hjstate);
417 hjstate->js.jointype = node->join.jointype;
418 hjstate->js.joinqual =
419 ExecInitQual(node->join.joinqual, (PlanState *) hjstate);
420 hjstate->hashclauses =
421 ExecInitQual(node->hashclauses, (PlanState *) hjstate);
422
423 /*
424 * initialize child nodes
425 *
426 * Note: we could suppress the REWIND flag for the inner input, which
427 * would amount to betting that the hash will be a single batch. Not
428 * clear if this would be a win or not.
429 */
430 outerNode = outerPlan(node);
431 hashNode = (Hash *) innerPlan(node);
432
433 outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags);
434 innerPlanState(hjstate) = ExecInitNode((Plan *) hashNode, estate, eflags);
435
436 /*
437 * tuple table initialization
438 */
439 ExecInitResultTupleSlot(estate, &hjstate->js.ps);
440 hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate);
441
442 /*
443 * detect whether we need only consider the first matching inner tuple
444 */
445 hjstate->js.single_match = (node->join.inner_unique ||
446 node->join.jointype == JOIN_SEMI);
447
448 /* set up null tuples for outer joins, if needed */
449 switch (node->join.jointype)
450 {
451 case JOIN_INNER:
452 case JOIN_SEMI:
453 break;
454 case JOIN_LEFT:
455 case JOIN_ANTI:
456 hjstate->hj_NullInnerTupleSlot =
457 ExecInitNullTupleSlot(estate,
458 ExecGetResultType(innerPlanState(hjstate)));
459 break;
460 case JOIN_RIGHT:
461 hjstate->hj_NullOuterTupleSlot =
462 ExecInitNullTupleSlot(estate,
463 ExecGetResultType(outerPlanState(hjstate)));
464 break;
465 case JOIN_FULL:
466 hjstate->hj_NullOuterTupleSlot =
467 ExecInitNullTupleSlot(estate,
468 ExecGetResultType(outerPlanState(hjstate)));
469 hjstate->hj_NullInnerTupleSlot =
470 ExecInitNullTupleSlot(estate,
471 ExecGetResultType(innerPlanState(hjstate)));
472 break;
473 default:
474 elog(ERROR, "unrecognized join type: %d",
475 (int) node->join.jointype);
476 }
477
478 /*
479 * now for some voodoo. our temporary tuple slot is actually the result
480 * tuple slot of the Hash node (which is our inner plan). we can do this
481 * because Hash nodes don't return tuples via ExecProcNode() -- instead
482 * the hash join node uses ExecScanHashBucket() to get at the contents of
483 * the hash table. -cim 6/9/91
484 */
485 {
486 HashState *hashstate = (HashState *) innerPlanState(hjstate);
487 TupleTableSlot *slot = hashstate->ps.ps_ResultTupleSlot;
488
489 hjstate->hj_HashTupleSlot = slot;
490 }
491
492 /*
493 * initialize tuple type and projection info
494 */
495 ExecAssignResultTypeFromTL(&hjstate->js.ps);
496 ExecAssignProjectionInfo(&hjstate->js.ps, NULL);
497
498 ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot,
499 ExecGetResultType(outerPlanState(hjstate)));
500
501 /*
502 * initialize hash-specific info
503 */
504 hjstate->hj_HashTable = NULL;
505 hjstate->hj_FirstOuterTupleSlot = NULL;
506
507 hjstate->hj_CurHashValue = 0;
508 hjstate->hj_CurBucketNo = 0;
509 hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
510 hjstate->hj_CurTuple = NULL;
511
512 /*
513 * Deconstruct the hash clauses into outer and inner argument values, so
514 * that we can evaluate those subexpressions separately. Also make a list
515 * of the hash operator OIDs, in preparation for looking up the hash
516 * functions to use.
517 */
518 lclauses = NIL;
519 rclauses = NIL;
520 hoperators = NIL;
521 foreach(l, node->hashclauses)
522 {
523 OpExpr *hclause = lfirst_node(OpExpr, l);
524
525 lclauses = lappend(lclauses, ExecInitExpr(linitial(hclause->args),
526 (PlanState *) hjstate));
527 rclauses = lappend(rclauses, ExecInitExpr(lsecond(hclause->args),
528 (PlanState *) hjstate));
529 hoperators = lappend_oid(hoperators, hclause->opno);
530 }
531 hjstate->hj_OuterHashKeys = lclauses;
532 hjstate->hj_InnerHashKeys = rclauses;
533 hjstate->hj_HashOperators = hoperators;
534 /* child Hash node needs to evaluate inner hash keys, too */
535 ((HashState *) innerPlanState(hjstate))->hashkeys = rclauses;
536
537 hjstate->hj_JoinState = HJ_BUILD_HASHTABLE;
538 hjstate->hj_MatchedOuter = false;
539 hjstate->hj_OuterNotEmpty = false;
540
541 return hjstate;
542 }
543
544 /* ----------------------------------------------------------------
545 * ExecEndHashJoin
546 *
547 * clean up routine for HashJoin node
548 * ----------------------------------------------------------------
549 */
550 void
ExecEndHashJoin(HashJoinState * node)551 ExecEndHashJoin(HashJoinState *node)
552 {
553 /*
554 * Free hash table
555 */
556 if (node->hj_HashTable)
557 {
558 ExecHashTableDestroy(node->hj_HashTable);
559 node->hj_HashTable = NULL;
560 }
561
562 /*
563 * Free the exprcontext
564 */
565 ExecFreeExprContext(&node->js.ps);
566
567 /*
568 * clean out the tuple table
569 */
570 ExecClearTuple(node->js.ps.ps_ResultTupleSlot);
571 ExecClearTuple(node->hj_OuterTupleSlot);
572 ExecClearTuple(node->hj_HashTupleSlot);
573
574 /*
575 * clean up subtrees
576 */
577 ExecEndNode(outerPlanState(node));
578 ExecEndNode(innerPlanState(node));
579 }
580
581 /*
582 * ExecHashJoinOuterGetTuple
583 *
584 * get the next outer tuple for hashjoin: either by
585 * executing the outer plan node in the first pass, or from
586 * the temp files for the hashjoin batches.
587 *
588 * Returns a null slot if no more outer tuples (within the current batch).
589 *
590 * On success, the tuple's hash value is stored at *hashvalue --- this is
591 * either originally computed, or re-read from the temp file.
592 */
593 static TupleTableSlot *
ExecHashJoinOuterGetTuple(PlanState * outerNode,HashJoinState * hjstate,uint32 * hashvalue)594 ExecHashJoinOuterGetTuple(PlanState *outerNode,
595 HashJoinState *hjstate,
596 uint32 *hashvalue)
597 {
598 HashJoinTable hashtable = hjstate->hj_HashTable;
599 int curbatch = hashtable->curbatch;
600 TupleTableSlot *slot;
601
602 if (curbatch == 0) /* if it is the first pass */
603 {
604 /*
605 * Check to see if first outer tuple was already fetched by
606 * ExecHashJoin() and not used yet.
607 */
608 slot = hjstate->hj_FirstOuterTupleSlot;
609 if (!TupIsNull(slot))
610 hjstate->hj_FirstOuterTupleSlot = NULL;
611 else
612 slot = ExecProcNode(outerNode);
613
614 while (!TupIsNull(slot))
615 {
616 /*
617 * We have to compute the tuple's hash value.
618 */
619 ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
620
621 econtext->ecxt_outertuple = slot;
622 if (ExecHashGetHashValue(hashtable, econtext,
623 hjstate->hj_OuterHashKeys,
624 true, /* outer tuple */
625 HJ_FILL_OUTER(hjstate),
626 hashvalue))
627 {
628 /* remember outer relation is not empty for possible rescan */
629 hjstate->hj_OuterNotEmpty = true;
630
631 return slot;
632 }
633
634 /*
635 * That tuple couldn't match because of a NULL, so discard it and
636 * continue with the next one.
637 */
638 slot = ExecProcNode(outerNode);
639 }
640 }
641 else if (curbatch < hashtable->nbatch)
642 {
643 BufFile *file = hashtable->outerBatchFile[curbatch];
644
645 /*
646 * In outer-join cases, we could get here even though the batch file
647 * is empty.
648 */
649 if (file == NULL)
650 return NULL;
651
652 slot = ExecHashJoinGetSavedTuple(hjstate,
653 file,
654 hashvalue,
655 hjstate->hj_OuterTupleSlot);
656 if (!TupIsNull(slot))
657 return slot;
658 }
659
660 /* End of this batch */
661 return NULL;
662 }
663
664 /*
665 * ExecHashJoinNewBatch
666 * switch to a new hashjoin batch
667 *
668 * Returns true if successful, false if there are no more batches.
669 */
670 static bool
ExecHashJoinNewBatch(HashJoinState * hjstate)671 ExecHashJoinNewBatch(HashJoinState *hjstate)
672 {
673 HashJoinTable hashtable = hjstate->hj_HashTable;
674 int nbatch;
675 int curbatch;
676 BufFile *innerFile;
677 TupleTableSlot *slot;
678 uint32 hashvalue;
679
680 nbatch = hashtable->nbatch;
681 curbatch = hashtable->curbatch;
682
683 if (curbatch > 0)
684 {
685 /*
686 * We no longer need the previous outer batch file; close it right
687 * away to free disk space.
688 */
689 if (hashtable->outerBatchFile[curbatch])
690 BufFileClose(hashtable->outerBatchFile[curbatch]);
691 hashtable->outerBatchFile[curbatch] = NULL;
692 }
693 else /* we just finished the first batch */
694 {
695 /*
696 * Reset some of the skew optimization state variables, since we no
697 * longer need to consider skew tuples after the first batch. The
698 * memory context reset we are about to do will release the skew
699 * hashtable itself.
700 */
701 hashtable->skewEnabled = false;
702 hashtable->skewBucket = NULL;
703 hashtable->skewBucketNums = NULL;
704 hashtable->nSkewBuckets = 0;
705 hashtable->spaceUsedSkew = 0;
706 }
707
708 /*
709 * We can always skip over any batches that are completely empty on both
710 * sides. We can sometimes skip over batches that are empty on only one
711 * side, but there are exceptions:
712 *
713 * 1. In a left/full outer join, we have to process outer batches even if
714 * the inner batch is empty. Similarly, in a right/full outer join, we
715 * have to process inner batches even if the outer batch is empty.
716 *
717 * 2. If we have increased nbatch since the initial estimate, we have to
718 * scan inner batches since they might contain tuples that need to be
719 * reassigned to later inner batches.
720 *
721 * 3. Similarly, if we have increased nbatch since starting the outer
722 * scan, we have to rescan outer batches in case they contain tuples that
723 * need to be reassigned.
724 */
725 curbatch++;
726 while (curbatch < nbatch &&
727 (hashtable->outerBatchFile[curbatch] == NULL ||
728 hashtable->innerBatchFile[curbatch] == NULL))
729 {
730 if (hashtable->outerBatchFile[curbatch] &&
731 HJ_FILL_OUTER(hjstate))
732 break; /* must process due to rule 1 */
733 if (hashtable->innerBatchFile[curbatch] &&
734 HJ_FILL_INNER(hjstate))
735 break; /* must process due to rule 1 */
736 if (hashtable->innerBatchFile[curbatch] &&
737 nbatch != hashtable->nbatch_original)
738 break; /* must process due to rule 2 */
739 if (hashtable->outerBatchFile[curbatch] &&
740 nbatch != hashtable->nbatch_outstart)
741 break; /* must process due to rule 3 */
742 /* We can ignore this batch. */
743 /* Release associated temp files right away. */
744 if (hashtable->innerBatchFile[curbatch])
745 BufFileClose(hashtable->innerBatchFile[curbatch]);
746 hashtable->innerBatchFile[curbatch] = NULL;
747 if (hashtable->outerBatchFile[curbatch])
748 BufFileClose(hashtable->outerBatchFile[curbatch]);
749 hashtable->outerBatchFile[curbatch] = NULL;
750 curbatch++;
751 }
752
753 if (curbatch >= nbatch)
754 return false; /* no more batches */
755
756 hashtable->curbatch = curbatch;
757
758 /*
759 * Reload the hash table with the new inner batch (which could be empty)
760 */
761 ExecHashTableReset(hashtable);
762
763 innerFile = hashtable->innerBatchFile[curbatch];
764
765 if (innerFile != NULL)
766 {
767 if (BufFileSeek(innerFile, 0, 0L, SEEK_SET))
768 ereport(ERROR,
769 (errcode_for_file_access(),
770 errmsg("could not rewind hash-join temporary file")));
771
772 while ((slot = ExecHashJoinGetSavedTuple(hjstate,
773 innerFile,
774 &hashvalue,
775 hjstate->hj_HashTupleSlot)))
776 {
777 /*
778 * NOTE: some tuples may be sent to future batches. Also, it is
779 * possible for hashtable->nbatch to be increased here!
780 */
781 ExecHashTableInsert(hashtable, slot, hashvalue);
782 }
783
784 /*
785 * after we build the hash table, the inner batch file is no longer
786 * needed
787 */
788 BufFileClose(innerFile);
789 hashtable->innerBatchFile[curbatch] = NULL;
790 }
791
792 /*
793 * Rewind outer batch file (if present), so that we can start reading it.
794 */
795 if (hashtable->outerBatchFile[curbatch] != NULL)
796 {
797 if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET))
798 ereport(ERROR,
799 (errcode_for_file_access(),
800 errmsg("could not rewind hash-join temporary file")));
801 }
802
803 return true;
804 }
805
806 /*
807 * ExecHashJoinSaveTuple
808 * save a tuple to a batch file.
809 *
810 * The data recorded in the file for each tuple is its hash value,
811 * then the tuple in MinimalTuple format.
812 *
813 * Note: it is important always to call this in the regular executor
814 * context, not in a shorter-lived context; else the temp file buffers
815 * will get messed up.
816 */
817 void
ExecHashJoinSaveTuple(MinimalTuple tuple,uint32 hashvalue,BufFile ** fileptr)818 ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
819 BufFile **fileptr)
820 {
821 BufFile *file = *fileptr;
822
823 if (file == NULL)
824 {
825 /* First write to this batch file, so open it. */
826 file = BufFileCreateTemp(false);
827 *fileptr = file;
828 }
829
830 BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
831 BufFileWrite(file, (void *) tuple, tuple->t_len);
832 }
833
834 /*
835 * ExecHashJoinGetSavedTuple
836 * read the next tuple from a batch file. Return NULL if no more.
837 *
838 * On success, *hashvalue is set to the tuple's hash value, and the tuple
839 * itself is stored in the given slot.
840 */
841 static TupleTableSlot *
ExecHashJoinGetSavedTuple(HashJoinState * hjstate,BufFile * file,uint32 * hashvalue,TupleTableSlot * tupleSlot)842 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
843 BufFile *file,
844 uint32 *hashvalue,
845 TupleTableSlot *tupleSlot)
846 {
847 uint32 header[2];
848 size_t nread;
849 MinimalTuple tuple;
850
851 /*
852 * We check for interrupts here because this is typically taken as an
853 * alternative code path to an ExecProcNode() call, which would include
854 * such a check.
855 */
856 CHECK_FOR_INTERRUPTS();
857
858 /*
859 * Since both the hash value and the MinimalTuple length word are uint32,
860 * we can read them both in one BufFileRead() call without any type
861 * cheating.
862 */
863 nread = BufFileRead(file, (void *) header, sizeof(header));
864 if (nread == 0) /* end of file */
865 {
866 ExecClearTuple(tupleSlot);
867 return NULL;
868 }
869 if (nread != sizeof(header))
870 ereport(ERROR,
871 (errcode_for_file_access(),
872 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
873 nread, sizeof(header))));
874 *hashvalue = header[0];
875 tuple = (MinimalTuple) palloc(header[1]);
876 tuple->t_len = header[1];
877 nread = BufFileRead(file,
878 (void *) ((char *) tuple + sizeof(uint32)),
879 header[1] - sizeof(uint32));
880 if (nread != header[1] - sizeof(uint32))
881 ereport(ERROR,
882 (errcode_for_file_access(),
883 errmsg("could not read from hash-join temporary file: read only %zu of %zu bytes",
884 nread, header[1] - sizeof(uint32))));
885 return ExecStoreMinimalTuple(tuple, tupleSlot, true);
886 }
887
888
889 void
ExecReScanHashJoin(HashJoinState * node)890 ExecReScanHashJoin(HashJoinState *node)
891 {
892 /*
893 * In a multi-batch join, we currently have to do rescans the hard way,
894 * primarily because batch temp files may have already been released. But
895 * if it's a single-batch join, and there is no parameter change for the
896 * inner subnode, then we can just re-use the existing hash table without
897 * rebuilding it.
898 */
899 if (node->hj_HashTable != NULL)
900 {
901 if (node->hj_HashTable->nbatch == 1 &&
902 node->js.ps.righttree->chgParam == NULL)
903 {
904 /*
905 * Okay to reuse the hash table; needn't rescan inner, either.
906 *
907 * However, if it's a right/full join, we'd better reset the
908 * inner-tuple match flags contained in the table.
909 */
910 if (HJ_FILL_INNER(node))
911 ExecHashTableResetMatchFlags(node->hj_HashTable);
912
913 /*
914 * Also, we need to reset our state about the emptiness of the
915 * outer relation, so that the new scan of the outer will update
916 * it correctly if it turns out to be empty this time. (There's no
917 * harm in clearing it now because ExecHashJoin won't need the
918 * info. In the other cases, where the hash table doesn't exist
919 * or we are destroying it, we leave this state alone because
920 * ExecHashJoin will need it the first time through.)
921 */
922 node->hj_OuterNotEmpty = false;
923
924 /* ExecHashJoin can skip the BUILD_HASHTABLE step */
925 node->hj_JoinState = HJ_NEED_NEW_OUTER;
926 }
927 else
928 {
929 /* must destroy and rebuild hash table */
930 HashState *hashNode = castNode(HashState, innerPlanState(node));
931
932 /* for safety, be sure to clear child plan node's pointer too */
933 Assert(hashNode->hashtable == node->hj_HashTable);
934 hashNode->hashtable = NULL;
935
936 ExecHashTableDestroy(node->hj_HashTable);
937 node->hj_HashTable = NULL;
938 node->hj_JoinState = HJ_BUILD_HASHTABLE;
939
940 /*
941 * if chgParam of subnode is not null then plan will be re-scanned
942 * by first ExecProcNode.
943 */
944 if (node->js.ps.righttree->chgParam == NULL)
945 ExecReScan(node->js.ps.righttree);
946 }
947 }
948
949 /* Always reset intra-tuple state */
950 node->hj_CurHashValue = 0;
951 node->hj_CurBucketNo = 0;
952 node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO;
953 node->hj_CurTuple = NULL;
954
955 node->hj_MatchedOuter = false;
956 node->hj_FirstOuterTupleSlot = NULL;
957
958 /*
959 * if chgParam of subnode is not null then plan will be re-scanned by
960 * first ExecProcNode.
961 */
962 if (node->js.ps.lefttree->chgParam == NULL)
963 ExecReScan(node->js.ps.lefttree);
964 }
965