1 /*-------------------------------------------------------------------------
2  *
3  * gistbuildbuffers.c
4  *	  node buffer management functions for GiST buffering build algorithm.
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/gist/gistbuildbuffers.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/gist_private.h"
19 #include "catalog/index.h"
20 #include "miscadmin.h"
21 #include "storage/buffile.h"
22 #include "storage/bufmgr.h"
23 #include "utils/memutils.h"
24 #include "utils/rel.h"
25 
26 static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
27 static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
28 								GISTNodeBuffer *nodeBuffer);
29 static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
30 							   GISTNodeBuffer *nodeBuffer);
31 static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
32 								 GISTNodeBuffer *nodeBuffer);
33 static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
34 								IndexTuple item);
35 static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
36 								IndexTuple *item);
37 static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
38 static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
39 
40 static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
41 static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
42 
43 
44 /*
45  * Initialize GiST build buffers.
46  */
47 GISTBuildBuffers *
48 gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
49 {
50 	GISTBuildBuffers *gfbb;
51 	HASHCTL		hashCtl;
52 
53 	gfbb = palloc(sizeof(GISTBuildBuffers));
54 	gfbb->pagesPerBuffer = pagesPerBuffer;
55 	gfbb->levelStep = levelStep;
56 
57 	/*
58 	 * Create a temporary file to hold buffer pages that are swapped out of
59 	 * memory.
60 	 */
61 	gfbb->pfile = BufFileCreateTemp(false);
62 	gfbb->nFileBlocks = 0;
63 
64 	/* Initialize free page management. */
65 	gfbb->nFreeBlocks = 0;
66 	gfbb->freeBlocksLen = 32;
67 	gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
68 
69 	/*
70 	 * Current memory context will be used for all in-memory data structures
71 	 * of buffers which are persistent during buffering build.
72 	 */
73 	gfbb->context = CurrentMemoryContext;
74 
75 	/*
76 	 * nodeBuffersTab hash is association between index blocks and it's
77 	 * buffers.
78 	 */
79 	memset(&hashCtl, 0, sizeof(hashCtl));
80 	hashCtl.keysize = sizeof(BlockNumber);
81 	hashCtl.entrysize = sizeof(GISTNodeBuffer);
82 	hashCtl.hcxt = CurrentMemoryContext;
83 	gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
84 									   1024,
85 									   &hashCtl,
86 									   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
87 
88 	gfbb->bufferEmptyingQueue = NIL;
89 
90 	/*
91 	 * Per-level node buffers lists for final buffers emptying process. Node
92 	 * buffers are inserted here when they are created.
93 	 */
94 	gfbb->buffersOnLevelsLen = 1;
95 	gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
96 											 gfbb->buffersOnLevelsLen);
97 	gfbb->buffersOnLevels[0] = NIL;
98 
99 	/*
100 	 * Block numbers of node buffers which last pages are currently loaded
101 	 * into main memory.
102 	 */
103 	gfbb->loadedBuffersLen = 32;
104 	gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
105 													 sizeof(GISTNodeBuffer *));
106 	gfbb->loadedBuffersCount = 0;
107 
108 	gfbb->rootlevel = maxLevel;
109 
110 	return gfbb;
111 }
112 
113 /*
114  * Returns a node buffer for given block. The buffer is created if it
115  * doesn't exist yet.
116  */
117 GISTNodeBuffer *
118 gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
119 				  BlockNumber nodeBlocknum, int level)
120 {
121 	GISTNodeBuffer *nodeBuffer;
122 	bool		found;
123 
124 	/* Find node buffer in hash table */
125 	nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
126 												(const void *) &nodeBlocknum,
127 												HASH_ENTER,
128 												&found);
129 	if (!found)
130 	{
131 		/*
132 		 * Node buffer wasn't found. Initialize the new buffer as empty.
133 		 */
134 		MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
135 
136 		/* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
137 		nodeBuffer->blocksCount = 0;
138 		nodeBuffer->pageBlocknum = InvalidBlockNumber;
139 		nodeBuffer->pageBuffer = NULL;
140 		nodeBuffer->queuedForEmptying = false;
141 		nodeBuffer->isTemp = false;
142 		nodeBuffer->level = level;
143 
144 		/*
145 		 * Add this buffer to the list of buffers on this level. Enlarge
146 		 * buffersOnLevels array if needed.
147 		 */
148 		if (level >= gfbb->buffersOnLevelsLen)
149 		{
150 			int			i;
151 
152 			gfbb->buffersOnLevels =
153 				(List **) repalloc(gfbb->buffersOnLevels,
154 								   (level + 1) * sizeof(List *));
155 
156 			/* initialize the enlarged portion */
157 			for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
158 				gfbb->buffersOnLevels[i] = NIL;
159 			gfbb->buffersOnLevelsLen = level + 1;
160 		}
161 
162 		/*
163 		 * Prepend the new buffer to the list of buffers on this level. It's
164 		 * not arbitrary that the new buffer is put to the beginning of the
165 		 * list: in the final emptying phase we loop through all buffers at
166 		 * each level, and flush them. If a page is split during the emptying,
167 		 * it's more efficient to flush the new splitted pages first, before
168 		 * moving on to pre-existing pages on the level. The buffers just
169 		 * created during the page split are likely still in cache, so
170 		 * flushing them immediately is more efficient than putting them to
171 		 * the end of the queue.
172 		 */
173 		gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
174 											 gfbb->buffersOnLevels[level]);
175 
176 		MemoryContextSwitchTo(oldcxt);
177 	}
178 
179 	return nodeBuffer;
180 }
181 
182 /*
183  * Allocate memory for a buffer page.
184  */
185 static GISTNodeBufferPage *
186 gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
187 {
188 	GISTNodeBufferPage *pageBuffer;
189 
190 	pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
191 															   BLCKSZ);
192 	pageBuffer->prev = InvalidBlockNumber;
193 
194 	/* Set page free space */
195 	PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
196 	return pageBuffer;
197 }
198 
199 /*
200  * Add specified buffer into loadedBuffers array.
201  */
202 static void
203 gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
204 {
205 	/* Never add a temporary buffer to the array */
206 	if (nodeBuffer->isTemp)
207 		return;
208 
209 	/* Enlarge the array if needed */
210 	if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
211 	{
212 		gfbb->loadedBuffersLen *= 2;
213 		gfbb->loadedBuffers = (GISTNodeBuffer **)
214 			repalloc(gfbb->loadedBuffers,
215 					 gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
216 	}
217 
218 	gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
219 	gfbb->loadedBuffersCount++;
220 }
221 
222 /*
223  * Load last page of node buffer into main memory.
224  */
225 static void
226 gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
227 {
228 	/* Check if we really should load something */
229 	if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
230 	{
231 		/* Allocate memory for page */
232 		nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
233 
234 		/* Read block from temporary file */
235 		ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
236 						  nodeBuffer->pageBuffer);
237 
238 		/* Mark file block as free */
239 		gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
240 
241 		/* Mark node buffer as loaded */
242 		gistAddLoadedBuffer(gfbb, nodeBuffer);
243 		nodeBuffer->pageBlocknum = InvalidBlockNumber;
244 	}
245 }
246 
247 /*
248  * Write last page of node buffer to the disk.
249  */
250 static void
251 gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
252 {
253 	/* Check if we have something to write */
254 	if (nodeBuffer->pageBuffer)
255 	{
256 		BlockNumber blkno;
257 
258 		/* Get free file block */
259 		blkno = gistBuffersGetFreeBlock(gfbb);
260 
261 		/* Write block to the temporary file */
262 		WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
263 
264 		/* Free memory of that page */
265 		pfree(nodeBuffer->pageBuffer);
266 		nodeBuffer->pageBuffer = NULL;
267 
268 		/* Save block number */
269 		nodeBuffer->pageBlocknum = blkno;
270 	}
271 }
272 
273 /*
274  * Write last pages of all node buffers to the disk.
275  */
276 void
277 gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
278 {
279 	int			i;
280 
281 	/* Unload all the buffers that have a page loaded in memory. */
282 	for (i = 0; i < gfbb->loadedBuffersCount; i++)
283 		gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
284 
285 	/* Now there are no node buffers with loaded last page */
286 	gfbb->loadedBuffersCount = 0;
287 }
288 
289 /*
290  * Add index tuple to buffer page.
291  */
292 static void
293 gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
294 {
295 	Size		itupsz = IndexTupleSize(itup);
296 	char	   *ptr;
297 
298 	/* There should be enough of space. */
299 	Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
300 
301 	/* Reduce free space value of page to reserve a spot for the tuple. */
302 	PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
303 
304 	/* Get pointer to the spot we reserved (ie. end of free space). */
305 	ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
306 		+ PAGE_FREE_SPACE(pageBuffer);
307 
308 	/* Copy the index tuple there. */
309 	memcpy(ptr, itup, itupsz);
310 }
311 
312 /*
313  * Get last item from buffer page and remove it from page.
314  */
315 static void
316 gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
317 {
318 	IndexTuple	ptr;
319 	Size		itupsz;
320 
321 	Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
322 
323 	/* Get pointer to last index tuple */
324 	ptr = (IndexTuple) ((char *) pageBuffer
325 						+ BUFFER_PAGE_DATA_OFFSET
326 						+ PAGE_FREE_SPACE(pageBuffer));
327 	itupsz = IndexTupleSize(ptr);
328 
329 	/* Make a copy of the tuple */
330 	*itup = (IndexTuple) palloc(itupsz);
331 	memcpy(*itup, ptr, itupsz);
332 
333 	/* Mark the space used by the tuple as free */
334 	PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
335 }
336 
337 /*
338  * Push an index tuple to node buffer.
339  */
340 void
341 gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
342 						 IndexTuple itup)
343 {
344 	/*
345 	 * Most part of memory operations will be in buffering build persistent
346 	 * context. So, let's switch to it.
347 	 */
348 	MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
349 
350 	/*
351 	 * If the buffer is currently empty, create the first page.
352 	 */
353 	if (nodeBuffer->blocksCount == 0)
354 	{
355 		nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
356 		nodeBuffer->blocksCount = 1;
357 		gistAddLoadedBuffer(gfbb, nodeBuffer);
358 	}
359 
360 	/* Load last page of node buffer if it wasn't in memory already */
361 	if (!nodeBuffer->pageBuffer)
362 		gistLoadNodeBuffer(gfbb, nodeBuffer);
363 
364 	/*
365 	 * Check if there is enough space on the last page for the tuple.
366 	 */
367 	if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
368 	{
369 		/*
370 		 * Nope. Swap previous block to disk and allocate a new one.
371 		 */
372 		BlockNumber blkno;
373 
374 		/* Write filled page to the disk */
375 		blkno = gistBuffersGetFreeBlock(gfbb);
376 		WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
377 
378 		/*
379 		 * Reset the in-memory page as empty, and link the previous block to
380 		 * the new page by storing its block number in the prev-link.
381 		 */
382 		PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
383 			BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
384 		nodeBuffer->pageBuffer->prev = blkno;
385 
386 		/* We've just added one more page */
387 		nodeBuffer->blocksCount++;
388 	}
389 
390 	gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
391 
392 	/*
393 	 * If the buffer just overflowed, add it to the emptying queue.
394 	 */
395 	if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
396 	{
397 		gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
398 										  gfbb->bufferEmptyingQueue);
399 		nodeBuffer->queuedForEmptying = true;
400 	}
401 
402 	/* Restore memory context */
403 	MemoryContextSwitchTo(oldcxt);
404 }
405 
406 /*
407  * Removes one index tuple from node buffer. Returns true if success and false
408  * if node buffer is empty.
409  */
410 bool
411 gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
412 						  IndexTuple *itup)
413 {
414 	/*
415 	 * If node buffer is empty then return false.
416 	 */
417 	if (nodeBuffer->blocksCount <= 0)
418 		return false;
419 
420 	/* Load last page of node buffer if needed */
421 	if (!nodeBuffer->pageBuffer)
422 		gistLoadNodeBuffer(gfbb, nodeBuffer);
423 
424 	/*
425 	 * Get index tuple from last non-empty page.
426 	 */
427 	gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
428 
429 	/*
430 	 * If we just removed the last tuple from the page, fetch previous page on
431 	 * this node buffer (if any).
432 	 */
433 	if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
434 	{
435 		BlockNumber prevblkno;
436 
437 		/*
438 		 * blocksCount includes the page in pageBuffer, so decrease it now.
439 		 */
440 		nodeBuffer->blocksCount--;
441 
442 		/*
443 		 * If there's more pages, fetch previous one.
444 		 */
445 		prevblkno = nodeBuffer->pageBuffer->prev;
446 		if (prevblkno != InvalidBlockNumber)
447 		{
448 			/* There is a previous page. Fetch it. */
449 			Assert(nodeBuffer->blocksCount > 0);
450 			ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
451 
452 			/*
453 			 * Now that we've read the block in memory, we can release its
454 			 * on-disk block for reuse.
455 			 */
456 			gistBuffersReleaseBlock(gfbb, prevblkno);
457 		}
458 		else
459 		{
460 			/* No more pages. Free memory. */
461 			Assert(nodeBuffer->blocksCount == 0);
462 			pfree(nodeBuffer->pageBuffer);
463 			nodeBuffer->pageBuffer = NULL;
464 		}
465 	}
466 	return true;
467 }
468 
469 /*
470  * Select a currently unused block for writing to.
471  */
472 static long
473 gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
474 {
475 	/*
476 	 * If there are multiple free blocks, we select the one appearing last in
477 	 * freeBlocks[].  If there are none, assign the next block at the end of
478 	 * the file (causing the file to be extended).
479 	 */
480 	if (gfbb->nFreeBlocks > 0)
481 		return gfbb->freeBlocks[--gfbb->nFreeBlocks];
482 	else
483 		return gfbb->nFileBlocks++;
484 }
485 
486 /*
487  * Return a block# to the freelist.
488  */
489 static void
490 gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
491 {
492 	int			ndx;
493 
494 	/* Enlarge freeBlocks array if full. */
495 	if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
496 	{
497 		gfbb->freeBlocksLen *= 2;
498 		gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
499 											 gfbb->freeBlocksLen *
500 											 sizeof(long));
501 	}
502 
503 	/* Add blocknum to array */
504 	ndx = gfbb->nFreeBlocks++;
505 	gfbb->freeBlocks[ndx] = blocknum;
506 }
507 
508 /*
509  * Free buffering build data structure.
510  */
511 void
512 gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
513 {
514 	/* Close buffers file. */
515 	BufFileClose(gfbb->pfile);
516 
517 	/* All other things will be freed on memory context release */
518 }
519 
520 /*
521  * Data structure representing information about node buffer for index tuples
522  * relocation from splitted node buffer.
523  */
524 typedef struct
525 {
526 	GISTENTRY	entry[INDEX_MAX_KEYS];
527 	bool		isnull[INDEX_MAX_KEYS];
528 	GISTPageSplitInfo *splitinfo;
529 	GISTNodeBuffer *nodeBuffer;
530 } RelocationBufferInfo;
531 
532 /*
533  * At page split, distribute tuples from the buffer of the split page to
534  * new buffers for the created page halves. This also adjusts the downlinks
535  * in 'splitinfo' to include the tuples in the buffers.
536  */
537 void
538 gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
539 								Relation r, int level,
540 								Buffer buffer, List *splitinfo)
541 {
542 	RelocationBufferInfo *relocationBuffersInfos;
543 	bool		found;
544 	GISTNodeBuffer *nodeBuffer;
545 	BlockNumber blocknum;
546 	IndexTuple	itup;
547 	int			splitPagesCount = 0,
548 				i;
549 	GISTENTRY	entry[INDEX_MAX_KEYS];
550 	bool		isnull[INDEX_MAX_KEYS];
551 	GISTNodeBuffer oldBuf;
552 	ListCell   *lc;
553 
554 	/* If the splitted page doesn't have buffers, we have nothing to do. */
555 	if (!LEVEL_HAS_BUFFERS(level, gfbb))
556 		return;
557 
558 	/*
559 	 * Get the node buffer of the splitted page.
560 	 */
561 	blocknum = BufferGetBlockNumber(buffer);
562 	nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
563 							 HASH_FIND, &found);
564 	if (!found)
565 	{
566 		/* The page has no buffer, so we have nothing to do. */
567 		return;
568 	}
569 
570 	/*
571 	 * Make a copy of the old buffer, as we're going reuse it as the buffer
572 	 * for the new left page, which is on the same block as the old page.
573 	 * That's not true for the root page, but that's fine because we never
574 	 * have a buffer on the root page anyway. The original algorithm as
575 	 * described by Arge et al did, but it's of no use, as you might as well
576 	 * read the tuples straight from the heap instead of the root buffer.
577 	 */
578 	Assert(blocknum != GIST_ROOT_BLKNO);
579 	memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
580 	oldBuf.isTemp = true;
581 
582 	/* Reset the old buffer, used for the new left page from now on */
583 	nodeBuffer->blocksCount = 0;
584 	nodeBuffer->pageBuffer = NULL;
585 	nodeBuffer->pageBlocknum = InvalidBlockNumber;
586 
587 	/*
588 	 * Allocate memory for information about relocation buffers.
589 	 */
590 	splitPagesCount = list_length(splitinfo);
591 	relocationBuffersInfos =
592 		(RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
593 										splitPagesCount);
594 
595 	/*
596 	 * Fill relocation buffers information for node buffers of pages produced
597 	 * by split.
598 	 */
599 	i = 0;
600 	foreach(lc, splitinfo)
601 	{
602 		GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
603 		GISTNodeBuffer *newNodeBuffer;
604 
605 		/* Decompress parent index tuple of node buffer page. */
606 		gistDeCompressAtt(giststate, r,
607 						  si->downlink, NULL, (OffsetNumber) 0,
608 						  relocationBuffersInfos[i].entry,
609 						  relocationBuffersInfos[i].isnull);
610 
611 		/*
612 		 * Create a node buffer for the page. The leftmost half is on the same
613 		 * block as the old page before split, so for the leftmost half this
614 		 * will return the original buffer. The tuples on the original buffer
615 		 * were relinked to the temporary buffer, so the original one is now
616 		 * empty.
617 		 */
618 		newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
619 
620 		relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
621 		relocationBuffersInfos[i].splitinfo = si;
622 
623 		i++;
624 	}
625 
626 	/*
627 	 * Loop through all index tuples in the buffer of the page being split,
628 	 * moving them to buffers for the new pages.  We try to move each tuple to
629 	 * the page that will result in the lowest penalty for the leading column
630 	 * or, in the case of a tie, the lowest penalty for the earliest column
631 	 * that is not tied.
632 	 *
633 	 * The page searching logic is very similar to gistchoose().
634 	 */
635 	while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
636 	{
637 		float		best_penalty[INDEX_MAX_KEYS];
638 		int			i,
639 					which;
640 		IndexTuple	newtup;
641 		RelocationBufferInfo *targetBufferInfo;
642 
643 		gistDeCompressAtt(giststate, r,
644 						  itup, NULL, (OffsetNumber) 0, entry, isnull);
645 
646 		/* default to using first page (shouldn't matter) */
647 		which = 0;
648 
649 		/*
650 		 * best_penalty[j] is the best penalty we have seen so far for column
651 		 * j, or -1 when we haven't yet examined column j.  Array entries to
652 		 * the right of the first -1 are undefined.
653 		 */
654 		best_penalty[0] = -1;
655 
656 		/*
657 		 * Loop over possible target pages, looking for one to move this tuple
658 		 * to.
659 		 */
660 		for (i = 0; i < splitPagesCount; i++)
661 		{
662 			RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
663 			bool		zero_penalty;
664 			int			j;
665 
666 			zero_penalty = true;
667 
668 			/* Loop over index attributes. */
669 			for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
670 			{
671 				float		usize;
672 
673 				/* Compute penalty for this column. */
674 				usize = gistpenalty(giststate, j,
675 									&splitPageInfo->entry[j],
676 									splitPageInfo->isnull[j],
677 									&entry[j], isnull[j]);
678 				if (usize > 0)
679 					zero_penalty = false;
680 
681 				if (best_penalty[j] < 0 || usize < best_penalty[j])
682 				{
683 					/*
684 					 * New best penalty for column.  Tentatively select this
685 					 * page as the target, and record the best penalty.  Then
686 					 * reset the next column's penalty to "unknown" (and
687 					 * indirectly, the same for all the ones to its right).
688 					 * This will force us to adopt this page's penalty values
689 					 * as the best for all the remaining columns during
690 					 * subsequent loop iterations.
691 					 */
692 					which = i;
693 					best_penalty[j] = usize;
694 
695 					if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
696 						best_penalty[j + 1] = -1;
697 				}
698 				else if (best_penalty[j] == usize)
699 				{
700 					/*
701 					 * The current page is exactly as good for this column as
702 					 * the best page seen so far.  The next iteration of this
703 					 * loop will compare the next column.
704 					 */
705 				}
706 				else
707 				{
708 					/*
709 					 * The current page is worse for this column than the best
710 					 * page seen so far.  Skip the remaining columns and move
711 					 * on to the next page, if any.
712 					 */
713 					zero_penalty = false;	/* so outer loop won't exit */
714 					break;
715 				}
716 			}
717 
718 			/*
719 			 * If we find a page with zero penalty for all columns, there's no
720 			 * need to examine remaining pages; just break out of the loop and
721 			 * return it.
722 			 */
723 			if (zero_penalty)
724 				break;
725 		}
726 
727 		/* OK, "which" is the page index to push the tuple to */
728 		targetBufferInfo = &relocationBuffersInfos[which];
729 
730 		/* Push item to selected node buffer */
731 		gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
732 
733 		/* Adjust the downlink for this page, if needed. */
734 		newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
735 								 itup, giststate);
736 		if (newtup)
737 		{
738 			gistDeCompressAtt(giststate, r,
739 							  newtup, NULL, (OffsetNumber) 0,
740 							  targetBufferInfo->entry,
741 							  targetBufferInfo->isnull);
742 
743 			targetBufferInfo->splitinfo->downlink = newtup;
744 		}
745 	}
746 
747 	pfree(relocationBuffersInfos);
748 }
749 
750 
751 /*
752  * Wrappers around BufFile operations. The main difference is that these
753  * wrappers report errors with ereport(), so that the callers don't need
754  * to check the return code.
755  */
756 
757 static void
758 ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
759 {
760 	size_t		nread;
761 
762 	if (BufFileSeekBlock(file, blknum) != 0)
763 		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
764 	nread = BufFileRead(file, ptr, BLCKSZ);
765 	if (nread != BLCKSZ)
766 		elog(ERROR, "could not read temporary file: read only %zu of %zu bytes",
767 			 nread, (size_t) BLCKSZ);
768 }
769 
770 static void
771 WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
772 {
773 	if (BufFileSeekBlock(file, blknum) != 0)
774 		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
775 	BufFileWrite(file, ptr, BLCKSZ);
776 }
777