1 /*-------------------------------------------------------------------------
2  *
3  * gistbuildbuffers.c
4  *	  node buffer management functions for GiST buffering build algorithm.
5  *
6  *
7  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *	  src/backend/access/gist/gistbuildbuffers.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/gist_private.h"
19 #include "catalog/index.h"
20 #include "miscadmin.h"
21 #include "storage/buffile.h"
22 #include "storage/bufmgr.h"
23 #include "utils/memutils.h"
24 #include "utils/rel.h"
25 
26 static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
27 static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
28 								GISTNodeBuffer *nodeBuffer);
29 static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
30 							   GISTNodeBuffer *nodeBuffer);
31 static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
32 								 GISTNodeBuffer *nodeBuffer);
33 static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
34 								IndexTuple item);
35 static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
36 								IndexTuple *item);
37 static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
38 static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
39 
40 static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
41 static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
42 
43 
44 /*
45  * Initialize GiST build buffers.
46  */
47 GISTBuildBuffers *
gistInitBuildBuffers(int pagesPerBuffer,int levelStep,int maxLevel)48 gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
49 {
50 	GISTBuildBuffers *gfbb;
51 	HASHCTL		hashCtl;
52 
53 	gfbb = palloc(sizeof(GISTBuildBuffers));
54 	gfbb->pagesPerBuffer = pagesPerBuffer;
55 	gfbb->levelStep = levelStep;
56 
57 	/*
58 	 * Create a temporary file to hold buffer pages that are swapped out of
59 	 * memory.
60 	 */
61 	gfbb->pfile = BufFileCreateTemp(false);
62 	gfbb->nFileBlocks = 0;
63 
64 	/* Initialize free page management. */
65 	gfbb->nFreeBlocks = 0;
66 	gfbb->freeBlocksLen = 32;
67 	gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
68 
69 	/*
70 	 * Current memory context will be used for all in-memory data structures
71 	 * of buffers which are persistent during buffering build.
72 	 */
73 	gfbb->context = CurrentMemoryContext;
74 
75 	/*
76 	 * nodeBuffersTab hash is association between index blocks and it's
77 	 * buffers.
78 	 */
79 	hashCtl.keysize = sizeof(BlockNumber);
80 	hashCtl.entrysize = sizeof(GISTNodeBuffer);
81 	hashCtl.hcxt = CurrentMemoryContext;
82 	gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
83 									   1024,
84 									   &hashCtl,
85 									   HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
86 
87 	gfbb->bufferEmptyingQueue = NIL;
88 
89 	/*
90 	 * Per-level node buffers lists for final buffers emptying process. Node
91 	 * buffers are inserted here when they are created.
92 	 */
93 	gfbb->buffersOnLevelsLen = 1;
94 	gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
95 											 gfbb->buffersOnLevelsLen);
96 	gfbb->buffersOnLevels[0] = NIL;
97 
98 	/*
99 	 * Block numbers of node buffers which last pages are currently loaded
100 	 * into main memory.
101 	 */
102 	gfbb->loadedBuffersLen = 32;
103 	gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
104 													 sizeof(GISTNodeBuffer *));
105 	gfbb->loadedBuffersCount = 0;
106 
107 	gfbb->rootlevel = maxLevel;
108 
109 	return gfbb;
110 }
111 
112 /*
113  * Returns a node buffer for given block. The buffer is created if it
114  * doesn't exist yet.
115  */
116 GISTNodeBuffer *
gistGetNodeBuffer(GISTBuildBuffers * gfbb,GISTSTATE * giststate,BlockNumber nodeBlocknum,int level)117 gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
118 				  BlockNumber nodeBlocknum, int level)
119 {
120 	GISTNodeBuffer *nodeBuffer;
121 	bool		found;
122 
123 	/* Find node buffer in hash table */
124 	nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
125 												(const void *) &nodeBlocknum,
126 												HASH_ENTER,
127 												&found);
128 	if (!found)
129 	{
130 		/*
131 		 * Node buffer wasn't found. Initialize the new buffer as empty.
132 		 */
133 		MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
134 
135 		/* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
136 		nodeBuffer->blocksCount = 0;
137 		nodeBuffer->pageBlocknum = InvalidBlockNumber;
138 		nodeBuffer->pageBuffer = NULL;
139 		nodeBuffer->queuedForEmptying = false;
140 		nodeBuffer->isTemp = false;
141 		nodeBuffer->level = level;
142 
143 		/*
144 		 * Add this buffer to the list of buffers on this level. Enlarge
145 		 * buffersOnLevels array if needed.
146 		 */
147 		if (level >= gfbb->buffersOnLevelsLen)
148 		{
149 			int			i;
150 
151 			gfbb->buffersOnLevels =
152 				(List **) repalloc(gfbb->buffersOnLevels,
153 								   (level + 1) * sizeof(List *));
154 
155 			/* initialize the enlarged portion */
156 			for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
157 				gfbb->buffersOnLevels[i] = NIL;
158 			gfbb->buffersOnLevelsLen = level + 1;
159 		}
160 
161 		/*
162 		 * Prepend the new buffer to the list of buffers on this level. It's
163 		 * not arbitrary that the new buffer is put to the beginning of the
164 		 * list: in the final emptying phase we loop through all buffers at
165 		 * each level, and flush them. If a page is split during the emptying,
166 		 * it's more efficient to flush the new splitted pages first, before
167 		 * moving on to pre-existing pages on the level. The buffers just
168 		 * created during the page split are likely still in cache, so
169 		 * flushing them immediately is more efficient than putting them to
170 		 * the end of the queue.
171 		 */
172 		gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
173 											 gfbb->buffersOnLevels[level]);
174 
175 		MemoryContextSwitchTo(oldcxt);
176 	}
177 
178 	return nodeBuffer;
179 }
180 
181 /*
182  * Allocate memory for a buffer page.
183  */
184 static GISTNodeBufferPage *
gistAllocateNewPageBuffer(GISTBuildBuffers * gfbb)185 gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
186 {
187 	GISTNodeBufferPage *pageBuffer;
188 
189 	pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
190 															   BLCKSZ);
191 	pageBuffer->prev = InvalidBlockNumber;
192 
193 	/* Set page free space */
194 	PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
195 	return pageBuffer;
196 }
197 
198 /*
199  * Add specified buffer into loadedBuffers array.
200  */
201 static void
gistAddLoadedBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)202 gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
203 {
204 	/* Never add a temporary buffer to the array */
205 	if (nodeBuffer->isTemp)
206 		return;
207 
208 	/* Enlarge the array if needed */
209 	if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
210 	{
211 		gfbb->loadedBuffersLen *= 2;
212 		gfbb->loadedBuffers = (GISTNodeBuffer **)
213 			repalloc(gfbb->loadedBuffers,
214 					 gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
215 	}
216 
217 	gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
218 	gfbb->loadedBuffersCount++;
219 }
220 
221 /*
222  * Load last page of node buffer into main memory.
223  */
224 static void
gistLoadNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)225 gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
226 {
227 	/* Check if we really should load something */
228 	if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
229 	{
230 		/* Allocate memory for page */
231 		nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
232 
233 		/* Read block from temporary file */
234 		ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
235 						  nodeBuffer->pageBuffer);
236 
237 		/* Mark file block as free */
238 		gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
239 
240 		/* Mark node buffer as loaded */
241 		gistAddLoadedBuffer(gfbb, nodeBuffer);
242 		nodeBuffer->pageBlocknum = InvalidBlockNumber;
243 	}
244 }
245 
246 /*
247  * Write last page of node buffer to the disk.
248  */
249 static void
gistUnloadNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)250 gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
251 {
252 	/* Check if we have something to write */
253 	if (nodeBuffer->pageBuffer)
254 	{
255 		BlockNumber blkno;
256 
257 		/* Get free file block */
258 		blkno = gistBuffersGetFreeBlock(gfbb);
259 
260 		/* Write block to the temporary file */
261 		WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
262 
263 		/* Free memory of that page */
264 		pfree(nodeBuffer->pageBuffer);
265 		nodeBuffer->pageBuffer = NULL;
266 
267 		/* Save block number */
268 		nodeBuffer->pageBlocknum = blkno;
269 	}
270 }
271 
272 /*
273  * Write last pages of all node buffers to the disk.
274  */
275 void
gistUnloadNodeBuffers(GISTBuildBuffers * gfbb)276 gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
277 {
278 	int			i;
279 
280 	/* Unload all the buffers that have a page loaded in memory. */
281 	for (i = 0; i < gfbb->loadedBuffersCount; i++)
282 		gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
283 
284 	/* Now there are no node buffers with loaded last page */
285 	gfbb->loadedBuffersCount = 0;
286 }
287 
288 /*
289  * Add index tuple to buffer page.
290  */
291 static void
gistPlaceItupToPage(GISTNodeBufferPage * pageBuffer,IndexTuple itup)292 gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
293 {
294 	Size		itupsz = IndexTupleSize(itup);
295 	char	   *ptr;
296 
297 	/* There should be enough of space. */
298 	Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
299 
300 	/* Reduce free space value of page to reserve a spot for the tuple. */
301 	PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
302 
303 	/* Get pointer to the spot we reserved (ie. end of free space). */
304 	ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
305 		+ PAGE_FREE_SPACE(pageBuffer);
306 
307 	/* Copy the index tuple there. */
308 	memcpy(ptr, itup, itupsz);
309 }
310 
311 /*
312  * Get last item from buffer page and remove it from page.
313  */
314 static void
gistGetItupFromPage(GISTNodeBufferPage * pageBuffer,IndexTuple * itup)315 gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
316 {
317 	IndexTuple	ptr;
318 	Size		itupsz;
319 
320 	Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
321 
322 	/* Get pointer to last index tuple */
323 	ptr = (IndexTuple) ((char *) pageBuffer
324 						+ BUFFER_PAGE_DATA_OFFSET
325 						+ PAGE_FREE_SPACE(pageBuffer));
326 	itupsz = IndexTupleSize(ptr);
327 
328 	/* Make a copy of the tuple */
329 	*itup = (IndexTuple) palloc(itupsz);
330 	memcpy(*itup, ptr, itupsz);
331 
332 	/* Mark the space used by the tuple as free */
333 	PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
334 }
335 
336 /*
337  * Push an index tuple to node buffer.
338  */
339 void
gistPushItupToNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer,IndexTuple itup)340 gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
341 						 IndexTuple itup)
342 {
343 	/*
344 	 * Most part of memory operations will be in buffering build persistent
345 	 * context. So, let's switch to it.
346 	 */
347 	MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
348 
349 	/*
350 	 * If the buffer is currently empty, create the first page.
351 	 */
352 	if (nodeBuffer->blocksCount == 0)
353 	{
354 		nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
355 		nodeBuffer->blocksCount = 1;
356 		gistAddLoadedBuffer(gfbb, nodeBuffer);
357 	}
358 
359 	/* Load last page of node buffer if it wasn't in memory already */
360 	if (!nodeBuffer->pageBuffer)
361 		gistLoadNodeBuffer(gfbb, nodeBuffer);
362 
363 	/*
364 	 * Check if there is enough space on the last page for the tuple.
365 	 */
366 	if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
367 	{
368 		/*
369 		 * Nope. Swap previous block to disk and allocate a new one.
370 		 */
371 		BlockNumber blkno;
372 
373 		/* Write filled page to the disk */
374 		blkno = gistBuffersGetFreeBlock(gfbb);
375 		WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
376 
377 		/*
378 		 * Reset the in-memory page as empty, and link the previous block to
379 		 * the new page by storing its block number in the prev-link.
380 		 */
381 		PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
382 			BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
383 		nodeBuffer->pageBuffer->prev = blkno;
384 
385 		/* We've just added one more page */
386 		nodeBuffer->blocksCount++;
387 	}
388 
389 	gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
390 
391 	/*
392 	 * If the buffer just overflowed, add it to the emptying queue.
393 	 */
394 	if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
395 	{
396 		gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
397 										  gfbb->bufferEmptyingQueue);
398 		nodeBuffer->queuedForEmptying = true;
399 	}
400 
401 	/* Restore memory context */
402 	MemoryContextSwitchTo(oldcxt);
403 }
404 
405 /*
406  * Removes one index tuple from node buffer. Returns true if success and false
407  * if node buffer is empty.
408  */
409 bool
gistPopItupFromNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer,IndexTuple * itup)410 gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
411 						  IndexTuple *itup)
412 {
413 	/*
414 	 * If node buffer is empty then return false.
415 	 */
416 	if (nodeBuffer->blocksCount <= 0)
417 		return false;
418 
419 	/* Load last page of node buffer if needed */
420 	if (!nodeBuffer->pageBuffer)
421 		gistLoadNodeBuffer(gfbb, nodeBuffer);
422 
423 	/*
424 	 * Get index tuple from last non-empty page.
425 	 */
426 	gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
427 
428 	/*
429 	 * If we just removed the last tuple from the page, fetch previous page on
430 	 * this node buffer (if any).
431 	 */
432 	if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
433 	{
434 		BlockNumber prevblkno;
435 
436 		/*
437 		 * blocksCount includes the page in pageBuffer, so decrease it now.
438 		 */
439 		nodeBuffer->blocksCount--;
440 
441 		/*
442 		 * If there's more pages, fetch previous one.
443 		 */
444 		prevblkno = nodeBuffer->pageBuffer->prev;
445 		if (prevblkno != InvalidBlockNumber)
446 		{
447 			/* There is a previous page. Fetch it. */
448 			Assert(nodeBuffer->blocksCount > 0);
449 			ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
450 
451 			/*
452 			 * Now that we've read the block in memory, we can release its
453 			 * on-disk block for reuse.
454 			 */
455 			gistBuffersReleaseBlock(gfbb, prevblkno);
456 		}
457 		else
458 		{
459 			/* No more pages. Free memory. */
460 			Assert(nodeBuffer->blocksCount == 0);
461 			pfree(nodeBuffer->pageBuffer);
462 			nodeBuffer->pageBuffer = NULL;
463 		}
464 	}
465 	return true;
466 }
467 
468 /*
469  * Select a currently unused block for writing to.
470  */
471 static long
gistBuffersGetFreeBlock(GISTBuildBuffers * gfbb)472 gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
473 {
474 	/*
475 	 * If there are multiple free blocks, we select the one appearing last in
476 	 * freeBlocks[].  If there are none, assign the next block at the end of
477 	 * the file (causing the file to be extended).
478 	 */
479 	if (gfbb->nFreeBlocks > 0)
480 		return gfbb->freeBlocks[--gfbb->nFreeBlocks];
481 	else
482 		return gfbb->nFileBlocks++;
483 }
484 
485 /*
486  * Return a block# to the freelist.
487  */
488 static void
gistBuffersReleaseBlock(GISTBuildBuffers * gfbb,long blocknum)489 gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
490 {
491 	int			ndx;
492 
493 	/* Enlarge freeBlocks array if full. */
494 	if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
495 	{
496 		gfbb->freeBlocksLen *= 2;
497 		gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
498 											 gfbb->freeBlocksLen *
499 											 sizeof(long));
500 	}
501 
502 	/* Add blocknum to array */
503 	ndx = gfbb->nFreeBlocks++;
504 	gfbb->freeBlocks[ndx] = blocknum;
505 }
506 
507 /*
508  * Free buffering build data structure.
509  */
510 void
gistFreeBuildBuffers(GISTBuildBuffers * gfbb)511 gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
512 {
513 	/* Close buffers file. */
514 	BufFileClose(gfbb->pfile);
515 
516 	/* All other things will be freed on memory context release */
517 }
518 
/*
 * Data structure representing information about a node buffer that index
 * tuples are relocated to when the node buffer of a split page is
 * redistributed.  gistRelocateBuildBuffersOnSplit() keeps one of these per
 * page produced by the split.
 */
typedef struct
{
	/* decompressed attributes of splitinfo->downlink (see gistDeCompressAtt) */
	GISTENTRY	entry[INDEX_MAX_KEYS];
	/* per-attribute null flags belonging to entry[] */
	bool		isnull[INDEX_MAX_KEYS];
	/* split information of the page, including its downlink tuple */
	GISTPageSplitInfo *splitinfo;
	/* node buffer of the page, i.e. the target for relocated tuples */
	GISTNodeBuffer *nodeBuffer;
} RelocationBufferInfo;
530 
531 /*
532  * At page split, distribute tuples from the buffer of the split page to
533  * new buffers for the created page halves. This also adjusts the downlinks
534  * in 'splitinfo' to include the tuples in the buffers.
535  */
536 void
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb,GISTSTATE * giststate,Relation r,int level,Buffer buffer,List * splitinfo)537 gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
538 								Relation r, int level,
539 								Buffer buffer, List *splitinfo)
540 {
541 	RelocationBufferInfo *relocationBuffersInfos;
542 	bool		found;
543 	GISTNodeBuffer *nodeBuffer;
544 	BlockNumber blocknum;
545 	IndexTuple	itup;
546 	int			splitPagesCount = 0,
547 				i;
548 	GISTENTRY	entry[INDEX_MAX_KEYS];
549 	bool		isnull[INDEX_MAX_KEYS];
550 	GISTNodeBuffer oldBuf;
551 	ListCell   *lc;
552 
553 	/* If the splitted page doesn't have buffers, we have nothing to do. */
554 	if (!LEVEL_HAS_BUFFERS(level, gfbb))
555 		return;
556 
557 	/*
558 	 * Get the node buffer of the splitted page.
559 	 */
560 	blocknum = BufferGetBlockNumber(buffer);
561 	nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
562 							 HASH_FIND, &found);
563 	if (!found)
564 	{
565 		/* The page has no buffer, so we have nothing to do. */
566 		return;
567 	}
568 
569 	/*
570 	 * Make a copy of the old buffer, as we're going reuse it as the buffer
571 	 * for the new left page, which is on the same block as the old page.
572 	 * That's not true for the root page, but that's fine because we never
573 	 * have a buffer on the root page anyway. The original algorithm as
574 	 * described by Arge et al did, but it's of no use, as you might as well
575 	 * read the tuples straight from the heap instead of the root buffer.
576 	 */
577 	Assert(blocknum != GIST_ROOT_BLKNO);
578 	memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
579 	oldBuf.isTemp = true;
580 
581 	/* Reset the old buffer, used for the new left page from now on */
582 	nodeBuffer->blocksCount = 0;
583 	nodeBuffer->pageBuffer = NULL;
584 	nodeBuffer->pageBlocknum = InvalidBlockNumber;
585 
586 	/*
587 	 * Allocate memory for information about relocation buffers.
588 	 */
589 	splitPagesCount = list_length(splitinfo);
590 	relocationBuffersInfos =
591 		(RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
592 										splitPagesCount);
593 
594 	/*
595 	 * Fill relocation buffers information for node buffers of pages produced
596 	 * by split.
597 	 */
598 	i = 0;
599 	foreach(lc, splitinfo)
600 	{
601 		GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
602 		GISTNodeBuffer *newNodeBuffer;
603 
604 		/* Decompress parent index tuple of node buffer page. */
605 		gistDeCompressAtt(giststate, r,
606 						  si->downlink, NULL, (OffsetNumber) 0,
607 						  relocationBuffersInfos[i].entry,
608 						  relocationBuffersInfos[i].isnull);
609 
610 		/*
611 		 * Create a node buffer for the page. The leftmost half is on the same
612 		 * block as the old page before split, so for the leftmost half this
613 		 * will return the original buffer. The tuples on the original buffer
614 		 * were relinked to the temporary buffer, so the original one is now
615 		 * empty.
616 		 */
617 		newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
618 
619 		relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
620 		relocationBuffersInfos[i].splitinfo = si;
621 
622 		i++;
623 	}
624 
625 	/*
626 	 * Loop through all index tuples in the buffer of the page being split,
627 	 * moving them to buffers for the new pages.  We try to move each tuple to
628 	 * the page that will result in the lowest penalty for the leading column
629 	 * or, in the case of a tie, the lowest penalty for the earliest column
630 	 * that is not tied.
631 	 *
632 	 * The page searching logic is very similar to gistchoose().
633 	 */
634 	while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
635 	{
636 		float		best_penalty[INDEX_MAX_KEYS];
637 		int			i,
638 					which;
639 		IndexTuple	newtup;
640 		RelocationBufferInfo *targetBufferInfo;
641 
642 		gistDeCompressAtt(giststate, r,
643 						  itup, NULL, (OffsetNumber) 0, entry, isnull);
644 
645 		/* default to using first page (shouldn't matter) */
646 		which = 0;
647 
648 		/*
649 		 * best_penalty[j] is the best penalty we have seen so far for column
650 		 * j, or -1 when we haven't yet examined column j.  Array entries to
651 		 * the right of the first -1 are undefined.
652 		 */
653 		best_penalty[0] = -1;
654 
655 		/*
656 		 * Loop over possible target pages, looking for one to move this tuple
657 		 * to.
658 		 */
659 		for (i = 0; i < splitPagesCount; i++)
660 		{
661 			RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
662 			bool		zero_penalty;
663 			int			j;
664 
665 			zero_penalty = true;
666 
667 			/* Loop over index attributes. */
668 			for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
669 			{
670 				float		usize;
671 
672 				/* Compute penalty for this column. */
673 				usize = gistpenalty(giststate, j,
674 									&splitPageInfo->entry[j],
675 									splitPageInfo->isnull[j],
676 									&entry[j], isnull[j]);
677 				if (usize > 0)
678 					zero_penalty = false;
679 
680 				if (best_penalty[j] < 0 || usize < best_penalty[j])
681 				{
682 					/*
683 					 * New best penalty for column.  Tentatively select this
684 					 * page as the target, and record the best penalty.  Then
685 					 * reset the next column's penalty to "unknown" (and
686 					 * indirectly, the same for all the ones to its right).
687 					 * This will force us to adopt this page's penalty values
688 					 * as the best for all the remaining columns during
689 					 * subsequent loop iterations.
690 					 */
691 					which = i;
692 					best_penalty[j] = usize;
693 
694 					if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
695 						best_penalty[j + 1] = -1;
696 				}
697 				else if (best_penalty[j] == usize)
698 				{
699 					/*
700 					 * The current page is exactly as good for this column as
701 					 * the best page seen so far.  The next iteration of this
702 					 * loop will compare the next column.
703 					 */
704 				}
705 				else
706 				{
707 					/*
708 					 * The current page is worse for this column than the best
709 					 * page seen so far.  Skip the remaining columns and move
710 					 * on to the next page, if any.
711 					 */
712 					zero_penalty = false;	/* so outer loop won't exit */
713 					break;
714 				}
715 			}
716 
717 			/*
718 			 * If we find a page with zero penalty for all columns, there's no
719 			 * need to examine remaining pages; just break out of the loop and
720 			 * return it.
721 			 */
722 			if (zero_penalty)
723 				break;
724 		}
725 
726 		/* OK, "which" is the page index to push the tuple to */
727 		targetBufferInfo = &relocationBuffersInfos[which];
728 
729 		/* Push item to selected node buffer */
730 		gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
731 
732 		/* Adjust the downlink for this page, if needed. */
733 		newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
734 								 itup, giststate);
735 		if (newtup)
736 		{
737 			gistDeCompressAtt(giststate, r,
738 							  newtup, NULL, (OffsetNumber) 0,
739 							  targetBufferInfo->entry,
740 							  targetBufferInfo->isnull);
741 
742 			targetBufferInfo->splitinfo->downlink = newtup;
743 		}
744 	}
745 
746 	pfree(relocationBuffersInfos);
747 }
748 
749 
/*
 * Wrappers around BufFile operations. The main difference is that these
 * wrappers report seek failures and short reads themselves with
 * elog(ERROR), so that the callers don't need to check a return code.
 */
755 
756 static void
ReadTempFileBlock(BufFile * file,long blknum,void * ptr)757 ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
758 {
759 	size_t		nread;
760 
761 	if (BufFileSeekBlock(file, blknum) != 0)
762 		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
763 	nread = BufFileRead(file, ptr, BLCKSZ);
764 	if (nread != BLCKSZ)
765 		elog(ERROR, "could not read temporary file: read only %zu of %zu bytes",
766 			 nread, (size_t) BLCKSZ);
767 }
768 
769 static void
WriteTempFileBlock(BufFile * file,long blknum,void * ptr)770 WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
771 {
772 	if (BufFileSeekBlock(file, blknum) != 0)
773 		elog(ERROR, "could not seek to block %ld in temporary file", blknum);
774 	BufFileWrite(file, ptr, BLCKSZ);
775 }
776