1 /*-------------------------------------------------------------------------
2 *
3 * gistbuildbuffers.c
4 * node buffer management functions for GiST buffering build algorithm.
5 *
6 *
7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gist/gistbuildbuffers.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include "access/genam.h"
18 #include "access/gist_private.h"
19 #include "catalog/index.h"
20 #include "miscadmin.h"
21 #include "storage/buffile.h"
22 #include "storage/bufmgr.h"
23 #include "utils/memutils.h"
24 #include "utils/rel.h"
25
26 static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
27 static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
28 GISTNodeBuffer *nodeBuffer);
29 static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
30 GISTNodeBuffer *nodeBuffer);
31 static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
32 GISTNodeBuffer *nodeBuffer);
33 static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
34 IndexTuple item);
35 static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
36 IndexTuple *item);
37 static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
38 static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
39
40 static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
41 static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
42
43
44 /*
45 * Initialize GiST build buffers.
46 */
47 GISTBuildBuffers *
gistInitBuildBuffers(int pagesPerBuffer,int levelStep,int maxLevel)48 gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
49 {
50 GISTBuildBuffers *gfbb;
51 HASHCTL hashCtl;
52
53 gfbb = palloc(sizeof(GISTBuildBuffers));
54 gfbb->pagesPerBuffer = pagesPerBuffer;
55 gfbb->levelStep = levelStep;
56
57 /*
58 * Create a temporary file to hold buffer pages that are swapped out of
59 * memory.
60 */
61 gfbb->pfile = BufFileCreateTemp(false);
62 gfbb->nFileBlocks = 0;
63
64 /* Initialize free page management. */
65 gfbb->nFreeBlocks = 0;
66 gfbb->freeBlocksLen = 32;
67 gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
68
69 /*
70 * Current memory context will be used for all in-memory data structures
71 * of buffers which are persistent during buffering build.
72 */
73 gfbb->context = CurrentMemoryContext;
74
75 /*
76 * nodeBuffersTab hash is association between index blocks and it's
77 * buffers.
78 */
79 hashCtl.keysize = sizeof(BlockNumber);
80 hashCtl.entrysize = sizeof(GISTNodeBuffer);
81 hashCtl.hcxt = CurrentMemoryContext;
82 gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
83 1024,
84 &hashCtl,
85 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
86
87 gfbb->bufferEmptyingQueue = NIL;
88
89 /*
90 * Per-level node buffers lists for final buffers emptying process. Node
91 * buffers are inserted here when they are created.
92 */
93 gfbb->buffersOnLevelsLen = 1;
94 gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
95 gfbb->buffersOnLevelsLen);
96 gfbb->buffersOnLevels[0] = NIL;
97
98 /*
99 * Block numbers of node buffers which last pages are currently loaded
100 * into main memory.
101 */
102 gfbb->loadedBuffersLen = 32;
103 gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
104 sizeof(GISTNodeBuffer *));
105 gfbb->loadedBuffersCount = 0;
106
107 gfbb->rootlevel = maxLevel;
108
109 return gfbb;
110 }
111
112 /*
113 * Returns a node buffer for given block. The buffer is created if it
114 * doesn't exist yet.
115 */
116 GISTNodeBuffer *
gistGetNodeBuffer(GISTBuildBuffers * gfbb,GISTSTATE * giststate,BlockNumber nodeBlocknum,int level)117 gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
118 BlockNumber nodeBlocknum, int level)
119 {
120 GISTNodeBuffer *nodeBuffer;
121 bool found;
122
123 /* Find node buffer in hash table */
124 nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
125 (const void *) &nodeBlocknum,
126 HASH_ENTER,
127 &found);
128 if (!found)
129 {
130 /*
131 * Node buffer wasn't found. Initialize the new buffer as empty.
132 */
133 MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
134
135 /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */
136 nodeBuffer->blocksCount = 0;
137 nodeBuffer->pageBlocknum = InvalidBlockNumber;
138 nodeBuffer->pageBuffer = NULL;
139 nodeBuffer->queuedForEmptying = false;
140 nodeBuffer->isTemp = false;
141 nodeBuffer->level = level;
142
143 /*
144 * Add this buffer to the list of buffers on this level. Enlarge
145 * buffersOnLevels array if needed.
146 */
147 if (level >= gfbb->buffersOnLevelsLen)
148 {
149 int i;
150
151 gfbb->buffersOnLevels =
152 (List **) repalloc(gfbb->buffersOnLevels,
153 (level + 1) * sizeof(List *));
154
155 /* initialize the enlarged portion */
156 for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
157 gfbb->buffersOnLevels[i] = NIL;
158 gfbb->buffersOnLevelsLen = level + 1;
159 }
160
161 /*
162 * Prepend the new buffer to the list of buffers on this level. It's
163 * not arbitrary that the new buffer is put to the beginning of the
164 * list: in the final emptying phase we loop through all buffers at
165 * each level, and flush them. If a page is split during the emptying,
166 * it's more efficient to flush the new splitted pages first, before
167 * moving on to pre-existing pages on the level. The buffers just
168 * created during the page split are likely still in cache, so
169 * flushing them immediately is more efficient than putting them to
170 * the end of the queue.
171 */
172 gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
173 gfbb->buffersOnLevels[level]);
174
175 MemoryContextSwitchTo(oldcxt);
176 }
177
178 return nodeBuffer;
179 }
180
181 /*
182 * Allocate memory for a buffer page.
183 */
184 static GISTNodeBufferPage *
gistAllocateNewPageBuffer(GISTBuildBuffers * gfbb)185 gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
186 {
187 GISTNodeBufferPage *pageBuffer;
188
189 pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context,
190 BLCKSZ);
191 pageBuffer->prev = InvalidBlockNumber;
192
193 /* Set page free space */
194 PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
195 return pageBuffer;
196 }
197
198 /*
199 * Add specified buffer into loadedBuffers array.
200 */
201 static void
gistAddLoadedBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)202 gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
203 {
204 /* Never add a temporary buffer to the array */
205 if (nodeBuffer->isTemp)
206 return;
207
208 /* Enlarge the array if needed */
209 if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
210 {
211 gfbb->loadedBuffersLen *= 2;
212 gfbb->loadedBuffers = (GISTNodeBuffer **)
213 repalloc(gfbb->loadedBuffers,
214 gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
215 }
216
217 gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer;
218 gfbb->loadedBuffersCount++;
219 }
220
221 /*
222 * Load last page of node buffer into main memory.
223 */
224 static void
gistLoadNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)225 gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
226 {
227 /* Check if we really should load something */
228 if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0)
229 {
230 /* Allocate memory for page */
231 nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
232
233 /* Read block from temporary file */
234 ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
235 nodeBuffer->pageBuffer);
236
237 /* Mark file block as free */
238 gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
239
240 /* Mark node buffer as loaded */
241 gistAddLoadedBuffer(gfbb, nodeBuffer);
242 nodeBuffer->pageBlocknum = InvalidBlockNumber;
243 }
244 }
245
246 /*
247 * Write last page of node buffer to the disk.
248 */
249 static void
gistUnloadNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer)250 gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
251 {
252 /* Check if we have something to write */
253 if (nodeBuffer->pageBuffer)
254 {
255 BlockNumber blkno;
256
257 /* Get free file block */
258 blkno = gistBuffersGetFreeBlock(gfbb);
259
260 /* Write block to the temporary file */
261 WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
262
263 /* Free memory of that page */
264 pfree(nodeBuffer->pageBuffer);
265 nodeBuffer->pageBuffer = NULL;
266
267 /* Save block number */
268 nodeBuffer->pageBlocknum = blkno;
269 }
270 }
271
272 /*
273 * Write last pages of all node buffers to the disk.
274 */
275 void
gistUnloadNodeBuffers(GISTBuildBuffers * gfbb)276 gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
277 {
278 int i;
279
280 /* Unload all the buffers that have a page loaded in memory. */
281 for (i = 0; i < gfbb->loadedBuffersCount; i++)
282 gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
283
284 /* Now there are no node buffers with loaded last page */
285 gfbb->loadedBuffersCount = 0;
286 }
287
288 /*
289 * Add index tuple to buffer page.
290 */
291 static void
gistPlaceItupToPage(GISTNodeBufferPage * pageBuffer,IndexTuple itup)292 gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
293 {
294 Size itupsz = IndexTupleSize(itup);
295 char *ptr;
296
297 /* There should be enough of space. */
298 Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
299
300 /* Reduce free space value of page to reserve a spot for the tuple. */
301 PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
302
303 /* Get pointer to the spot we reserved (ie. end of free space). */
304 ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
305 + PAGE_FREE_SPACE(pageBuffer);
306
307 /* Copy the index tuple there. */
308 memcpy(ptr, itup, itupsz);
309 }
310
311 /*
312 * Get last item from buffer page and remove it from page.
313 */
314 static void
gistGetItupFromPage(GISTNodeBufferPage * pageBuffer,IndexTuple * itup)315 gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
316 {
317 IndexTuple ptr;
318 Size itupsz;
319
320 Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
321
322 /* Get pointer to last index tuple */
323 ptr = (IndexTuple) ((char *) pageBuffer
324 + BUFFER_PAGE_DATA_OFFSET
325 + PAGE_FREE_SPACE(pageBuffer));
326 itupsz = IndexTupleSize(ptr);
327
328 /* Make a copy of the tuple */
329 *itup = (IndexTuple) palloc(itupsz);
330 memcpy(*itup, ptr, itupsz);
331
332 /* Mark the space used by the tuple as free */
333 PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
334 }
335
336 /*
337 * Push an index tuple to node buffer.
338 */
339 void
gistPushItupToNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer,IndexTuple itup)340 gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
341 IndexTuple itup)
342 {
343 /*
344 * Most part of memory operations will be in buffering build persistent
345 * context. So, let's switch to it.
346 */
347 MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
348
349 /*
350 * If the buffer is currently empty, create the first page.
351 */
352 if (nodeBuffer->blocksCount == 0)
353 {
354 nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
355 nodeBuffer->blocksCount = 1;
356 gistAddLoadedBuffer(gfbb, nodeBuffer);
357 }
358
359 /* Load last page of node buffer if it wasn't in memory already */
360 if (!nodeBuffer->pageBuffer)
361 gistLoadNodeBuffer(gfbb, nodeBuffer);
362
363 /*
364 * Check if there is enough space on the last page for the tuple.
365 */
366 if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
367 {
368 /*
369 * Nope. Swap previous block to disk and allocate a new one.
370 */
371 BlockNumber blkno;
372
373 /* Write filled page to the disk */
374 blkno = gistBuffersGetFreeBlock(gfbb);
375 WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
376
377 /*
378 * Reset the in-memory page as empty, and link the previous block to
379 * the new page by storing its block number in the prev-link.
380 */
381 PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
382 BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata));
383 nodeBuffer->pageBuffer->prev = blkno;
384
385 /* We've just added one more page */
386 nodeBuffer->blocksCount++;
387 }
388
389 gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
390
391 /*
392 * If the buffer just overflowed, add it to the emptying queue.
393 */
394 if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
395 {
396 gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
397 gfbb->bufferEmptyingQueue);
398 nodeBuffer->queuedForEmptying = true;
399 }
400
401 /* Restore memory context */
402 MemoryContextSwitchTo(oldcxt);
403 }
404
405 /*
406 * Removes one index tuple from node buffer. Returns true if success and false
407 * if node buffer is empty.
408 */
409 bool
gistPopItupFromNodeBuffer(GISTBuildBuffers * gfbb,GISTNodeBuffer * nodeBuffer,IndexTuple * itup)410 gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
411 IndexTuple *itup)
412 {
413 /*
414 * If node buffer is empty then return false.
415 */
416 if (nodeBuffer->blocksCount <= 0)
417 return false;
418
419 /* Load last page of node buffer if needed */
420 if (!nodeBuffer->pageBuffer)
421 gistLoadNodeBuffer(gfbb, nodeBuffer);
422
423 /*
424 * Get index tuple from last non-empty page.
425 */
426 gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
427
428 /*
429 * If we just removed the last tuple from the page, fetch previous page on
430 * this node buffer (if any).
431 */
432 if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
433 {
434 BlockNumber prevblkno;
435
436 /*
437 * blocksCount includes the page in pageBuffer, so decrease it now.
438 */
439 nodeBuffer->blocksCount--;
440
441 /*
442 * If there's more pages, fetch previous one.
443 */
444 prevblkno = nodeBuffer->pageBuffer->prev;
445 if (prevblkno != InvalidBlockNumber)
446 {
447 /* There is a previous page. Fetch it. */
448 Assert(nodeBuffer->blocksCount > 0);
449 ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
450
451 /*
452 * Now that we've read the block in memory, we can release its
453 * on-disk block for reuse.
454 */
455 gistBuffersReleaseBlock(gfbb, prevblkno);
456 }
457 else
458 {
459 /* No more pages. Free memory. */
460 Assert(nodeBuffer->blocksCount == 0);
461 pfree(nodeBuffer->pageBuffer);
462 nodeBuffer->pageBuffer = NULL;
463 }
464 }
465 return true;
466 }
467
468 /*
469 * Select a currently unused block for writing to.
470 */
471 static long
gistBuffersGetFreeBlock(GISTBuildBuffers * gfbb)472 gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
473 {
474 /*
475 * If there are multiple free blocks, we select the one appearing last in
476 * freeBlocks[]. If there are none, assign the next block at the end of
477 * the file (causing the file to be extended).
478 */
479 if (gfbb->nFreeBlocks > 0)
480 return gfbb->freeBlocks[--gfbb->nFreeBlocks];
481 else
482 return gfbb->nFileBlocks++;
483 }
484
485 /*
486 * Return a block# to the freelist.
487 */
488 static void
gistBuffersReleaseBlock(GISTBuildBuffers * gfbb,long blocknum)489 gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
490 {
491 int ndx;
492
493 /* Enlarge freeBlocks array if full. */
494 if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
495 {
496 gfbb->freeBlocksLen *= 2;
497 gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
498 gfbb->freeBlocksLen *
499 sizeof(long));
500 }
501
502 /* Add blocknum to array */
503 ndx = gfbb->nFreeBlocks++;
504 gfbb->freeBlocks[ndx] = blocknum;
505 }
506
507 /*
508 * Free buffering build data structure.
509 */
510 void
gistFreeBuildBuffers(GISTBuildBuffers * gfbb)511 gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
512 {
513 /* Close buffers file. */
514 BufFileClose(gfbb->pfile);
515
516 /* All other things will be freed on memory context release */
517 }
518
/*
 * Data structure representing information about a node buffer that receives
 * index tuples relocated from the node buffer of a split page.  One such
 * entry is kept per page produced by the split.
 */
typedef struct
{
    /* decompressed attributes of the page's downlink tuple */
    GISTENTRY entry[INDEX_MAX_KEYS];
    /* per-attribute null flags belonging to entry[] */
    bool isnull[INDEX_MAX_KEYS];
    /* split information of the page; its downlink may get replaced */
    GISTPageSplitInfo *splitinfo;
    /* node buffer of the page, target for relocated tuples */
    GISTNodeBuffer *nodeBuffer;
} RelocationBufferInfo;
530
531 /*
532 * At page split, distribute tuples from the buffer of the split page to
533 * new buffers for the created page halves. This also adjusts the downlinks
534 * in 'splitinfo' to include the tuples in the buffers.
535 */
536 void
gistRelocateBuildBuffersOnSplit(GISTBuildBuffers * gfbb,GISTSTATE * giststate,Relation r,int level,Buffer buffer,List * splitinfo)537 gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
538 Relation r, int level,
539 Buffer buffer, List *splitinfo)
540 {
541 RelocationBufferInfo *relocationBuffersInfos;
542 bool found;
543 GISTNodeBuffer *nodeBuffer;
544 BlockNumber blocknum;
545 IndexTuple itup;
546 int splitPagesCount = 0,
547 i;
548 GISTENTRY entry[INDEX_MAX_KEYS];
549 bool isnull[INDEX_MAX_KEYS];
550 GISTNodeBuffer oldBuf;
551 ListCell *lc;
552
553 /* If the splitted page doesn't have buffers, we have nothing to do. */
554 if (!LEVEL_HAS_BUFFERS(level, gfbb))
555 return;
556
557 /*
558 * Get the node buffer of the splitted page.
559 */
560 blocknum = BufferGetBlockNumber(buffer);
561 nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
562 HASH_FIND, &found);
563 if (!found)
564 {
565 /* The page has no buffer, so we have nothing to do. */
566 return;
567 }
568
569 /*
570 * Make a copy of the old buffer, as we're going reuse it as the buffer
571 * for the new left page, which is on the same block as the old page.
572 * That's not true for the root page, but that's fine because we never
573 * have a buffer on the root page anyway. The original algorithm as
574 * described by Arge et al did, but it's of no use, as you might as well
575 * read the tuples straight from the heap instead of the root buffer.
576 */
577 Assert(blocknum != GIST_ROOT_BLKNO);
578 memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer));
579 oldBuf.isTemp = true;
580
581 /* Reset the old buffer, used for the new left page from now on */
582 nodeBuffer->blocksCount = 0;
583 nodeBuffer->pageBuffer = NULL;
584 nodeBuffer->pageBlocknum = InvalidBlockNumber;
585
586 /*
587 * Allocate memory for information about relocation buffers.
588 */
589 splitPagesCount = list_length(splitinfo);
590 relocationBuffersInfos =
591 (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
592 splitPagesCount);
593
594 /*
595 * Fill relocation buffers information for node buffers of pages produced
596 * by split.
597 */
598 i = 0;
599 foreach(lc, splitinfo)
600 {
601 GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
602 GISTNodeBuffer *newNodeBuffer;
603
604 /* Decompress parent index tuple of node buffer page. */
605 gistDeCompressAtt(giststate, r,
606 si->downlink, NULL, (OffsetNumber) 0,
607 relocationBuffersInfos[i].entry,
608 relocationBuffersInfos[i].isnull);
609
610 /*
611 * Create a node buffer for the page. The leftmost half is on the same
612 * block as the old page before split, so for the leftmost half this
613 * will return the original buffer. The tuples on the original buffer
614 * were relinked to the temporary buffer, so the original one is now
615 * empty.
616 */
617 newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level);
618
619 relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
620 relocationBuffersInfos[i].splitinfo = si;
621
622 i++;
623 }
624
625 /*
626 * Loop through all index tuples in the buffer of the page being split,
627 * moving them to buffers for the new pages. We try to move each tuple to
628 * the page that will result in the lowest penalty for the leading column
629 * or, in the case of a tie, the lowest penalty for the earliest column
630 * that is not tied.
631 *
632 * The page searching logic is very similar to gistchoose().
633 */
634 while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup))
635 {
636 float best_penalty[INDEX_MAX_KEYS];
637 int i,
638 which;
639 IndexTuple newtup;
640 RelocationBufferInfo *targetBufferInfo;
641
642 gistDeCompressAtt(giststate, r,
643 itup, NULL, (OffsetNumber) 0, entry, isnull);
644
645 /* default to using first page (shouldn't matter) */
646 which = 0;
647
648 /*
649 * best_penalty[j] is the best penalty we have seen so far for column
650 * j, or -1 when we haven't yet examined column j. Array entries to
651 * the right of the first -1 are undefined.
652 */
653 best_penalty[0] = -1;
654
655 /*
656 * Loop over possible target pages, looking for one to move this tuple
657 * to.
658 */
659 for (i = 0; i < splitPagesCount; i++)
660 {
661 RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
662 bool zero_penalty;
663 int j;
664
665 zero_penalty = true;
666
667 /* Loop over index attributes. */
668 for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++)
669 {
670 float usize;
671
672 /* Compute penalty for this column. */
673 usize = gistpenalty(giststate, j,
674 &splitPageInfo->entry[j],
675 splitPageInfo->isnull[j],
676 &entry[j], isnull[j]);
677 if (usize > 0)
678 zero_penalty = false;
679
680 if (best_penalty[j] < 0 || usize < best_penalty[j])
681 {
682 /*
683 * New best penalty for column. Tentatively select this
684 * page as the target, and record the best penalty. Then
685 * reset the next column's penalty to "unknown" (and
686 * indirectly, the same for all the ones to its right).
687 * This will force us to adopt this page's penalty values
688 * as the best for all the remaining columns during
689 * subsequent loop iterations.
690 */
691 which = i;
692 best_penalty[j] = usize;
693
694 if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1)
695 best_penalty[j + 1] = -1;
696 }
697 else if (best_penalty[j] == usize)
698 {
699 /*
700 * The current page is exactly as good for this column as
701 * the best page seen so far. The next iteration of this
702 * loop will compare the next column.
703 */
704 }
705 else
706 {
707 /*
708 * The current page is worse for this column than the best
709 * page seen so far. Skip the remaining columns and move
710 * on to the next page, if any.
711 */
712 zero_penalty = false; /* so outer loop won't exit */
713 break;
714 }
715 }
716
717 /*
718 * If we find a page with zero penalty for all columns, there's no
719 * need to examine remaining pages; just break out of the loop and
720 * return it.
721 */
722 if (zero_penalty)
723 break;
724 }
725
726 /* OK, "which" is the page index to push the tuple to */
727 targetBufferInfo = &relocationBuffersInfos[which];
728
729 /* Push item to selected node buffer */
730 gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
731
732 /* Adjust the downlink for this page, if needed. */
733 newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
734 itup, giststate);
735 if (newtup)
736 {
737 gistDeCompressAtt(giststate, r,
738 newtup, NULL, (OffsetNumber) 0,
739 targetBufferInfo->entry,
740 targetBufferInfo->isnull);
741
742 targetBufferInfo->splitinfo->downlink = newtup;
743 }
744 }
745
746 pfree(relocationBuffersInfos);
747 }
748
749
/*
 * Wrappers around BufFile operations.  The main difference is that these
 * wrappers raise an error with elog(ERROR) themselves when a seek or a read
 * fails, so that the callers don't need to check a return code.
 */
755
756 static void
ReadTempFileBlock(BufFile * file,long blknum,void * ptr)757 ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
758 {
759 size_t nread;
760
761 if (BufFileSeekBlock(file, blknum) != 0)
762 elog(ERROR, "could not seek to block %ld in temporary file", blknum);
763 nread = BufFileRead(file, ptr, BLCKSZ);
764 if (nread != BLCKSZ)
765 elog(ERROR, "could not read temporary file: read only %zu of %zu bytes",
766 nread, (size_t) BLCKSZ);
767 }
768
769 static void
WriteTempFileBlock(BufFile * file,long blknum,void * ptr)770 WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
771 {
772 if (BufFileSeekBlock(file, blknum) != 0)
773 elog(ERROR, "could not seek to block %ld in temporary file", blknum);
774 BufFileWrite(file, ptr, BLCKSZ);
775 }
776