1 /*-------------------------------------------------------------------------
2 *
3 * gistbuild.c
4 * build algorithm for GiST indexes implementation.
5 *
6 *
7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * IDENTIFICATION
11 * src/backend/access/gist/gistbuild.c
12 *
13 *-------------------------------------------------------------------------
14 */
15 #include "postgres.h"
16
17 #include <math.h>
18
19 #include "access/genam.h"
20 #include "access/gist_private.h"
21 #include "access/gistxlog.h"
22 #include "access/tableam.h"
23 #include "access/xloginsert.h"
24 #include "catalog/index.h"
25 #include "miscadmin.h"
26 #include "optimizer/optimizer.h"
27 #include "storage/bufmgr.h"
28 #include "storage/smgr.h"
29 #include "utils/memutils.h"
30 #include "utils/rel.h"
31
32 /* Step of index tuples for check whether to switch to buffering build mode */
33 #define BUFFERING_MODE_SWITCH_CHECK_STEP 256
34
35 /*
36 * Number of tuples to process in the slow way before switching to buffering
37 * mode, when buffering is explicitly turned on. Also, the number of tuples
38 * to process between readjusting the buffer size parameter, while in
39 * buffering mode.
40 */
41 #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
42
/*
 * State of the buffering build algorithm.  The build starts in one of the
 * first three modes and can transition to GIST_BUFFERING_ACTIVE (or to
 * DISABLED, if there isn't enough memory) in gistInitBuffering().
 */
typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBufferingMode;
55
/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;		/* index being built */
	Relation	heaprel;		/* heap relation being scanned */
	GISTSTATE  *giststate;		/* opclass support info; owns tempCxt */

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	/*
	 * Extra data structures used during a buffering build. 'gfbb' contains
	 * information related to managing the build buffers. 'parentMap' is a
	 * lookup table of the parent of each internal page.  Both are only valid
	 * while bufferingMode == GIST_BUFFERING_ACTIVE.
	 */
	GISTBuildBuffers *gfbb;
	HTAB	   *parentMap;

	GistBufferingMode bufferingMode;	/* current build mode, see above */
} GISTBuildState;
78
79 /* prototypes for private functions */
80 static void gistInitBuffering(GISTBuildState *buildstate);
81 static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
82 static void gistBuildCallback(Relation index,
83 ItemPointer tid,
84 Datum *values,
85 bool *isnull,
86 bool tupleIsAlive,
87 void *state);
88 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
89 IndexTuple itup);
90 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
91 BlockNumber startblkno, int startlevel);
92 static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
93 Buffer buffer, int level,
94 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
95 BlockNumber parentblk, OffsetNumber downlinkoffnum);
96 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
97 BlockNumber childblkno, int level,
98 BlockNumber *parentblk,
99 OffsetNumber *downlinkoffnum);
100 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
101 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
102 static int gistGetMaxLevel(Relation index);
103
104 static void gistInitParentMap(GISTBuildState *buildstate);
105 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
106 BlockNumber parent);
107 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
108 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
109
/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 *
 * 'heap' is the table being indexed, 'index' the (empty) index relation,
 * and 'indexInfo' the usual index build descriptor.  Returns a palloc'd
 * IndexBuildResult with the heap and index tuple counts.
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	GISTBuildState buildstate;
	Buffer		buffer;
	Page		page;
	MemoryContext oldcxt = CurrentMemoryContext;
	int			fillfactor;

	buildstate.indexrel = index;
	buildstate.heaprel = heap;

	if (index->rd_options)
	{
		/* Get buffering mode from the options string */
		GiSTOptions *options = (GiSTOptions *) index->rd_options;

		if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
			buildstate.bufferingMode = GIST_BUFFERING_STATS;
		else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
			buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
		else
			buildstate.bufferingMode = GIST_BUFFERING_AUTO;

		fillfactor = options->fillfactor;
	}
	else
	{
		/*
		 * By default, switch to buffering mode when the index grows too large
		 * to fit in cache.
		 */
		buildstate.bufferingMode = GIST_BUFFERING_AUTO;
		fillfactor = GIST_DEFAULT_FILLFACTOR;
	}
	/* Calculate target amount of free space to leave on pages */
	buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* no locking is needed */
	buildstate.giststate = initGISTstate(index);

	/*
	 * Create a temporary memory context that is reset once for each tuple
	 * processed.  (Note: we don't bother to make this a child of the
	 * giststate's scanCxt, so we have to delete it separately at the end.)
	 */
	buildstate.giststate->tempCxt = createTempGistContext();

	/* initialize the root page */
	buffer = gistNewBuffer(index);
	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
	page = BufferGetPage(buffer);

	START_CRIT_SECTION();

	GISTInitBuffer(buffer, F_LEAF);

	MarkBufferDirty(buffer);
	/*
	 * Stamp the page with a fake LSN; no WAL is written during the build.
	 * If WAL-logging is required, all pages are logged in bulk below.
	 */
	PageSetLSN(page, GistBuildLSN);

	UnlockReleaseBuffer(buffer);

	END_CRIT_SECTION();

	/* build the index */
	buildstate.indtuples = 0;
	buildstate.indtuplesSize = 0;

	/*
	 * Do the heap scan.  gistBuildCallback is invoked for every tuple, and
	 * may switch us into buffering mode along the way.
	 */
	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   gistBuildCallback,
									   (void *) &buildstate, NULL);

	/*
	 * If buffering was used, flush out all the tuples that are still in the
	 * buffers.
	 */
	if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		elog(DEBUG1, "all tuples processed, emptying buffers");
		gistEmptyAllBuffers(&buildstate);
		gistFreeBuildBuffers(buildstate.gfbb);
	}

	/* okay, all heap tuples are indexed */
	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(buildstate.giststate->tempCxt);

	freeGISTstate(buildstate.giststate);

	/*
	 * We didn't write WAL records as we built the index, so if WAL-logging is
	 * required, write all pages to the WAL now.
	 */
	if (RelationNeedsWAL(index))
	{
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index),
						  true);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = (double) buildstate.indtuples;

	return result;
}
238
/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
	Relation	index = buildstate->indexrel;
	int			pagesPerBuffer;
	Size		pageFreeSpace;
	Size		itupAvgSize,
				itupMinSize;
	double		avgIndexTuplesPerPage,
				maxIndexTuplesPerPage;
	int			i;
	int			levelStep;

	/* Calc space of index page which is available for index tuples */
	pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
		- sizeof(ItemIdData)
		- buildstate->freespace;

	/*
	 * Calculate average size of already inserted index tuples using gathered
	 * statistics.
	 *
	 * NOTE(review): the double average is truncated to Size here, whereas
	 * calculatePagesPerBuffer() keeps it as a double — confirm this is
	 * intentional.
	 */
	itupAvgSize = (double) buildstate->indtuplesSize /
		(double) buildstate->indtuples;

	/*
	 * Calculate minimal possible size of index tuple by index metadata.
	 * Minimal possible size of varlena is VARHDRSZ.
	 *
	 * XXX: that's not actually true, as a short varlen can be just 2 bytes.
	 * And we should take padding into account here.
	 */
	itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
	for (i = 0; i < index->rd_att->natts; i++)
	{
		if (TupleDescAttr(index->rd_att, i)->attlen < 0)
			itupMinSize += VARHDRSZ;
		else
			itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
	}

	/* Calculate average and maximal number of index tuples which fit to page */
	avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
	maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

	/*
	 * We need to calculate two parameters for the buffering algorithm:
	 * levelStep and pagesPerBuffer.
	 *
	 * levelStep determines the size of subtree that we operate on, while
	 * emptying a buffer. A higher value is better, as you need fewer buffer
	 * emptying steps to build the index. However, if you set it too high, the
	 * subtree doesn't fit in cache anymore, and you quickly lose the benefit
	 * of the buffers.
	 *
	 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
	 * the number of tuples on page (ie. fanout), and M is the amount of
	 * internal memory available. Curiously, they don't explain *why* that
	 * setting is optimal. We calculate it by taking the highest levelStep so
	 * that a subtree still fits in cache. For a small B, our way of
	 * calculating levelStep is very close to Arge et al's formula. For a
	 * large B, our formula gives a value that is 2x higher.
	 *
	 * The average size (in pages) of a subtree of depth n can be calculated
	 * as a geometric series:
	 *
	 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
	 *
	 * where B is the average number of index tuples on page. The subtree is
	 * cached in the shared buffer cache and the OS cache, so we choose
	 * levelStep so that the subtree size is comfortably smaller than
	 * effective_cache_size, with a safety factor of 4.
	 *
	 * The estimate on the average number of index tuples on page is based on
	 * average tuple sizes observed before switching to buffered build, so the
	 * real subtree size can be somewhat larger. Also, it would be selfish to
	 * gobble the whole cache for our index build. The safety factor of 4
	 * should account for those effects.
	 *
	 * The other limiting factor for setting levelStep is that while
	 * processing a subtree, we need to hold one page for each buffer at the
	 * next lower buffered level. The max. number of buffers needed for that
	 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
	 * hopefully maintenance_work_mem is set high enough that you're
	 * constrained by effective_cache_size rather than maintenance_work_mem.
	 *
	 * XXX: the buffer hash table consumes a fair amount of memory too per
	 * buffer, but that is not currently taken into account. That scales on
	 * the total number of buffers used, ie. the index size and on levelStep.
	 * Note that a higher levelStep *reduces* the amount of memory needed for
	 * the hash table.
	 */
	levelStep = 1;
	for (;;)
	{
		double		subtreesize;
		double		maxlowestlevelpages;

		/* size of an average subtree at this levelStep (in pages). */
		subtreesize =
			(1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
			(1 - avgIndexTuplesPerPage);

		/* max number of pages at the lowest level of a subtree */
		maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

		/* subtree must fit in cache (with safety factor of 4) */
		if (subtreesize > effective_cache_size / 4)
			break;

		/* each node in the lowest level of a subtree has one page in memory */
		if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
			break;

		/* Good, we can handle this levelStep. See if we can go one higher. */
		levelStep++;
	}

	/*
	 * We just reached an unacceptable value of levelStep in previous loop.
	 * So, decrease levelStep to get last acceptable value.
	 */
	levelStep--;

	/*
	 * If there's not enough cache or maintenance_work_mem, fall back to plain
	 * inserts.
	 */
	if (levelStep <= 0)
	{
		elog(DEBUG1, "failed to switch to buffered GiST build");
		buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
		return;
	}

	/*
	 * The second parameter to set is pagesPerBuffer, which determines the
	 * size of each buffer. We adjust pagesPerBuffer also during the build,
	 * which is why this calculation is in a separate function.
	 */
	pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

	/* Initialize GISTBuildBuffers with these parameters */
	buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
											gistGetMaxLevel(index));

	gistInitParentMap(buildstate);

	buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

	elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
		 levelStep, pagesPerBuffer);
}
400
401 /*
402 * Calculate pagesPerBuffer parameter for the buffering algorithm.
403 *
404 * Buffer size is chosen so that assuming that tuples are distributed
405 * randomly, emptying half a buffer fills on average one page in every buffer
406 * at the next lower level.
407 */
408 static int
calculatePagesPerBuffer(GISTBuildState * buildstate,int levelStep)409 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
410 {
411 double pagesPerBuffer;
412 double avgIndexTuplesPerPage;
413 double itupAvgSize;
414 Size pageFreeSpace;
415
416 /* Calc space of index page which is available for index tuples */
417 pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
418 - sizeof(ItemIdData)
419 - buildstate->freespace;
420
421 /*
422 * Calculate average size of already inserted index tuples using gathered
423 * statistics.
424 */
425 itupAvgSize = (double) buildstate->indtuplesSize /
426 (double) buildstate->indtuples;
427
428 avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
429
430 /*
431 * Recalculate required size of buffers.
432 */
433 pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
434
435 return (int) rint(pagesPerBuffer);
436 }
437
/*
 * Per-tuple callback for table_index_build_scan.
 *
 * Forms an index tuple for the heap tuple at 'tid' and inserts it, either
 * directly (regular mode) or into the build buffers (buffering mode).  Also
 * drives the switch into buffering mode, once enough tuples have been seen.
 */
static void
gistBuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	GISTBuildState *buildstate = (GISTBuildState *) state;
	IndexTuple	itup;
	MemoryContext oldCtx;

	/* All per-tuple allocations go into tempCxt, reset below */
	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/* form an index tuple and point it at the heap tuple */
	itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
	itup->t_tid = *tid;

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		/* We have buffers, so use them. */
		gistBufferingBuildInsert(buildstate, itup);
	}
	else
	{
		/*
		 * There's no buffers (yet). Since we already have the index relation
		 * locked, we call gistdoinsert directly.
		 */
		gistdoinsert(index, itup, buildstate->freespace,
					 buildstate->giststate, buildstate->heaprel, true);
	}

	/* Update tuple count and total size. */
	buildstate->indtuples += 1;
	buildstate->indtuplesSize += IndexTupleSize(itup);

	/* Done with this tuple; reset per-tuple memory (frees itup too) */
	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->giststate->tempCxt);

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
		buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
	{
		/* Adjust the target buffer size now */
		buildstate->gfbb->pagesPerBuffer =
			calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
	}

	/*
	 * In 'auto' mode, check if the index has grown too large to fit in cache,
	 * and switch to buffering mode if it has.
	 *
	 * To avoid excessive calls to smgrnblocks(), only check this every
	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
	 *
	 * In 'stats' mode, switch as soon as we have collected enough tuple-size
	 * statistics (BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET tuples).
	 */
	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
	{
		/*
		 * Index doesn't fit in effective cache anymore. Try to switch to
		 * buffering build mode.
		 */
		gistInitBuffering(buildstate);
	}
}
509
/*
 * Insert function for buffering index build.
 *
 * Routes the tuple down from the root into the right node buffer (or leaf
 * page), then drains any buffers that the insertion caused to overflow.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
	/* Insert the tuple to buffers. */
	gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

	/* If we filled up (half of a) buffer, process buffer emptying. */
	gistProcessEmptyingQueue(buildstate);
}
522
/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop buffer emptying process (because one of child buffers can't take
 * index tuples anymore).
 *
 * 'startblkno'/'startlevel' give the page (and its tree level) to descend
 * from: the root for a fresh insert, or a node buffer's page when called
 * from gistProcessEmptyingQueue().
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
				BlockNumber startblkno, int startlevel)
{
	GISTSTATE  *giststate = buildstate->giststate;
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	Relation	indexrel = buildstate->indexrel;
	BlockNumber childblkno;
	Buffer		buffer;
	bool		result = false;
	BlockNumber blkno;
	int			level;
	OffsetNumber downlinkoffnum = InvalidOffsetNumber;
	BlockNumber parentblkno = InvalidBlockNumber;

	CHECK_FOR_INTERRUPTS();

	/*
	 * Loop until we reach a leaf page (level == 0) or a level with buffers
	 * (not including the level we start at, because we would otherwise make
	 * no progress).
	 */
	blkno = startblkno;
	level = startlevel;
	for (;;)
	{
		ItemId		iid;
		IndexTuple	idxtuple,
					newtup;
		Page		page;
		OffsetNumber childoffnum;

		/* Have we reached a level with buffers? */
		if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
			break;

		/* Have we reached a leaf page? */
		if (level == 0)
			break;

		/*
		 * Nope. Descend down to the next level then. Choose a child to
		 * descend down to.
		 */

		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);

		page = (Page) BufferGetPage(buffer);
		childoffnum = gistchoose(indexrel, page, itup, giststate);
		iid = PageGetItemId(page, childoffnum);
		idxtuple = (IndexTuple) PageGetItem(page, iid);
		childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

		/*
		 * The parent map only tracks internal pages, so only record the
		 * parent when the child is itself internal (level > 1).
		 */
		if (level > 1)
			gistMemorizeParent(buildstate, childblkno, blkno);

		/*
		 * Check that the key representing the target child node is consistent
		 * with the key we're inserting. Update it if it's not.
		 */
		newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
		if (newtup)
		{
			blkno = gistbufferinginserttuples(buildstate,
											  buffer,
											  level,
											  &newtup,
											  1,
											  childoffnum,
											  InvalidBlockNumber,
											  InvalidOffsetNumber);
			/* gistbufferinginserttuples() released the buffer */
		}
		else
			UnlockReleaseBuffer(buffer);

		/* Descend to the child */
		parentblkno = blkno;
		blkno = childblkno;
		downlinkoffnum = childoffnum;
		Assert(level > 0);
		level--;
	}

	if (LEVEL_HAS_BUFFERS(level, gfbb))
	{
		/*
		 * We've reached level with buffers. Place the index tuple to the
		 * buffer, and add the buffer to the emptying queue if it overflows.
		 */
		GISTNodeBuffer *childNodeBuffer;

		/* Find the buffer or create a new one */
		childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

		/* Add index tuple to it */
		gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

		if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
			result = true;
	}
	else
	{
		/*
		 * We've reached a leaf page. Place the tuple here.
		 */
		Assert(level == 0);
		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);
		gistbufferinginserttuples(buildstate, buffer, level,
								  &itup, 1, InvalidOffsetNumber,
								  parentblkno, downlinkoffnum);
		/* gistbufferinginserttuples() released the buffer */
	}

	return result;
}
647
/*
 * Insert tuples to a given page.
 *
 * This is analogous with gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated tuple
 * was inserted. Usually that's the original page, but might be a sibling page
 * if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 *
 * If the page splits, this recurses upward to insert the downlinks into the
 * parent (found via 'parentblk'/'downlinkoffnum', or the parent map), and
 * keeps the parent map and node buffers up to date as it goes.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
						  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
						  BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	List	   *splitinfo;
	bool		is_split;
	BlockNumber placed_to_blk = InvalidBlockNumber;

	is_split = gistplacetopage(buildstate->indexrel,
							   buildstate->freespace,
							   buildstate->giststate,
							   buffer,
							   itup, ntup, oldoffnum, &placed_to_blk,
							   InvalidBuffer,
							   &splitinfo,
							   false,
							   buildstate->heaprel, true);

	/*
	 * If this is a root split, update the root path item kept in memory. This
	 * ensures that all path stacks are always complete, including all parent
	 * nodes up to the root. That simplifies the algorithm to re-find correct
	 * parent.
	 */
	if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
	{
		Page		page = BufferGetPage(buffer);
		OffsetNumber off;
		OffsetNumber maxoff;

		Assert(level == gfbb->rootlevel);
		gfbb->rootlevel++;

		elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

		/*
		 * All the downlinks on the old root page are now on one of the child
		 * pages. Visit all the new child pages to memorize the parents of the
		 * grandchildren.
		 */
		if (gfbb->rootlevel > 1)
		{
			maxoff = PageGetMaxOffsetNumber(page);
			for (off = FirstOffsetNumber; off <= maxoff; off++)
			{
				ItemId		iid = PageGetItemId(page, off);
				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
				BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
				Buffer		childbuf = ReadBuffer(buildstate->indexrel, childblkno);

				LockBuffer(childbuf, GIST_SHARE);
				gistMemorizeAllDownlinks(buildstate, childbuf);
				UnlockReleaseBuffer(childbuf);

				/*
				 * Also remember that the parent of the new child page is the
				 * root block.
				 */
				gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
			}
		}
	}

	if (splitinfo)
	{
		/*
		 * Insert the downlinks to the parent. This is analogous with
		 * gistfinishsplit() in the regular insertion code, but the locking is
		 * simpler, and we have to maintain the buffers on internal nodes and
		 * the parent map.
		 */
		IndexTuple *downlinks;
		int			ndownlinks,
					i;
		Buffer		parentBuffer;
		ListCell   *lc;

		/* Parent may have changed since we memorized this path. */
		parentBuffer =
			gistBufferingFindCorrectParent(buildstate,
										   BufferGetBlockNumber(buffer),
										   level,
										   &parentblk,
										   &downlinkoffnum);

		/*
		 * If there's a buffer associated with this page, that needs to be
		 * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
		 * downlinks in 'splitinfo', to make sure they're consistent not only
		 * with the tuples already on the pages, but also the tuples in the
		 * buffers that will eventually be inserted to them.
		 */
		gistRelocateBuildBuffersOnSplit(gfbb,
										buildstate->giststate,
										buildstate->indexrel,
										level,
										buffer, splitinfo);

		/* Create an array of all the downlink tuples */
		ndownlinks = list_length(splitinfo);
		downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
		i = 0;
		foreach(lc, splitinfo)
		{
			GISTPageSplitInfo *splitinfo = lfirst(lc);

			/*
			 * Remember the parent of each new child page in our parent map.
			 * This assumes that the downlinks fit on the parent page. If the
			 * parent page is split, too, when we recurse up to insert the
			 * downlinks, the recursive gistbufferinginserttuples() call will
			 * update the map again.
			 */
			if (level > 0)
				gistMemorizeParent(buildstate,
								   BufferGetBlockNumber(splitinfo->buf),
								   BufferGetBlockNumber(parentBuffer));

			/*
			 * Also update the parent map for all the downlinks that got moved
			 * to a different page. (actually this also loops through the
			 * downlinks that stayed on the original page, but it does no
			 * harm).
			 */
			if (level > 1)
				gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

			/*
			 * Since there's no concurrent access, we can release the lower
			 * level buffers immediately. This includes the original page.
			 */
			UnlockReleaseBuffer(splitinfo->buf);
			downlinks[i++] = splitinfo->downlink;
		}

		/* Insert them into parent. */
		gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
								  downlinks, ndownlinks, downlinkoffnum,
								  InvalidBlockNumber, InvalidOffsetNumber);

		list_free_deep(splitinfo);	/* we don't need this anymore */
	}
	else
		UnlockReleaseBuffer(buffer);

	return placed_to_blk;
}
809
810 /*
811 * Find the downlink pointing to a child page.
812 *
813 * 'childblkno' indicates the child page to find the parent for. 'level' is
814 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
815 * point to a location where the downlink used to be - we will check that
816 * location first, and save some cycles if it hasn't moved. The function
817 * returns a buffer containing the downlink, exclusively-locked, and
818 * *parentblkno and *downlinkoffnum are set to the real location of the
819 * downlink.
820 *
821 * If the child page is a leaf (level == 0), the caller must supply a correct
822 * parentblkno. Otherwise we use the parent map hash table to find the parent
823 * block.
824 *
825 * This function serves the same purpose as gistFindCorrectParent() during
826 * normal index inserts, but this is simpler because we don't need to deal
827 * with concurrent inserts.
828 */
829 static Buffer
gistBufferingFindCorrectParent(GISTBuildState * buildstate,BlockNumber childblkno,int level,BlockNumber * parentblkno,OffsetNumber * downlinkoffnum)830 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
831 BlockNumber childblkno, int level,
832 BlockNumber *parentblkno,
833 OffsetNumber *downlinkoffnum)
834 {
835 BlockNumber parent;
836 Buffer buffer;
837 Page page;
838 OffsetNumber maxoff;
839 OffsetNumber off;
840
841 if (level > 0)
842 parent = gistGetParent(buildstate, childblkno);
843 else
844 {
845 /*
846 * For a leaf page, the caller must supply a correct parent block
847 * number.
848 */
849 if (*parentblkno == InvalidBlockNumber)
850 elog(ERROR, "no parent buffer provided of child %d", childblkno);
851 parent = *parentblkno;
852 }
853
854 buffer = ReadBuffer(buildstate->indexrel, parent);
855 page = BufferGetPage(buffer);
856 LockBuffer(buffer, GIST_EXCLUSIVE);
857 gistcheckpage(buildstate->indexrel, buffer);
858 maxoff = PageGetMaxOffsetNumber(page);
859
860 /* Check if it was not moved */
861 if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
862 *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
863 {
864 ItemId iid = PageGetItemId(page, *downlinkoffnum);
865 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
866
867 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
868 {
869 /* Still there */
870 return buffer;
871 }
872 }
873
874 /*
875 * Downlink was not at the offset where it used to be. Scan the page to
876 * find it. During normal gist insertions, it might've moved to another
877 * page, to the right, but during a buffering build, we keep track of the
878 * parent of each page in the lookup table so we should always know what
879 * page it's on.
880 */
881 for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
882 {
883 ItemId iid = PageGetItemId(page, off);
884 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
885
886 if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
887 {
888 /* yes!!, found it */
889 *downlinkoffnum = off;
890 return buffer;
891 }
892 }
893
894 elog(ERROR, "failed to re-find parent for block %u", childblkno);
895 return InvalidBuffer; /* keep compiler quiet */
896 }
897
/*
 * Process buffers emptying stack. Emptying of one buffer can cause emptying
 * of other buffers. This function iterates until this cascading emptying
 * process finished, i.e. until buffers emptying stack is empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;

	/* Iterate while we have elements in buffers emptying stack. */
	while (gfbb->bufferEmptyingQueue != NIL)
	{
		GISTNodeBuffer *emptyingNodeBuffer;

		/* Get node buffer from emptying stack. */
		emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
		gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
		emptyingNodeBuffer->queuedForEmptying = false;

		/*
		 * We are going to load last pages of buffers where emptying will be
		 * to. So let's unload any previously loaded buffers.
		 */
		gistUnloadNodeBuffers(gfbb);

		/*
		 * Pop tuples from the buffer and run them down to the buffers at
		 * lower level, or leaf pages. We continue until one of the lower
		 * level buffers fills up, or this buffer runs empty.
		 *
		 * In Arge et al's paper, the buffer emptying is stopped after
		 * processing 1/2 node buffer worth of tuples, to avoid overfilling
		 * any of the lower level buffers. However, it's more efficient to
		 * keep going until one of the lower level buffers actually fills up,
		 * so that's what we do. This doesn't need to be exact, if a buffer
		 * overfills by a few tuples, there's no harm done.
		 */
		while (true)
		{
			IndexTuple	itup;

			/* Get next index tuple from the buffer */
			if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
				break;			/* this buffer is now empty */

			/*
			 * Run it down to the underlying node buffer or leaf page.
			 *
			 * Note: it's possible that the buffer we're emptying splits as a
			 * result of this call. If that happens, our emptyingNodeBuffer
			 * points to the left half of the split. After split, it's very
			 * likely that the new left buffer is no longer over the half-full
			 * threshold, but we might as well keep flushing tuples from it
			 * until we fill a lower-level buffer.
			 */
			if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
			{
				/*
				 * A lower level buffer filled up. Stop emptying this buffer,
				 * to avoid overflowing the lower level buffer.
				 */
				break;
			}

			/* Free all the memory allocated during index tuple processing */
			MemoryContextReset(buildstate->giststate->tempCxt);
		}
	}
}
968
/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so no more tuples should
 * be inserted into the buffers after this call.
 */
976 static void
gistEmptyAllBuffers(GISTBuildState * buildstate)977 gistEmptyAllBuffers(GISTBuildState *buildstate)
978 {
979 GISTBuildBuffers *gfbb = buildstate->gfbb;
980 MemoryContext oldCtx;
981 int i;
982
983 oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);
984
985 /*
986 * Iterate through the levels from top to bottom.
987 */
988 for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
989 {
990 /*
991 * Empty all buffers on this level. Note that new buffers can pop up
992 * in the list during the processing, as a result of page splits, so a
993 * simple walk through the list won't work. We remove buffers from the
994 * list when we see them empty; a buffer can't become non-empty once
995 * it's been fully emptied.
996 */
997 while (gfbb->buffersOnLevels[i] != NIL)
998 {
999 GISTNodeBuffer *nodeBuffer;
1000
1001 nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
1002
1003 if (nodeBuffer->blocksCount != 0)
1004 {
1005 /*
1006 * Add this buffer to the emptying queue, and proceed to empty
1007 * the queue.
1008 */
1009 if (!nodeBuffer->queuedForEmptying)
1010 {
1011 MemoryContextSwitchTo(gfbb->context);
1012 nodeBuffer->queuedForEmptying = true;
1013 gfbb->bufferEmptyingQueue =
1014 lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
1015 MemoryContextSwitchTo(buildstate->giststate->tempCxt);
1016 }
1017 gistProcessEmptyingQueue(buildstate);
1018 }
1019 else
1020 gfbb->buffersOnLevels[i] =
1021 list_delete_first(gfbb->buffersOnLevels[i]);
1022 }
1023 elog(DEBUG2, "emptied all buffers at level %d", i);
1024 }
1025 MemoryContextSwitchTo(oldCtx);
1026 }
1027
1028 /*
1029 * Get the depth of the GiST index.
1030 */
1031 static int
gistGetMaxLevel(Relation index)1032 gistGetMaxLevel(Relation index)
1033 {
1034 int maxLevel;
1035 BlockNumber blkno;
1036
1037 /*
1038 * Traverse down the tree, starting from the root, until we hit the leaf
1039 * level.
1040 */
1041 maxLevel = 0;
1042 blkno = GIST_ROOT_BLKNO;
1043 while (true)
1044 {
1045 Buffer buffer;
1046 Page page;
1047 IndexTuple itup;
1048
1049 buffer = ReadBuffer(index, blkno);
1050
1051 /*
1052 * There's no concurrent access during index build, so locking is just
1053 * pro forma.
1054 */
1055 LockBuffer(buffer, GIST_SHARE);
1056 page = (Page) BufferGetPage(buffer);
1057
1058 if (GistPageIsLeaf(page))
1059 {
1060 /* We hit the bottom, so we're done. */
1061 UnlockReleaseBuffer(buffer);
1062 break;
1063 }
1064
1065 /*
1066 * Pick the first downlink on the page, and follow it. It doesn't
1067 * matter which downlink we choose, the tree has the same depth
1068 * everywhere, so we just pick the first one.
1069 */
1070 itup = (IndexTuple) PageGetItem(page,
1071 PageGetItemId(page, FirstOffsetNumber));
1072 blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
1073 UnlockReleaseBuffer(buffer);
1074
1075 /*
1076 * We're going down on the tree. It means that there is yet one more
1077 * level in the tree.
1078 */
1079 maxLevel++;
1080 }
1081 return maxLevel;
1082 }
1083
1084
1085 /*
1086 * Routines for managing the parent map.
1087 *
1088 * Whenever a page is split, we need to insert the downlinks into the parent.
1089 * We need to somehow find the parent page to do that. In normal insertions,
1090 * we keep a stack of nodes visited when we descend the tree. However, in
1091 * buffering build, we can start descending the tree from any internal node,
1092 * when we empty a buffer by cascading tuples to its children. So we don't
1093 * have a full stack up to the root available at that time.
1094 *
1095 * So instead, we maintain a hash table to track the parent of every internal
1096 * page. We don't need to track the parents of leaf nodes, however. Whenever
1097 * we insert to a leaf, we've just descended down from its parent, so we know
1098 * its immediate parent already. This helps a lot to limit the memory used
1099 * by this hash table.
1100 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page at the split need
 * to be updated as well.
1105 *
1106 * We also update the parent map whenever we descend the tree. That might seem
1107 * unnecessary, because we maintain the map whenever a downlink is moved or
1108 * created, but it is needed because we switch to buffering mode after
1109 * creating a tree with regular index inserts. Any pages created before
1110 * switching to buffering mode will not be present in the parent map initially,
1111 * but will be added there the first time we visit them.
1112 */
1113
1114 typedef struct
1115 {
1116 BlockNumber childblkno; /* hash key */
1117 BlockNumber parentblkno;
1118 } ParentMapEntry;
1119
1120 static void
gistInitParentMap(GISTBuildState * buildstate)1121 gistInitParentMap(GISTBuildState *buildstate)
1122 {
1123 HASHCTL hashCtl;
1124
1125 hashCtl.keysize = sizeof(BlockNumber);
1126 hashCtl.entrysize = sizeof(ParentMapEntry);
1127 hashCtl.hcxt = CurrentMemoryContext;
1128 buildstate->parentMap = hash_create("gistbuild parent map",
1129 1024,
1130 &hashCtl,
1131 HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1132 }
1133
1134 static void
gistMemorizeParent(GISTBuildState * buildstate,BlockNumber child,BlockNumber parent)1135 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
1136 {
1137 ParentMapEntry *entry;
1138 bool found;
1139
1140 entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1141 (const void *) &child,
1142 HASH_ENTER,
1143 &found);
1144 entry->parentblkno = parent;
1145 }
1146
1147 /*
1148 * Scan all downlinks on a page, and memorize their parent.
1149 */
1150 static void
gistMemorizeAllDownlinks(GISTBuildState * buildstate,Buffer parentbuf)1151 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
1152 {
1153 OffsetNumber maxoff;
1154 OffsetNumber off;
1155 BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
1156 Page page = BufferGetPage(parentbuf);
1157
1158 Assert(!GistPageIsLeaf(page));
1159
1160 maxoff = PageGetMaxOffsetNumber(page);
1161 for (off = FirstOffsetNumber; off <= maxoff; off++)
1162 {
1163 ItemId iid = PageGetItemId(page, off);
1164 IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
1165 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
1166
1167 gistMemorizeParent(buildstate, childblkno, parentblkno);
1168 }
1169 }
1170
1171 static BlockNumber
gistGetParent(GISTBuildState * buildstate,BlockNumber child)1172 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
1173 {
1174 ParentMapEntry *entry;
1175 bool found;
1176
1177 /* Find node buffer in hash table */
1178 entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
1179 (const void *) &child,
1180 HASH_FIND,
1181 &found);
1182 if (!found)
1183 elog(ERROR, "could not find parent of block %d in lookup table", child);
1184
1185 return entry->parentblkno;
1186 }
1187