/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *	  implementation of the build algorithm for GiST indexes.
 *
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/gistxlog.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "miscadmin.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/memutils.h"
#include "utils/rel.h"

/* Interval, in index tuples, between checks whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
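
/*
 * With these settings, a build with "buffering = on" inserts the first 4096
 * tuples the regular way while gathering tuple-size statistics, and a build
 * in "auto" mode re-examines the index size after every 256 tuples; see
 * gistBuildCallback() below.
 */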

typedef enum
{
	GIST_BUFFERING_DISABLED,	/* in regular build mode and aren't going to
								 * switch */
	GIST_BUFFERING_AUTO,		/* in regular build mode, but will switch to
								 * buffering build mode if the index grows too
								 * big */
	GIST_BUFFERING_STATS,		/* gathering statistics of index tuple size
								 * before switching to the buffering build
								 * mode */
	GIST_BUFFERING_ACTIVE		/* in buffering build mode */
} GistBufferingMode;
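
/*
 * Mode transitions, as implemented in gistBuildCallback() and
 * gistInitBuffering() below: AUTO switches to ACTIVE once the index outgrows
 * effective_cache_size, STATS switches to ACTIVE after
 * BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET tuples, and either one falls back
 * to DISABLED if gistInitBuffering() finds that there is not enough memory
 * for a buffering build.
 */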

/* Working state for gistbuild and its callback */
typedef struct
{
	Relation	indexrel;
	Relation	heaprel;
	GISTSTATE  *giststate;

	int64		indtuples;		/* number of tuples indexed */
	int64		indtuplesSize;	/* total size of all indexed tuples */

	Size		freespace;		/* amount of free space to leave on pages */

	/*
	 * Extra data structures used during a buffering build. 'gfbb' contains
	 * information related to managing the build buffers. 'parentMap' is a
	 * lookup table of the parent of each internal page.
	 */
	GISTBuildBuffers *gfbb;
	HTAB	   *parentMap;

	GistBufferingMode bufferingMode;
} GISTBuildState;

/* prototypes for private functions */
static void gistInitBuffering(GISTBuildState *buildstate);
static int	calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
							  ItemPointer tid,
							  Datum *values,
							  bool *isnull,
							  bool tupleIsAlive,
							  void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
									 IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
							BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
											 Buffer buffer, int level,
											 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
											 BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
											 BlockNumber childblkno, int level,
											 BlockNumber *parentblk,
											 OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int	gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
							   BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);

/*
 * Main entry point to GiST index build. Initially calls insert over and
 * over, but switches to the more efficient buffering build algorithm after
 * a certain number of tuples (unless buffering mode is disabled).
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	GISTBuildState buildstate;
	Buffer		buffer;
	Page		page;
	MemoryContext oldcxt = CurrentMemoryContext;
	int			fillfactor;

	buildstate.indexrel = index;
	buildstate.heaprel = heap;

	if (index->rd_options)
	{
		/* Get buffering mode from the options string */
		GiSTOptions *options = (GiSTOptions *) index->rd_options;

		if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
			buildstate.bufferingMode = GIST_BUFFERING_STATS;
		else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
			buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
		else
			buildstate.bufferingMode = GIST_BUFFERING_AUTO;

		fillfactor = options->fillfactor;
	}
	else
	{
		/*
		 * By default, switch to buffering mode when the index grows too large
		 * to fit in cache.
		 */
		buildstate.bufferingMode = GIST_BUFFERING_AUTO;
		fillfactor = GIST_DEFAULT_FILLFACTOR;
	}
	/* Calculate target amount of free space to leave on pages */
	buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
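
	/*
	 * For example, with the default 8 kB BLCKSZ and the default GiST
	 * fillfactor of 90, this leaves 8192 * 10 / 100 = 819 bytes of free
	 * space on each page.
	 */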

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* no locking is needed */
	buildstate.giststate = initGISTstate(index);

	/*
	 * Create a temporary memory context that is reset once for each tuple
	 * processed.  (Note: we don't bother to make this a child of the
	 * giststate's scanCxt, so we have to delete it separately at the end.)
	 */
	buildstate.giststate->tempCxt = createTempGistContext();

	/* initialize the root page */
	buffer = gistNewBuffer(index);
	Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
	page = BufferGetPage(buffer);

	START_CRIT_SECTION();

	GISTInitBuffer(buffer, F_LEAF);

	MarkBufferDirty(buffer);
	PageSetLSN(page, GistBuildLSN);

	UnlockReleaseBuffer(buffer);

	END_CRIT_SECTION();

	/* build the index */
	buildstate.indtuples = 0;
	buildstate.indtuplesSize = 0;

	/*
	 * Do the heap scan.
	 */
	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   gistBuildCallback,
									   (void *) &buildstate, NULL);

	/*
	 * If buffering was used, flush out all the tuples that are still in the
	 * buffers.
	 */
	if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		elog(DEBUG1, "all tuples processed, emptying buffers");
		gistEmptyAllBuffers(&buildstate);
		gistFreeBuildBuffers(buildstate.gfbb);
	}

	/* okay, all heap tuples are indexed */
	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(buildstate.giststate->tempCxt);

	freeGISTstate(buildstate.giststate);

	/*
	 * We didn't write WAL records as we built the index, so if WAL-logging is
	 * required, write all pages to the WAL now.
	 */
	if (RelationNeedsWAL(index))
	{
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index),
						  true);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = (double) buildstate.indtuples;

	return result;
}

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for a buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode
 * to GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
	Relation	index = buildstate->indexrel;
	int			pagesPerBuffer;
	Size		pageFreeSpace;
	Size		itupAvgSize,
				itupMinSize;
	double		avgIndexTuplesPerPage,
				maxIndexTuplesPerPage;
	int			i;
	int			levelStep;
	/* Calculate the index-page space that is available for index tuples */
	pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
		- sizeof(ItemIdData)
		- buildstate->freespace;

	/*
	 * Calculate average size of already inserted index tuples using gathered
	 * statistics.
	 */
	itupAvgSize = (double) buildstate->indtuplesSize /
		(double) buildstate->indtuples;

	/*
	 * Calculate the minimal possible size of an index tuple from the index
	 * metadata. The minimal possible size of a varlena is VARHDRSZ.
	 *
	 * XXX: that's not actually true, as a short varlena can be just 2 bytes.
	 * And we should take padding into account here.
	 */
	itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
	for (i = 0; i < index->rd_att->natts; i++)
	{
		if (TupleDescAttr(index->rd_att, i)->attlen < 0)
			itupMinSize += VARHDRSZ;
		else
			itupMinSize += TupleDescAttr(index->rd_att, i)->attlen;
	}

	/* Calculate the average and maximal number of index tuples that fit on a page */
	avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
	maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
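
	/*
	 * For illustration (hypothetical numbers): with ~8 kB of usable space
	 * per page, an average tuple size of 40 bytes gives
	 * avgIndexTuplesPerPage of about 200, and a minimal tuple size of 16
	 * bytes gives maxIndexTuplesPerPage of about 500.
	 */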

	/*
	 * We need to calculate two parameters for the buffering algorithm:
	 * levelStep and pagesPerBuffer.
	 *
	 * levelStep determines the size of subtree that we operate on, while
	 * emptying a buffer. A higher value is better, as you need fewer buffer
	 * emptying steps to build the index. However, if you set it too high,
	 * the subtree doesn't fit in cache anymore, and you quickly lose the
	 * benefit of the buffers.
	 *
	 * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
	 * the number of tuples on a page (i.e. fanout), and M is the amount of
	 * internal memory available. Curiously, they don't explain *why* that
	 * setting is optimal. We calculate it by taking the highest levelStep so
	 * that a subtree still fits in cache. For a small B, our way of
	 * calculating levelStep is very close to Arge et al's formula. For a
	 * large B, our formula gives a value that is 2x higher.
	 *
	 * The average size (in pages) of a subtree of depth n can be calculated
	 * as a geometric series:
	 *
	 * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
	 *
	 * where B is the average number of index tuples on a page. The subtree
	 * is cached in the shared buffer cache and the OS cache, so we choose
	 * levelStep so that the subtree size is comfortably smaller than
	 * effective_cache_size, with a safety factor of 4.
	 *
	 * The estimate on the average number of index tuples on a page is based
	 * on average tuple sizes observed before switching to buffered build, so
	 * the real subtree size can be somewhat larger. Also, it would be
	 * selfish to gobble the whole cache for our index build. The safety
	 * factor of 4 should account for those effects.
	 *
	 * The other limiting factor for setting levelStep is that while
	 * processing a subtree, we need to hold one page for each buffer at the
	 * next lower buffered level. The max. number of buffers needed for that
	 * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
	 * hopefully maintenance_work_mem is set high enough that you're
	 * constrained by effective_cache_size rather than maintenance_work_mem.
	 *
	 * XXX: the buffer hash table consumes a fair amount of memory too per
	 * buffer, but that is not currently taken into account. That scales with
	 * the total number of buffers used, i.e. with the index size and with
	 * levelStep. Note that a higher levelStep *reduces* the amount of memory
	 * needed for the hash table.
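	 *
	 * Worked example with hypothetical numbers: if avgIndexTuplesPerPage is
	 * 200, then at levelStep = 1 an average subtree spans
	 * (1 - 200^2) / (1 - 200) = 201 pages, and at levelStep = 2 it spans
	 * (1 - 200^3) / (1 - 200) = 40201 pages (~314 MB with 8 kB pages). With
	 * the default 4 GB effective_cache_size (524288 pages), the cache test
	 * below would accept levelStep = 2; but with maxIndexTuplesPerPage = 500
	 * and the default 64 MB maintenance_work_mem (8192 pages' worth), the
	 * 500^2 = 250000 lowest-level pages would exceed the memory limit, so
	 * the loop would settle on levelStep = 1.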
	 */
	levelStep = 1;
	for (;;)
	{
		double		subtreesize;
		double		maxlowestlevelpages;

		/* size of an average subtree at this levelStep (in pages). */
		subtreesize =
			(1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
			(1 - avgIndexTuplesPerPage);

		/* max number of pages at the lowest level of a subtree */
		maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

		/* subtree must fit in cache (with safety factor of 4) */
		if (subtreesize > effective_cache_size / 4)
			break;

		/* each node in the lowest level of a subtree has one page in memory */
		if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
			break;

		/* Good, we can handle this levelStep. See if we can go one higher. */
		levelStep++;
	}

	/*
	 * The loop above terminated at the first unacceptable value of
	 * levelStep, so back off by one to get the last acceptable value.
	 */
	levelStep--;

	/*
	 * If there's not enough cache or maintenance_work_mem, fall back to plain
	 * inserts.
	 */
	if (levelStep <= 0)
	{
		elog(DEBUG1, "failed to switch to buffered GiST build");
		buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
		return;
	}

	/*
	 * The second parameter to set is pagesPerBuffer, which determines the
	 * size of each buffer. We adjust pagesPerBuffer also during the build,
	 * which is why this calculation is in a separate function.
	 */
	pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

	/* Initialize GISTBuildBuffers with these parameters */
	buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
											gistGetMaxLevel(index));

	gistInitParentMap(buildstate);

	buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

	elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
		 levelStep, pagesPerBuffer);
}

/*
 * Calculate the pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that, assuming that tuples are distributed
 * randomly, emptying half a buffer fills on average one page in every buffer
 * at the next lower level.
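 *
 * For example (hypothetical numbers): with levelStep = 1 and an average of
 * 200 index tuples per page, pagesPerBuffer = 2 * 200^1 = 400, so emptying
 * half a buffer (200 pages' worth of tuples) fills on average one page in
 * each of the ~200 buffers one level down.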
 */
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
	double		pagesPerBuffer;
	double		avgIndexTuplesPerPage;
	double		itupAvgSize;
	Size		pageFreeSpace;

	/* Calculate the index-page space that is available for index tuples */
	pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
		- sizeof(ItemIdData)
		- buildstate->freespace;

	/*
	 * Calculate average size of already inserted index tuples using gathered
	 * statistics.
	 */
	itupAvgSize = (double) buildstate->indtuplesSize /
		(double) buildstate->indtuples;

	avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

	/*
	 * Recalculate required size of buffers.
	 */
	pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);

	return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
				  ItemPointer tid,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	GISTBuildState *buildstate = (GISTBuildState *) state;
	IndexTuple	itup;
	MemoryContext oldCtx;

	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/* form an index tuple and point it at the heap tuple */
	itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
	itup->t_tid = *tid;

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
	{
		/* We have buffers, so use them. */
		gistBufferingBuildInsert(buildstate, itup);
	}
	else
	{
		/*
		 * There are no buffers (yet). Since we already have the index
		 * relation locked, we call gistdoinsert directly.
		 */
		gistdoinsert(index, itup, buildstate->freespace,
					 buildstate->giststate, buildstate->heaprel, true);
	}

	/* Update tuple count and total size. */
	buildstate->indtuples += 1;
	buildstate->indtuplesSize += IndexTupleSize(itup);

	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->giststate->tempCxt);

	if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
		buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
	{
		/* Adjust the target buffer size now */
		buildstate->gfbb->pagesPerBuffer =
			calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
	}

	/*
	 * In 'auto' mode, check if the index has grown too large to fit in
	 * cache, and switch to buffering mode if it has.
	 *
	 * To avoid excessive calls to smgrnblocks(), only check this every
	 * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
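	 *
	 * (effective_cache_size is measured in blocks, so with the default 4 GB
	 * setting and 8 kB blocks the switch triggers once the index exceeds
	 * 524288 blocks.)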
	 */
	if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
		 buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
		 effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
		(buildstate->bufferingMode == GIST_BUFFERING_STATS &&
		 buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
	{
		/*
		 * Index doesn't fit in effective cache anymore. Try to switch to
		 * buffering build mode.
		 */
		gistInitBuffering(buildstate);
	}
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
	/* Insert the tuple into the buffers. */
	gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

	/* If we filled up (half of a) buffer, process buffer emptying. */
	gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't accept any more index tuples).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
				BlockNumber startblkno, int startlevel)
{
	GISTSTATE  *giststate = buildstate->giststate;
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	Relation	indexrel = buildstate->indexrel;
	BlockNumber childblkno;
	Buffer		buffer;
	bool		result = false;
	BlockNumber blkno;
	int			level;
	OffsetNumber downlinkoffnum = InvalidOffsetNumber;
	BlockNumber parentblkno = InvalidBlockNumber;

	CHECK_FOR_INTERRUPTS();

	/*
	 * Loop until we reach a leaf page (level == 0) or a level with buffers
	 * (not including the level we start at, because we would otherwise make
	 * no progress).
	 */
	blkno = startblkno;
	level = startlevel;
	for (;;)
	{
		ItemId		iid;
		IndexTuple	idxtuple,
					newtup;
		Page		page;
		OffsetNumber childoffnum;

		/* Have we reached a level with buffers? */
		if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
			break;

		/* Have we reached a leaf page? */
		if (level == 0)
			break;

		/*
		 * Nope. Descend down to the next level then. Choose a child to
		 * descend down to.
		 */

		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);

		page = (Page) BufferGetPage(buffer);
		childoffnum = gistchoose(indexrel, page, itup, giststate);
		iid = PageGetItemId(page, childoffnum);
		idxtuple = (IndexTuple) PageGetItem(page, iid);
		childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

		if (level > 1)
			gistMemorizeParent(buildstate, childblkno, blkno);

		/*
		 * Check that the key representing the target child node is
		 * consistent with the key we're inserting. Update it if it's not.
		 */
		newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
		if (newtup)
		{
			blkno = gistbufferinginserttuples(buildstate,
											  buffer,
											  level,
											  &newtup,
											  1,
											  childoffnum,
											  InvalidBlockNumber,
											  InvalidOffsetNumber);
			/* gistbufferinginserttuples() released the buffer */
		}
		else
			UnlockReleaseBuffer(buffer);

		/* Descend to the child */
		parentblkno = blkno;
		blkno = childblkno;
		downlinkoffnum = childoffnum;
		Assert(level > 0);
		level--;
	}

	if (LEVEL_HAS_BUFFERS(level, gfbb))
	{
		/*
		 * We've reached a level with buffers. Place the index tuple in the
		 * buffer, and add the buffer to the emptying queue if it overflows.
		 */
		GISTNodeBuffer *childNodeBuffer;

		/* Find the buffer or create a new one */
		childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

		/* Add index tuple to it */
		gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

		if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
			result = true;
	}
	else
	{
		/*
		 * We've reached a leaf page. Place the tuple here.
		 */
		Assert(level == 0);
		buffer = ReadBuffer(indexrel, blkno);
		LockBuffer(buffer, GIST_EXCLUSIVE);
		gistbufferinginserttuples(buildstate, buffer, level,
								  &itup, 1, InvalidOffsetNumber,
								  parentblkno, downlinkoffnum);
		/* gistbufferinginserttuples() released the buffer */
	}

	return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous to gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated
 * tuple was inserted. Usually that's the original page, but it might be a
 * sibling page if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
						  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
						  BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	List	   *splitinfo;
	bool		is_split;
	BlockNumber placed_to_blk = InvalidBlockNumber;

	is_split = gistplacetopage(buildstate->indexrel,
							   buildstate->freespace,
							   buildstate->giststate,
							   buffer,
							   itup, ntup, oldoffnum, &placed_to_blk,
							   InvalidBuffer,
							   &splitinfo,
							   false,
							   buildstate->heaprel, true);

	/*
	 * If this is a root split, update the root path item kept in memory.
	 * This ensures that all path stacks are always complete, including all
	 * parent nodes up to the root. That simplifies the algorithm for
	 * re-finding the correct parent.
	 */
	if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
	{
		Page		page = BufferGetPage(buffer);
		OffsetNumber off;
		OffsetNumber maxoff;

		Assert(level == gfbb->rootlevel);
		gfbb->rootlevel++;

		elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

		/*
		 * All the downlinks on the old root page are now on one of the child
		 * pages. Visit all the new child pages to memorize the parents of
		 * the grandchildren.
		 */
		if (gfbb->rootlevel > 1)
		{
			maxoff = PageGetMaxOffsetNumber(page);
			for (off = FirstOffsetNumber; off <= maxoff; off++)
			{
				ItemId		iid = PageGetItemId(page, off);
				IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
				BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
				Buffer		childbuf = ReadBuffer(buildstate->indexrel, childblkno);

				LockBuffer(childbuf, GIST_SHARE);
				gistMemorizeAllDownlinks(buildstate, childbuf);
				UnlockReleaseBuffer(childbuf);

				/*
				 * Also remember that the parent of the new child page is the
				 * root block.
				 */
				gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
			}
		}
	}

	if (splitinfo)
	{
		/*
		 * Insert the downlinks to the parent. This is analogous to
		 * gistfinishsplit() in the regular insertion code, but the locking
		 * is simpler, and we have to maintain the buffers on internal nodes
		 * and the parent map.
		 */
		IndexTuple *downlinks;
		int			ndownlinks,
					i;
		Buffer		parentBuffer;
		ListCell   *lc;

		/* Parent may have changed since we memorized this path. */
		parentBuffer =
			gistBufferingFindCorrectParent(buildstate,
										   BufferGetBlockNumber(buffer),
										   level,
										   &parentblk,
										   &downlinkoffnum);

		/*
		 * If there's a buffer associated with this page, that needs to be
		 * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
		 * downlinks in 'splitinfo', to make sure they're consistent not only
		 * with the tuples already on the pages, but also with the tuples in
		 * the buffers that will eventually be inserted to them.
		 */
		gistRelocateBuildBuffersOnSplit(gfbb,
										buildstate->giststate,
										buildstate->indexrel,
										level,
										buffer, splitinfo);

		/* Create an array of all the downlink tuples */
		ndownlinks = list_length(splitinfo);
		downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
		i = 0;
		foreach(lc, splitinfo)
		{
			GISTPageSplitInfo *splitinfo = lfirst(lc);

			/*
			 * Remember the parent of each new child page in our parent map.
			 * This assumes that the downlinks fit on the parent page. If the
			 * parent page is split, too, when we recurse up to insert the
			 * downlinks, the recursive gistbufferinginserttuples() call will
			 * update the map again.
			 */
			if (level > 0)
				gistMemorizeParent(buildstate,
								   BufferGetBlockNumber(splitinfo->buf),
								   BufferGetBlockNumber(parentBuffer));

			/*
			 * Also update the parent map for all the downlinks that got
			 * moved to a different page. (Actually this also loops through
			 * the downlinks that stayed on the original page, but it does no
			 * harm.)
			 */
			if (level > 1)
				gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

			/*
			 * Since there's no concurrent access, we can release the lower
			 * level buffers immediately. This includes the original page.
			 */
			UnlockReleaseBuffer(splitinfo->buf);
			downlinks[i++] = splitinfo->downlink;
		}

		/* Insert them into parent. */
		gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
								  downlinks, ndownlinks, downlinkoffnum,
								  InvalidBlockNumber, InvalidOffsetNumber);

		list_free_deep(splitinfo);	/* we don't need this anymore */
	}
	else
		UnlockReleaseBuffer(buffer);

	return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
							   BlockNumber childblkno, int level,
							   BlockNumber *parentblkno,
							   OffsetNumber *downlinkoffnum)
{
	BlockNumber parent;
	Buffer		buffer;
	Page		page;
	OffsetNumber maxoff;
	OffsetNumber off;

	if (level > 0)
		parent = gistGetParent(buildstate, childblkno);
	else
	{
		/*
		 * For a leaf page, the caller must supply a correct parent block
		 * number.
		 */
		if (*parentblkno == InvalidBlockNumber)
			elog(ERROR, "no parent buffer provided for child %u", childblkno);
		parent = *parentblkno;
	}

	buffer = ReadBuffer(buildstate->indexrel, parent);
	page = BufferGetPage(buffer);
	LockBuffer(buffer, GIST_EXCLUSIVE);
	gistcheckpage(buildstate->indexrel, buffer);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Check if the downlink is still where it used to be */
	if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
		*downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
	{
		ItemId		iid = PageGetItemId(page, *downlinkoffnum);
		IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);

		if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
		{
			/* Still there */
			return buffer;
		}
	}

	/*
	 * Downlink was not at the offset where it used to be. Scan the page to
	 * find it. During normal gist insertions, it might've moved to another
	 * page, to the right, but during a buffering build, we keep track of the
	 * parent of each page in the lookup table so we should always know what
	 * page it's on.
	 */
	for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
	{
		ItemId		iid = PageGetItemId(page, off);
		IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);

		if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
		{
			/* yes!!, found it */
			*downlinkoffnum = off;
			return buffer;
		}
	}

	elog(ERROR, "failed to re-find parent for block %u", childblkno);
	return InvalidBuffer;		/* keep compiler quiet */
}

/*
 * Process the buffer emptying stack. Emptying one buffer can cause emptying
 * of other buffers. This function iterates until the cascading emptying
 * process has finished, i.e. until the buffer emptying stack is empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;

	/* Iterate while we have elements in the buffer emptying stack. */
	while (gfbb->bufferEmptyingQueue != NIL)
	{
		GISTNodeBuffer *emptyingNodeBuffer;

		/* Get a node buffer from the emptying stack. */
		emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
		gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
		emptyingNodeBuffer->queuedForEmptying = false;

		/*
		 * We are going to load the last pages of the buffers that we will
		 * empty into, so unload any previously loaded buffers first.
		 */
		gistUnloadNodeBuffers(gfbb);

		/*
		 * Pop tuples from the buffer and run them down to the buffers at the
		 * lower level, or to leaf pages. We continue until one of the lower
		 * level buffers fills up, or this buffer runs empty.
		 *
		 * In Arge et al's paper, the buffer emptying is stopped after
		 * processing 1/2 node buffer worth of tuples, to avoid overfilling
		 * any of the lower level buffers. However, it's more efficient to
		 * keep going until one of the lower level buffers actually fills up,
		 * so that's what we do. This doesn't need to be exact; if a buffer
		 * overfills by a few tuples, there's no harm done.
		 */
		while (true)
		{
			IndexTuple	itup;

			/* Get next index tuple from the buffer */
			if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
				break;

			/*
			 * Run it down to the underlying node buffer or leaf page.
			 *
			 * Note: it's possible that the buffer we're emptying splits as a
			 * result of this call. If that happens, our emptyingNodeBuffer
			 * points to the left half of the split. After split, it's very
			 * likely that the new left buffer is no longer over the half-full
			 * threshold, but we might as well keep flushing tuples from it
			 * until we fill a lower-level buffer.
			 */
			if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
			{
				/*
				 * A lower level buffer filled up. Stop emptying this buffer,
				 * to avoid overflowing the lower level buffer.
				 */
				break;
			}

			/* Free all the memory allocated during index tuple processing */
			MemoryContextReset(buildstate->giststate->tempCxt);
		}
	}
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so no more tuples should
 * be inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	MemoryContext oldCtx;
	int			i;

	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/*
	 * Iterate through the levels from top to bottom.
	 */
	for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
	{
		/*
		 * Empty all buffers on this level. Note that new buffers can pop up
		 * in the list during the processing, as a result of page splits, so a
		 * simple walk through the list won't work. We remove buffers from the
		 * list when we see them empty; a buffer can't become non-empty once
		 * it's been fully emptied.
		 */
		while (gfbb->buffersOnLevels[i] != NIL)
		{
			GISTNodeBuffer *nodeBuffer;

			nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

			if (nodeBuffer->blocksCount != 0)
			{
				/*
				 * Add this buffer to the emptying queue, and proceed to empty
				 * the queue.
				 */
				if (!nodeBuffer->queuedForEmptying)
				{
					MemoryContextSwitchTo(gfbb->context);
					nodeBuffer->queuedForEmptying = true;
					gfbb->bufferEmptyingQueue =
						lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
					MemoryContextSwitchTo(buildstate->giststate->tempCxt);
				}
				gistProcessEmptyingQueue(buildstate);
			}
			else
				gfbb->buffersOnLevels[i] =
					list_delete_first(gfbb->buffersOnLevels[i]);
		}
		elog(DEBUG2, "emptied all buffers at level %d", i);
	}
	MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
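 * (A newly initialized index, where the root page is still a leaf, has
 * depth 0.)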
 */
static int
gistGetMaxLevel(Relation index)
{
	int			maxLevel;
	BlockNumber blkno;

	/*
	 * Traverse down the tree, starting from the root, until we hit the leaf
	 * level.
	 */
	maxLevel = 0;
	blkno = GIST_ROOT_BLKNO;
	while (true)
	{
		Buffer		buffer;
		Page		page;
		IndexTuple	itup;

		buffer = ReadBuffer(index, blkno);

		/*
		 * There's no concurrent access during index build, so locking is just
		 * pro forma.
		 */
		LockBuffer(buffer, GIST_SHARE);
		page = (Page) BufferGetPage(buffer);

		if (GistPageIsLeaf(page))
		{
			/* We hit the bottom, so we're done. */
			UnlockReleaseBuffer(buffer);
			break;
		}

		/*
		 * Pick the first downlink on the page, and follow it. It doesn't
		 * matter which downlink we choose, the tree has the same depth
		 * everywhere, so we just pick the first one.
		 */
		itup = (IndexTuple) PageGetItem(page,
										PageGetItemId(page, FirstOffsetNumber));
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		UnlockReleaseBuffer(buffer);

		/*
		 * We've just descended a level, so the tree has at least one more
		 * level than we had counted so far.
		 */
		maxLevel++;
	}
	return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page by the split need
 * to be updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
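
/*
 * For example (hypothetical block numbers): if internal page 10 is split and
 * some of its downlinks move to a new page 25, gistbufferinginserttuples()
 * records the parent of page 25 with gistMemorizeParent(), and, if the
 * children are themselves internal pages, calls gistMemorizeAllDownlinks()
 * on both halves of the split so that every moved grandchild is re-pointed
 * at the page it now lives on.
 */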

typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
	HASHCTL		hashCtl;

	hashCtl.keysize = sizeof(BlockNumber);
	hashCtl.entrysize = sizeof(ParentMapEntry);
	hashCtl.hcxt = CurrentMemoryContext;
	buildstate->parentMap = hash_create("gistbuild parent map",
										1024,
										&hashCtl,
										HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
	ParentMapEntry *entry;
	bool		found;

	entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
										   (const void *) &child,
										   HASH_ENTER,
										   &found);
	entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
	OffsetNumber maxoff;
	OffsetNumber off;
	BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
	Page		page = BufferGetPage(parentbuf);

	Assert(!GistPageIsLeaf(page));

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		iid = PageGetItemId(page, off);
		IndexTuple	idxtuple = (IndexTuple) PageGetItem(page, iid);
		BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

		gistMemorizeParent(buildstate, childblkno, parentblkno);
	}
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
	ParentMapEntry *entry;
	bool		found;

	/* Look up the parent in the hash table */
	entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
										   (const void *) &child,
										   HASH_FIND,
										   &found);
	if (!found)
		elog(ERROR, "could not find parent of block %u in lookup table", child);

	return entry->parentblkno;
}