1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  *		Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level.  We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it in its
12  * parent level (starting a new parent level if necessary).  When
13  * done, we write out each final page on each level, adding it to
14  * its parent level.  When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree).  The steady-state load
20  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%.  This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * Formerly the index pages being built were kept in shared buffers, but
27  * that is of no value (since other backends have no interest in them yet)
28  * and it created locking problems for CHECKPOINT, because the upper-level
29  * pages were held exclusive-locked for long periods.  Now we just build
30  * the pages in local memory and smgrwrite or smgrextend them as we finish
31  * them.  They will need to be re-read into shared buffers on first use after
32  * the build finishes.
33  *
34  * Since the index will never be used unless it is completely built,
35  * from a crash-recovery point of view there is no need to WAL-log the
36  * steps of the build.  After completing the index build, we can just sync
37  * the whole file to disk using smgrimmedsync() before exiting this module.
38  * This can be seen to be sufficient for crash recovery by considering that
39  * it's effectively equivalent to what would happen if a CHECKPOINT occurred
40  * just after the index build.  However, it is clearly not sufficient if the
41  * DBA is using the WAL log for PITR or replication purposes, since another
42  * machine would not be able to reconstruct the index from WAL.  Therefore,
43  * we log the completed index pages to WAL if and only if WAL archiving is
44  * active.
45  *
46  * This code isn't concerned about the FSM at all. The caller is responsible
47  * for initializing that.
48  *
49  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
50  * Portions Copyright (c) 1994, Regents of the University of California
51  *
52  * IDENTIFICATION
53  *	  src/backend/access/nbtree/nbtsort.c
54  *
55  *-------------------------------------------------------------------------
56  */
57 
58 #include "postgres.h"
59 
60 #include "access/nbtree.h"
61 #include "access/parallel.h"
62 #include "access/relscan.h"
63 #include "access/xact.h"
64 #include "access/xlog.h"
65 #include "access/xloginsert.h"
66 #include "catalog/index.h"
67 #include "executor/instrument.h"
68 #include "miscadmin.h"
69 #include "pgstat.h"
70 #include "storage/smgr.h"
71 #include "tcop/tcopprot.h"		/* pgrminclude ignore */
72 #include "utils/rel.h"
73 #include "utils/sortsupport.h"
74 #include "utils/tuplesort.h"
75 
76 
77 /* Magic numbers for parallel state sharing */
78 #define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
79 #define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
80 #define PARALLEL_KEY_TUPLESORT_SPOOL2	UINT64CONST(0xA000000000000003)
81 #define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xA000000000000004)
82 #define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xA000000000000005)
83 
84 /*
85  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
86  * parallel index builds.  This may be useful as a debugging aid.
87 #undef DISABLE_LEADER_PARTICIPATION
88  */
89 
90 /*
91  * Status record for spooling/sorting phase.  (Note we may have two of
92  * these due to the special requirements for uniqueness-checking with
93  * dead tuples.)
94  */
95 typedef struct BTSpool
96 {
97 	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
98 	Relation	heap;
99 	Relation	index;
100 	bool		isunique;
101 } BTSpool;
102 
103 /*
104  * Status for index builds performed in parallel.  This is allocated in a
105  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
106  * entry, private to tuplesort.c but allocated by this module on its behalf.
107  */
108 typedef struct BTShared
109 {
110 	/*
111 	 * These fields are not modified during the sort.  They primarily exist
112 	 * for the benefit of worker processes that need to create BTSpool state
113 	 * corresponding to that used by the leader.
114 	 */
115 	Oid			heaprelid;
116 	Oid			indexrelid;
117 	bool		isunique;
118 	bool		isconcurrent;
119 	int			scantuplesortstates;
120 
121 	/*
122 	 * workersdonecv is used to monitor the progress of workers.  All parallel
123 	 * participants must indicate that they are done before leader can use
124 	 * mutable state that workers maintain during scan (and before leader can
125 	 * proceed to tuplesort_performsort()).
126 	 */
127 	ConditionVariable workersdonecv;
128 
129 	/*
130 	 * mutex protects all fields before heapdesc.
131 	 *
132 	 * These fields contain status information of interest to B-Tree index
133 	 * builds that must work just the same when an index is built in parallel.
134 	 */
135 	slock_t		mutex;
136 
137 	/*
138 	 * Mutable state that is maintained by workers, and reported back to
139 	 * leader at end of parallel scan.
140 	 *
141 	 * nparticipantsdone is number of worker processes finished.
142 	 *
143 	 * reltuples is the total number of input heap tuples.
144 	 *
145 	 * havedead indicates if RECENTLY_DEAD tuples were encountered during
146 	 * build.
147 	 *
148 	 * indtuples is the total number of tuples that made it into the index.
149 	 *
150 	 * brokenhotchain indicates if any worker detected a broken HOT chain
151 	 * during build.
152 	 */
153 	int			nparticipantsdone;
154 	double		reltuples;
155 	bool		havedead;
156 	double		indtuples;
157 	bool		brokenhotchain;
158 
159 	/*
160 	 * This variable-sized field must come last.
161 	 *
162 	 * See _bt_parallel_estimate_shared().
163 	 */
164 	ParallelHeapScanDescData heapdesc;
165 } BTShared;
166 
167 /*
168  * Status for leader in parallel index build.
169  */
170 typedef struct BTLeader
171 {
172 	/* parallel context itself */
173 	ParallelContext *pcxt;
174 
175 	/*
176 	 * nparticipanttuplesorts is the exact number of worker processes
177 	 * successfully launched, plus one leader process if it participates as a
178 	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
179 	 * participating as a worker).
180 	 */
181 	int			nparticipanttuplesorts;
182 
183 	/*
184 	 * Leader process convenience pointers to shared state (leader avoids TOC
185 	 * lookups).
186 	 *
187 	 * btshared is the shared state for entire build.  sharedsort is the
188 	 * shared, tuplesort-managed state passed to each process tuplesort.
189 	 * sharedsort2 is the corresponding btspool2 shared state, used only when
190 	 * building unique indexes.  snapshot is the snapshot used by the scan iff
191 	 * an MVCC snapshot is required.
192 	 */
193 	BTShared   *btshared;
194 	Sharedsort *sharedsort;
195 	Sharedsort *sharedsort2;
196 	Snapshot	snapshot;
197 	BufferUsage *bufferusage;
198 } BTLeader;
199 
200 /*
201  * Working state for btbuild and its callback.
202  *
203  * When parallel CREATE INDEX is used, there is a BTBuildState for each
204  * participant.
205  */
206 typedef struct BTBuildState
207 {
208 	bool		isunique;
209 	bool		havedead;
210 	Relation	heap;
211 	BTSpool    *spool;
212 
213 	/*
214 	 * spool2 is needed only when the index is a unique index. Dead tuples are
215 	 * put into spool2 instead of spool in order to avoid the uniqueness check.
216 	 */
217 	BTSpool    *spool2;
218 	double		indtuples;
219 
220 	/*
221 	 * btleader is only present when a parallel index build is performed, and
222 	 * only in the leader process. (Actually, only the leader has a
223 	 * BTBuildState.  Workers have their own spool and spool2, though.)
224 	 */
225 	BTLeader   *btleader;
226 } BTBuildState;
227 
228 /*
229  * Status record for a btree page being built.  We have one of these
230  * for each active tree level.
231  *
232  * The reason we need to store a copy of the minimum key is that we'll
233  * need to propagate it to the parent node when this page is linked
234  * into its parent.  However, if the page is not a leaf page, the first
235  * entry on the page doesn't need to contain a key, so we will not have
236  * stored the key itself on the page.  (You might think we could skip
237  * copying the minimum key on leaf pages, but actually we must have a
238  * writable copy anyway because we'll poke the page's address into it
239  * before passing it up to the parent...)
240  */
241 typedef struct BTPageState
242 {
243 	Page		btps_page;		/* workspace for page building */
244 	BlockNumber btps_blkno;		/* block # to write this page at */
245 	IndexTuple	btps_minkey;	/* copy of minimum key (first item) on page */
246 	OffsetNumber btps_lastoff;	/* last item offset loaded */
247 	uint32		btps_level;		/* tree level (0 = leaf) */
248 	Size		btps_full;		/* "full" if less than this much free space */
249 	struct BTPageState *btps_next;	/* link to parent level, if any */
250 } BTPageState;
251 
252 /*
253  * Overall status record for index writing phase.
254  */
255 typedef struct BTWriteState
256 {
257 	Relation	heap;
258 	Relation	index;
259 	bool		btws_use_wal;	/* dump pages to WAL? */
260 	BlockNumber btws_pages_alloced; /* # pages allocated */
261 	BlockNumber btws_pages_written; /* # pages written out */
262 	Page		btws_zeropage;	/* workspace for filling zeroes */
263 } BTWriteState;
264 
265 
266 static double _bt_spools_heapscan(Relation heap, Relation index,
267 					BTBuildState *buildstate, IndexInfo *indexInfo);
268 static void _bt_spooldestroy(BTSpool *btspool);
269 static void _bt_spool(BTSpool *btspool, ItemPointer self,
270 		  Datum *values, bool *isnull);
271 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
272 static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
273 				   bool *isnull, bool tupleIsAlive, void *state);
274 static Page _bt_blnewpage(uint32 level);
275 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
276 static void _bt_slideleft(Page page);
277 static void _bt_sortaddtup(Page page, Size itemsize,
278 			   IndexTuple itup, OffsetNumber itup_off);
279 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
280 			 IndexTuple itup);
281 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
282 static void _bt_load(BTWriteState *wstate,
283 		 BTSpool *btspool, BTSpool *btspool2);
284 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
285 				   int request);
286 static void _bt_end_parallel(BTLeader *btleader);
287 static Size _bt_parallel_estimate_shared(Snapshot snapshot);
288 static double _bt_parallel_heapscan(BTBuildState *buildstate,
289 					  bool *brokenhotchain);
290 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
291 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
292 						   BTShared *btshared, Sharedsort *sharedsort,
293 						   Sharedsort *sharedsort2, int sortmem);
294 
295 
296 /*
297  *	btbuild() -- build a new btree index.
298  */
299 IndexBuildResult *
300 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
301 {
302 	IndexBuildResult *result;
303 	BTBuildState buildstate;
304 	double		reltuples;
305 
306 #ifdef BTREE_BUILD_STATS
307 	if (log_btree_build_stats)
308 		ResetUsage();
309 #endif							/* BTREE_BUILD_STATS */
310 
311 	buildstate.isunique = indexInfo->ii_Unique;
312 	buildstate.havedead = false;
313 	buildstate.heap = heap;
314 	buildstate.spool = NULL;
315 	buildstate.spool2 = NULL;
316 	buildstate.indtuples = 0;
317 	buildstate.btleader = NULL;
318 
319 	/*
320 	 * We expect to be called exactly once for any index relation. If that's
321 	 * not the case, big trouble's what we have.
322 	 */
323 	if (RelationGetNumberOfBlocks(index) != 0)
324 		elog(ERROR, "index \"%s\" already contains data",
325 			 RelationGetRelationName(index));
326 
327 	reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
328 
329 	/*
330 	 * Finish the build by (1) completing the sort of the spool file, (2)
331 	 * inserting the sorted tuples into btree pages and (3) building the upper
332 	 * levels.  Finally, it may also be necessary to end use of parallelism.
333 	 */
334 	_bt_leafbuild(buildstate.spool, buildstate.spool2);
335 	_bt_spooldestroy(buildstate.spool);
336 	if (buildstate.spool2)
337 		_bt_spooldestroy(buildstate.spool2);
338 	if (buildstate.btleader)
339 		_bt_end_parallel(buildstate.btleader);
340 
341 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
342 
343 	result->heap_tuples = reltuples;
344 	result->index_tuples = buildstate.indtuples;
345 
346 #ifdef BTREE_BUILD_STATS
347 	if (log_btree_build_stats)
348 	{
349 		ShowUsage("BTREE BUILD STATS");
350 		ResetUsage();
351 	}
352 #endif							/* BTREE_BUILD_STATS */
353 
354 	return result;
355 }
356 
357 /*
358  * Create and initialize one or two spool structures, and save them in caller's
359  * buildstate argument.  May also fill in fields within indexInfo used by index
360  * builds.
361  *
362  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
363  * routine encapsulates all aspects of managing parallelism.  Caller need only
364  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
365  *
366  * Returns the total number of heap tuples scanned.
367  */
368 static double
369 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
370 					IndexInfo *indexInfo)
371 {
372 	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
373 	SortCoordinate coordinate = NULL;
374 	double		reltuples = 0;
375 
376 	/*
377 	 * We size the sort area as maintenance_work_mem rather than work_mem to
378 	 * speed index creation.  This should be OK since a single backend can't
379 	 * run multiple index creations in parallel (see also: notes on
380 	 * parallelism and maintenance_work_mem below).
381 	 */
382 	btspool->heap = heap;
383 	btspool->index = index;
384 	btspool->isunique = indexInfo->ii_Unique;
385 
386 	/* Save as primary spool */
387 	buildstate->spool = btspool;
388 
389 	/* Attempt to launch parallel worker scan when required */
390 	if (indexInfo->ii_ParallelWorkers > 0)
391 		_bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
392 						   indexInfo->ii_ParallelWorkers);
393 
394 	/*
395 	 * If parallel build requested and at least one worker process was
396 	 * successfully launched, set up coordination state
397 	 */
398 	if (buildstate->btleader)
399 	{
400 		coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
401 		coordinate->isWorker = false;
402 		coordinate->nParticipants =
403 			buildstate->btleader->nparticipanttuplesorts;
404 		coordinate->sharedsort = buildstate->btleader->sharedsort;
405 	}
406 
407 	/*
408 	 * Begin serial/leader tuplesort.
409 	 *
410 	 * In cases where parallelism is involved, the leader receives the same
411 	 * share of maintenance_work_mem as a serial sort (it is generally treated
412 	 * in the same way as a serial sort once we return).  Parallel worker
413 	 * Tuplesortstates will have received only a fraction of
414 	 * maintenance_work_mem, though.
415 	 *
416 	 * We rely on the lifetime of the Leader Tuplesortstate almost not
417 	 * overlapping with any worker Tuplesortstate's lifetime.  There may be
418 	 * some small overlap, but that's okay because we rely on leader
419 	 * Tuplesortstate only allocating a small, fixed amount of memory here.
420 	 * When its tuplesort_performsort() is called (by our caller), and
421 	 * significant amounts of memory are likely to be used, all workers must
422 	 * have already freed almost all memory held by their Tuplesortstates
423 	 * (they are about to go away completely, too).  The overall effect is
424 	 * that maintenance_work_mem always represents an absolute high watermark
425 	 * on the amount of memory used by a CREATE INDEX operation, regardless of
426 	 * the use of parallelism or any other factor.
427 	 */
428 	buildstate->spool->sortstate =
429 		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
430 									maintenance_work_mem, coordinate,
431 									false);
432 
433 	/*
434 	 * If building a unique index, put dead tuples in a second spool to keep
435 	 * them out of the uniqueness check.  We expect that the second spool (for
436 	 * dead tuples) won't get very full, so we give it only work_mem.
437 	 */
438 	if (indexInfo->ii_Unique)
439 	{
440 		BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
441 		SortCoordinate coordinate2 = NULL;
442 
443 		/* Initialize secondary spool */
444 		btspool2->heap = heap;
445 		btspool2->index = index;
446 		btspool2->isunique = false;
447 		/* Save as secondary spool */
448 		buildstate->spool2 = btspool2;
449 
450 		if (buildstate->btleader)
451 		{
452 			/*
453 			 * Set up non-private state that is passed to
454 			 * tuplesort_begin_index_btree() about the basic high level
455 			 * coordination of a parallel sort.
456 			 */
457 			coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
458 			coordinate2->isWorker = false;
459 			coordinate2->nParticipants =
460 				buildstate->btleader->nparticipanttuplesorts;
461 			coordinate2->sharedsort = buildstate->btleader->sharedsort2;
462 		}
463 
464 		/*
465 		 * We expect that the second one (for dead tuples) won't get very
466 		 * full, so we give it only work_mem
467 		 */
468 		buildstate->spool2->sortstate =
469 			tuplesort_begin_index_btree(heap, index, false, work_mem,
470 										coordinate2, false);
471 	}
472 
473 	/* Fill spool using either serial or parallel heap scan */
474 	if (!buildstate->btleader)
475 		reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
476 									   _bt_build_callback, (void *) buildstate,
477 									   NULL);
478 	else
479 		reltuples = _bt_parallel_heapscan(buildstate,
480 										  &indexInfo->ii_BrokenHotChain);
481 
482 	/* okay, all heap tuples are spooled */
483 	if (buildstate->spool2 && !buildstate->havedead)
484 	{
485 		/* spool2 turns out to be unnecessary */
486 		_bt_spooldestroy(buildstate->spool2);
487 		buildstate->spool2 = NULL;
488 	}
489 
490 	return reltuples;
491 }
492 
493 /*
494  * clean up a spool structure and its substructures.
495  */
496 static void
497 _bt_spooldestroy(BTSpool *btspool)
498 {
499 	tuplesort_end(btspool->sortstate);
500 	pfree(btspool);
501 }
502 
503 /*
504  * spool an index entry into the sort file.
505  */
506 static void
507 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
508 {
509 	tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
510 								  self, values, isnull);
511 }
512 
513 /*
514  * given a spool loaded by successive calls to _bt_spool,
515  * create an entire btree.
516  */
517 static void
518 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
519 {
520 	BTWriteState wstate;
521 
522 #ifdef BTREE_BUILD_STATS
523 	if (log_btree_build_stats)
524 	{
525 		ShowUsage("BTREE BUILD (Spool) STATISTICS");
526 		ResetUsage();
527 	}
528 #endif							/* BTREE_BUILD_STATS */
529 
530 	tuplesort_performsort(btspool->sortstate);
531 	if (btspool2)
532 		tuplesort_performsort(btspool2->sortstate);
533 
534 	wstate.heap = btspool->heap;
535 	wstate.index = btspool->index;
536 
537 	/*
538 	 * We need to log index creation in WAL iff WAL archiving/streaming is
539 	 * enabled UNLESS the index isn't WAL-logged anyway.
540 	 */
541 	wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
542 
543 	/* reserve the metapage */
544 	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
545 	wstate.btws_pages_written = 0;
546 	wstate.btws_zeropage = NULL;	/* until needed */
547 
548 	_bt_load(&wstate, btspool, btspool2);
549 }
550 
551 /*
552  * Per-tuple callback from IndexBuildHeapScan
553  */
554 static void
555 _bt_build_callback(Relation index,
556 				   HeapTuple htup,
557 				   Datum *values,
558 				   bool *isnull,
559 				   bool tupleIsAlive,
560 				   void *state)
561 {
562 	BTBuildState *buildstate = (BTBuildState *) state;
563 
564 	/*
565 	 * insert the index tuple into the appropriate spool file for subsequent
566 	 * processing
567 	 */
568 	if (tupleIsAlive || buildstate->spool2 == NULL)
569 		_bt_spool(buildstate->spool, &htup->t_self, values, isnull);
570 	else
571 	{
572 		/* dead tuples are put into spool2 */
573 		buildstate->havedead = true;
574 		_bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
575 	}
576 
577 	buildstate->indtuples += 1;
578 }
579 
580 /*
581  * allocate workspace for a new, clean btree page, not linked to any siblings.
582  */
583 static Page
584 _bt_blnewpage(uint32 level)
585 {
586 	Page		page;
587 	BTPageOpaque opaque;
588 
589 	page = (Page) palloc(BLCKSZ);
590 
591 	/* Zero the page and set up standard page header info */
592 	_bt_pageinit(page, BLCKSZ);
593 
594 	/* Initialize BT opaque state */
595 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
596 	opaque->btpo_prev = opaque->btpo_next = P_NONE;
597 	opaque->btpo.level = level;
598 	opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
599 	opaque->btpo_cycleid = 0;
600 
601 	/* Make the P_HIKEY line pointer appear allocated */
602 	((PageHeader) page)->pd_lower += sizeof(ItemIdData);
603 
604 	return page;
605 }
606 
607 /*
608  * emit a completed btree page, and release the working storage.
609  */
610 static void
611 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
612 {
613 	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
614 	RelationOpenSmgr(wstate->index);
615 
616 	/* XLOG stuff */
617 	if (wstate->btws_use_wal)
618 	{
619 		/* We use the heap NEWPAGE record type for this */
620 		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
621 	}
622 
623 	/*
624 	 * If we have to write pages nonsequentially, fill in the space with
625 	 * zeroes until we come back and overwrite.  This is not logically
626 	 * necessary on standard Unix filesystems (unwritten space will read as
627 	 * zeroes anyway), but it should help to avoid fragmentation. The dummy
628 	 * pages aren't WAL-logged though.
629 	 */
630 	while (blkno > wstate->btws_pages_written)
631 	{
632 		if (!wstate->btws_zeropage)
633 			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
634 		/* don't set checksum for all-zero page */
635 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
636 				   wstate->btws_pages_written++,
637 				   (char *) wstate->btws_zeropage,
638 				   true);
639 	}
640 
641 	PageSetChecksumInplace(page, blkno);
642 
643 	/*
644 	 * Now write the page.  There's no need for smgr to schedule an fsync for
645 	 * this write; we'll do it ourselves before ending the build.
646 	 */
647 	if (blkno == wstate->btws_pages_written)
648 	{
649 		/* extending the file... */
650 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
651 				   (char *) page, true);
652 		wstate->btws_pages_written++;
653 	}
654 	else
655 	{
656 		/* overwriting a block we zero-filled before */
657 		smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
658 				  (char *) page, true);
659 	}
660 
661 	pfree(page);
662 }
663 
664 /*
665  * allocate and initialize a new BTPageState.  the returned structure
666  * is suitable for immediate use by _bt_buildadd.
667  */
668 static BTPageState *
669 _bt_pagestate(BTWriteState *wstate, uint32 level)
670 {
671 	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
672 
673 	/* create initial page for level */
674 	state->btps_page = _bt_blnewpage(level);
675 
676 	/* and assign it a page position */
677 	state->btps_blkno = wstate->btws_pages_alloced++;
678 
679 	state->btps_minkey = NULL;
680 	/* initialize lastoff so first item goes into P_FIRSTKEY */
681 	state->btps_lastoff = P_HIKEY;
682 	state->btps_level = level;
683 	/* set "full" threshold based on level.  See notes at head of file. */
684 	if (level > 0)
685 		state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
686 	else
687 		state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
688 														  BTREE_DEFAULT_FILLFACTOR);
689 	/* no parent level, yet */
690 	state->btps_next = NULL;
691 
692 	return state;
693 }
694 
695 /*
696  * slide an array of ItemIds back one slot (from P_FIRSTKEY to
697  * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
698  * that we have built an ItemId array in what has turned out to be a
699  * P_RIGHTMOST page.
700  */
701 static void
702 _bt_slideleft(Page page)
703 {
704 	OffsetNumber off;
705 	OffsetNumber maxoff;
706 	ItemId		previi;
707 	ItemId		thisii;
708 
709 	if (!PageIsEmpty(page))
710 	{
711 		maxoff = PageGetMaxOffsetNumber(page);
712 		previi = PageGetItemId(page, P_HIKEY);
713 		for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
714 		{
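			/* move this item's line pointer one slot to the left */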
715 			thisii = PageGetItemId(page, off);
716 			*previi = *thisii;
717 			previi = thisii;
718 		}
719 		((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
720 	}
721 }
722 
723 /*
724  * Add an item to a page being built.
725  *
726  * The main difference between this routine and a bare PageAddItem call
727  * is that this code knows that the leftmost data item on a non-leaf
728  * btree page doesn't need to have a key.  Therefore, it strips such
729  * items down to just the item header.
730  *
731  * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
732  * that because it assumes that P_RIGHTMOST() will return the correct
733  * answer for the page.  Here, we don't know yet if the page will be
734  * rightmost.  Offset P_FIRSTKEY is always the first data key.
735  */
736 static void
737 _bt_sortaddtup(Page page,
738 			   Size itemsize,
739 			   IndexTuple itup,
740 			   OffsetNumber itup_off)
741 {
742 	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
743 	IndexTupleData trunctuple;
744 
745 	if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
746 	{
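		/*
		 * Strip the item down to a bare header: the first data item on an
		 * internal page acts as a "minus infinity" key and carries no key
		 * data of its own.
		 */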
747 		trunctuple = *itup;
748 		trunctuple.t_info = sizeof(IndexTupleData);
749 		BTreeTupleSetNAtts(&trunctuple, 0);
750 		itup = &trunctuple;
751 		itemsize = sizeof(IndexTupleData);
752 	}
753 
754 	if (PageAddItem(page, (Item) itup, itemsize, itup_off,
755 					false, false) == InvalidOffsetNumber)
756 		elog(ERROR, "failed to add item to the index page");
757 }
758 
759 /*----------
760  * Add an item to a disk page from the sort output.
761  *
762  * We must be careful to observe the page layout conventions of nbtsearch.c:
763  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
764  * - on non-leaf pages, the key portion of the first item need not be
765  *	 stored, we should store only the link.
766  *
767  * A leaf page being built looks like:
768  *
769  * +----------------+---------------------------------+
770  * | PageHeaderData | linp0 linp1 linp2 ...           |
771  * +-----------+----+---------------------------------+
772  * | ... linpN |									  |
773  * +-----------+--------------------------------------+
774  * |	 ^ last										  |
775  * |												  |
776  * +-------------+------------------------------------+
777  * |			 | itemN ...                          |
778  * +-------------+------------------+-----------------+
779  * |		  ... item3 item2 item1 | "special space" |
780  * +--------------------------------+-----------------+
781  *
782  * Contrast this with the diagram in bufpage.h; note the mismatch
783  * between linps and items.  This is because we reserve linp0 as a
784  * placeholder for the pointer to the "high key" item; when we have
785  * filled up the page, we will set linp0 to point to itemN and clear
786  * linpN.  On the other hand, if we find this is the last (rightmost)
787  * page, we leave the items alone and slide the linp array over.  If
788  * the high key is to be truncated, offset 1 is deleted, and we insert
789  * the truncated high key at offset 1.
790  *
791  * 'last' pointer indicates the last offset added to the page.
792  *----------
793  */
794 static void
795 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
796 {
797 	Page		npage;
798 	BlockNumber nblkno;
799 	OffsetNumber last_off;
800 	Size		pgspc;
801 	Size		itupsz;
802 	int			indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
803 	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
804 
805 	/*
806 	 * This is a handy place to check for cancel interrupts during the btree
807 	 * load phase of index creation.
808 	 */
809 	CHECK_FOR_INTERRUPTS();
810 
811 	npage = state->btps_page;
812 	nblkno = state->btps_blkno;
813 	last_off = state->btps_lastoff;
814 
815 	pgspc = PageGetFreeSpace(npage);
816 	itupsz = IndexTupleSize(itup);
817 	itupsz = MAXALIGN(itupsz);
818 
819 	/*
820 	 * Check whether the item can fit on a btree page at all. (Eventually, we
821 	 * ought to try to apply TOAST methods if not.) We actually need to be
822 	 * able to fit three items on every page, so restrict any one item to 1/3
823 	 * the per-page available space. Note that at this point, itupsz doesn't
824 	 * include the ItemId.
825 	 *
826 	 * NOTE: similar code appears in _bt_insertonpg() to defend against
827 	 * oversize items being inserted into an already-existing index. But
828 	 * during creation of an index, we don't go through there.
829 	 */
830 	if (itupsz > BTMaxItemSize(npage))
831 		ereport(ERROR,
832 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
833 				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
834 						itupsz, BTMaxItemSize(npage),
835 						RelationGetRelationName(wstate->index)),
836 				 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
837 						 "Consider a function index of an MD5 hash of the value, "
838 						 "or use full text indexing."),
839 				 errtableconstraint(wstate->heap,
840 									RelationGetRelationName(wstate->index))));
841 
842 	/*
843 	 * Check to see if page is "full".  It's definitely full if the item won't
844 	 * fit.  Otherwise, compare to the target freespace derived from the
845 	 * fillfactor.  However, we must put at least two items on each page, so
846 	 * disregard fillfactor if we don't have that many.
847 	 */
848 	if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
849 	{
850 		/*
851 		 * Finish off the page and write it out.
852 		 */
853 		Page		opage = npage;
854 		BlockNumber oblkno = nblkno;
855 		ItemId		ii;
856 		ItemId		hii;
857 		IndexTuple	oitup;
858 		BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
859 
860 		/* Create new page of same level */
861 		npage = _bt_blnewpage(state->btps_level);
862 
863 		/* and assign it a page position */
864 		nblkno = wstate->btws_pages_alloced++;
865 
866 		/*
867 		 * We copy the last item on the page into the new page, and then
868 		 * rearrange the old page so that the 'last item' becomes its high key
869 		 * rather than a true data item.  There had better be at least two
870 		 * items on the page already, else the page would be empty of useful
871 		 * data.
872 		 */
873 		Assert(last_off > P_FIRSTKEY);
874 		ii = PageGetItemId(opage, last_off);
875 		oitup = (IndexTuple) PageGetItem(opage, ii);
876 		_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
877 
878 		/*
879 		 * Move 'last' into the high key position on opage
880 		 */
881 		hii = PageGetItemId(opage, P_HIKEY);
882 		*hii = *ii;
883 		ItemIdSetUnused(ii);	/* redundant */
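		/* shorten the line pointer array to drop the now-unused last slot */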
884 		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
885 
886 		if (indnkeyatts != indnatts && P_ISLEAF(opageop))
887 		{
888 			IndexTuple	truncated;
889 			Size		truncsz;
890 
891 			/*
892 			 * Truncate any non-key attributes from high key on leaf level
893 			 * (i.e. truncate on leaf level if we're building an INCLUDE
894 			 * index).  This is only done at the leaf level because downlinks
895 			 * in internal pages are either negative infinity items, or get
896 			 * their contents from copying from one level down.  See also:
897 			 * _bt_split().
898 			 *
899 			 * Since the truncated tuple is probably smaller than the
900 			 * original, it cannot just be copied in place (besides, we want
901 			 * to actually save space on the leaf page).  We delete the
902 			 * original high key, and add our own truncated high key at the
903 			 * same offset.
904 			 *
905 			 * Note that the page layout won't be changed very much.  oitup is
906 			 * already located at the physical beginning of tuple space, so we
907 			 * only shift the line pointer array back and forth, and overwrite
908 			 * the latter portion of the space occupied by the original tuple.
909 			 * This is fairly cheap.
910 			 */
911 			truncated = _bt_nonkey_truncate(wstate->index, oitup);
912 			truncsz = IndexTupleSize(truncated);
913 			PageIndexTupleDelete(opage, P_HIKEY);
914 			_bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
915 			pfree(truncated);
916 
917 			/* oitup should continue to point to the page's high key */
918 			hii = PageGetItemId(opage, P_HIKEY);
919 			oitup = (IndexTuple) PageGetItem(opage, hii);
920 		}
921 
922 		/*
923 		 * Link the old page into its parent, using its minimum key. If we
924 		 * don't have a parent, we have to create one; this adds a new btree
925 		 * level.
926 		 */
927 		if (state->btps_next == NULL)
928 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
929 
930 		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
931 			   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
932 			   P_LEFTMOST(opageop));
933 		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
934 			   !P_LEFTMOST(opageop));
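		/* store the finished page's block number as the minkey's downlink */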
935 		BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
936 		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
937 		pfree(state->btps_minkey);
938 
939 		/*
940 		 * Save a copy of the minimum key for the new page.  We have to copy
941 		 * it off the old page, not the new one, in case we are not at leaf
942 		 * level.
943 		 */
944 		state->btps_minkey = CopyIndexTuple(oitup);
945 
946 		/*
947 		 * Set the sibling links for both pages.
948 		 */
949 		{
950 			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
951 			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
952 
953 			oopaque->btpo_next = nblkno;
954 			nopaque->btpo_prev = oblkno;
955 			nopaque->btpo_next = P_NONE;	/* redundant */
956 		}
957 
958 		/*
959 		 * Write out the old page.  We never need to touch it again, so we can
960 		 * free the opage workspace too.
961 		 */
962 		_bt_blwritepage(wstate, opage, oblkno);
963 
964 		/*
965 		 * Reset last_off to point to new page
966 		 */
967 		last_off = P_FIRSTKEY;
968 	}
969 
970 	/*
971 	 * If the new item is the first for its page, stash a copy for later. Note
972 	 * this will only happen for the first item on a level; on later pages,
973 	 * the first item for a page is copied from the prior page in the code
974 	 * above.  Since the minimum key for an entire level is only used as a
975 	 * minus infinity downlink, and never as a high key, there is no need to
976 	 * truncate away non-key attributes at this point.
977 	 */
978 	if (last_off == P_HIKEY)
979 	{
980 		Assert(state->btps_minkey == NULL);
981 		state->btps_minkey = CopyIndexTuple(itup);
982 		/* _bt_sortaddtup() will perform full truncation later */
983 		BTreeTupleSetNAtts(state->btps_minkey, 0);
984 	}
985 
986 	/*
987 	 * Add the new item into the current page.
988 	 */
989 	last_off = OffsetNumberNext(last_off);
990 	_bt_sortaddtup(npage, itupsz, itup, last_off);
991 
992 	state->btps_page = npage;
993 	state->btps_blkno = nblkno;
994 	state->btps_lastoff = last_off;
995 }
996 
997 /*
998  * Finish writing out the completed btree.
999  */
1000 static void
1001 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1002 {
1003 	BTPageState *s;
1004 	BlockNumber rootblkno = P_NONE;
1005 	uint32		rootlevel = 0;
1006 	Page		metapage;
1007 
1008 	/*
1009 	 * Each iteration of this loop completes one more level of the tree.
1010 	 */
1011 	for (s = state; s != NULL; s = s->btps_next)
1012 	{
1013 		BlockNumber blkno;
1014 		BTPageOpaque opaque;
1015 
1016 		blkno = s->btps_blkno;
1017 		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1018 
1019 		/*
1020 		 * We have to link the last page on this level to somewhere.
1021 		 *
1022 		 * If we're at the top, it's the root, so attach it to the metapage.
1023 		 * Otherwise, add an entry for it to its parent using its minimum key.
1024 		 * This may cause the last page of the parent level to split, but
1025 		 * that's not a problem -- we haven't gotten to it yet.
1026 		 */
1027 		if (s->btps_next == NULL)
1028 		{
1029 			opaque->btpo_flags |= BTP_ROOT;
1030 			rootblkno = blkno;
1031 			rootlevel = s->btps_level;
1032 		}
1033 		else
1034 		{
1035 			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
1036 				   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
1037 				   P_LEFTMOST(opaque));
1038 			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
1039 				   !P_LEFTMOST(opaque));
1040 			BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
1041 			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
1042 			pfree(s->btps_minkey);
1043 			s->btps_minkey = NULL;
1044 		}
1045 
1046 		/*
1047 		 * This is the rightmost page, so the ItemId array needs to be slid
1048 		 * back one slot.  Then we can dump out the page.
1049 		 */
1050 		_bt_slideleft(s->btps_page);
1051 		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1052 		s->btps_page = NULL;	/* writepage freed the workspace */
1053 	}
1054 
1055 	/*
1056 	 * As the last step in the process, construct the metapage and make it
1057 	 * point to the new root (unless we had no data at all, in which case it's
1058 	 * set to point to "P_NONE").  This changes the index to the "valid" state
1059 	 * by filling in a valid magic number in the metapage.
1060 	 */
1061 	metapage = (Page) palloc(BLCKSZ);
1062 	_bt_initmetapage(metapage, rootblkno, rootlevel);
1063 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1064 }
1065 
1066 /*
1067  * Read tuples in correct sort order from tuplesort, and load them into
1068  * btree leaves.
1069  */
1070 static void
1071 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1072 {
1073 	BTPageState *state = NULL;
1074 	bool		merge = (btspool2 != NULL);
1075 	IndexTuple	itup,
1076 				itup2 = NULL;
1077 	bool		load1;
1078 	TupleDesc	tupdes = RelationGetDescr(wstate->index);
1079 	int			i,
1080 				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1081 	ScanKey		indexScanKey = NULL;
1082 	SortSupport sortKeys;
1083 
1084 	if (merge)
1085 	{
1086 		/*
1087 		 * Another BTSpool for dead tuples exists. Now we have to merge
1088 		 * btspool and btspool2.
1089 		 */
1090 
1091 		/* prepare for the merge */
1092 		itup = tuplesort_getindextuple(btspool->sortstate, true);
1093 		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1094 		indexScanKey = _bt_mkscankey_nodata(wstate->index);
1095 
1096 		/* Prepare SortSupport data for each column */
1097 		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1098 
1099 		for (i = 0; i < keysz; i++)
1100 		{
1101 			SortSupport sortKey = sortKeys + i;
1102 			ScanKey		scanKey = indexScanKey + i;
1103 			int16		strategy;
1104 
1105 			sortKey->ssup_cxt = CurrentMemoryContext;
1106 			sortKey->ssup_collation = scanKey->sk_collation;
1107 			sortKey->ssup_nulls_first =
1108 				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1109 			sortKey->ssup_attno = scanKey->sk_attno;
1110 			/* Abbreviation is not supported here */
1111 			sortKey->abbreviate = false;
1112 
1113 			AssertState(sortKey->ssup_attno != 0);
1114 
1115 			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1116 				BTGreaterStrategyNumber : BTLessStrategyNumber;
1117 
1118 			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1119 		}
1120 
1121 		_bt_freeskey(indexScanKey);
1122 
1123 		for (;;)
1124 		{
1125 			load1 = true;		/* load BTSpool next ? */
1126 			if (itup2 == NULL)
1127 			{
1128 				if (itup == NULL)
1129 					break;
1130 			}
1131 			else if (itup != NULL)
1132 			{
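				/*
				 * Both spools still have tuples; compare the current pair,
				 * key column by key column, to decide which spool to load
				 * from next.  On a tie we prefer the main spool.
				 */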
1133 				for (i = 1; i <= keysz; i++)
1134 				{
1135 					SortSupport entry;
1136 					Datum		attrDatum1,
1137 								attrDatum2;
1138 					bool		isNull1,
1139 								isNull2;
1140 					int32		compare;
1141 
1142 					entry = sortKeys + i - 1;
1143 					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1144 					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1145 
1146 					compare = ApplySortComparator(attrDatum1, isNull1,
1147 												  attrDatum2, isNull2,
1148 												  entry);
1149 					if (compare > 0)
1150 					{
1151 						load1 = false;
1152 						break;
1153 					}
1154 					else if (compare < 0)
1155 						break;
1156 				}
1157 			}
1158 			else
1159 				load1 = false;
1160 
1161 			/* When we see first tuple, create first index page */
1162 			if (state == NULL)
1163 				state = _bt_pagestate(wstate, 0);
1164 
1165 			if (load1)
1166 			{
1167 				_bt_buildadd(wstate, state, itup);
1168 				itup = tuplesort_getindextuple(btspool->sortstate, true);
1169 			}
1170 			else
1171 			{
1172 				_bt_buildadd(wstate, state, itup2);
1173 				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1174 			}
1175 		}
1176 		pfree(sortKeys);
1177 	}
1178 	else
1179 	{
1180 		/* merge is unnecessary */
1181 		while ((itup = tuplesort_getindextuple(btspool->sortstate,
1182 											   true)) != NULL)
1183 		{
1184 			/* When we see first tuple, create first index page */
1185 			if (state == NULL)
1186 				state = _bt_pagestate(wstate, 0);
1187 
1188 			_bt_buildadd(wstate, state, itup);
1189 		}
1190 	}
1191 
1192 	/* Close down final pages and write the metapage */
1193 	_bt_uppershutdown(wstate, state);
1194 
1195 	/*
1196 	 * If the index is WAL-logged, we must fsync it down to disk before it's
1197 	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
1198 	 * care since the index will be uninteresting after a crash anyway.)
1199 	 *
1200 	 * It's obvious that we must do this when not WAL-logging the build. It's
1201 	 * less obvious that we have to do it even if we did WAL-log the index
1202 	 * pages.  The reason is that since we're building outside shared buffers,
1203 	 * a CHECKPOINT occurring during the build has no way to flush the
1204 	 * previously written data to disk (indeed it won't know the index even
1205 	 * exists).  A crash later on would replay WAL from the checkpoint,
1206 	 * therefore it wouldn't replay our earlier WAL entries. If we do not
1207 	 * fsync those pages here, they might still not be on disk when the crash
1208 	 * occurs.
1209 	 */
1210 	if (RelationNeedsWAL(wstate->index))
1211 	{
1212 		RelationOpenSmgr(wstate->index);
1213 		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1214 	}
1215 }
1216 
1217 /*
1218  * Create parallel context, and launch workers for leader.
1219  *
1220  * buildstate argument should be initialized (with the exception of the
1221  * tuplesort state in spools, which may later be created based on shared
1222  * state initially set up here).
1223  *
1224  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1225  *
1226  * request is the target number of parallel worker processes to launch.
1227  *
1228  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1229  * mode by passing it to _bt_end_parallel() at the very end of its index
1230  * build.  If not even a single worker process can be launched, this is
1231  * never set, and caller should proceed with a serial index build.
1232  */
1233 static void
1234 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1235 {
1236 	ParallelContext *pcxt;
1237 	int			scantuplesortstates;
1238 	Snapshot	snapshot;
1239 	Size		estbtshared;
1240 	Size		estsort;
1241 	BTShared   *btshared;
1242 	Sharedsort *sharedsort;
1243 	Sharedsort *sharedsort2;
1244 	BTSpool    *btspool = buildstate->spool;
1245 	BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1246 	BufferUsage *bufferusage;
1247 	bool		leaderparticipates = true;
1248 	int			querylen;
1249 
1250 #ifdef DISABLE_LEADER_PARTICIPATION
1251 	leaderparticipates = false;
1252 #endif
1253 
1254 	/*
1255 	 * Enter parallel mode, and create context for parallel build of btree
1256 	 * index
1257 	 */
1258 	EnterParallelMode();
1259 	Assert(request > 0);
1260 	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1261 								 request, true);
1262 
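	/* the leader also contributes a tuplesort unless participation is disabled */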
1263 	scantuplesortstates = leaderparticipates ? request + 1 : request;
1264 
1265 	/*
1266 	 * Prepare for scan of the base relation.  In a normal index build, we use
1267 	 * SnapshotAny because we must retrieve all tuples and do our own time
1268 	 * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
1269 	 * concurrent build, we take a regular MVCC snapshot and index whatever's
1270 	 * live according to that.
1271 	 */
1272 	if (!isconcurrent)
1273 		snapshot = SnapshotAny;
1274 	else
1275 		snapshot = RegisterSnapshot(GetTransactionSnapshot());
1276 
1277 	/*
1278 	 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1279 	 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1280 	 */
1281 	estbtshared = _bt_parallel_estimate_shared(snapshot);
1282 	shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1283 	estsort = tuplesort_estimate_shared(scantuplesortstates);
1284 	shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1285 
1286 	/*
1287 	 * Unique case requires a second spool, and so we may have to account for
1288 	 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1289 	 */
1290 	if (!btspool->isunique)
1291 		shm_toc_estimate_keys(&pcxt->estimator, 2);
1292 	else
1293 	{
1294 		shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1295 		shm_toc_estimate_keys(&pcxt->estimator, 3);
1296 	}
1297 
1298 	/*
1299 	 * Estimate space for BufferUsage -- PARALLEL_KEY_BUFFER_USAGE.
1300 	 *
1301 	 * If there are no extensions loaded that care, we could skip this.  We
1302 	 * have no way of knowing whether anyone's looking at pgBufferUsage, so do
1303 	 * it unconditionally.
1304 	 */
1305 	shm_toc_estimate_chunk(&pcxt->estimator,
1306 						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
1307 	shm_toc_estimate_keys(&pcxt->estimator, 1);
1308 
1309 	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1310 	if (debug_query_string)
1311 	{
1312 		querylen = strlen(debug_query_string);
1313 		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1314 		shm_toc_estimate_keys(&pcxt->estimator, 1);
1315 	}
1316 	else
1317 		querylen = 0;			/* keep compiler quiet */
1318 
1319 	/* Everyone's had a chance to ask for space, so now create the DSM */
1320 	InitializeParallelDSM(pcxt);
1321 
1322 	/* If no DSM segment was available, back out (do serial build) */
1323 	if (pcxt->seg == NULL)
1324 	{
1325 		if (IsMVCCSnapshot(snapshot))
1326 			UnregisterSnapshot(snapshot);
1327 		DestroyParallelContext(pcxt);
1328 		ExitParallelMode();
1329 		return;
1330 	}
1331 
1332 	/* Store shared build state, for which we reserved space */
1333 	btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1334 	/* Initialize immutable state */
1335 	btshared->heaprelid = RelationGetRelid(btspool->heap);
1336 	btshared->indexrelid = RelationGetRelid(btspool->index);
1337 	btshared->isunique = btspool->isunique;
1338 	btshared->isconcurrent = isconcurrent;
1339 	btshared->scantuplesortstates = scantuplesortstates;
1340 	ConditionVariableInit(&btshared->workersdonecv);
1341 	SpinLockInit(&btshared->mutex);
1342 	/* Initialize mutable state */
1343 	btshared->nparticipantsdone = 0;
1344 	btshared->reltuples = 0.0;
1345 	btshared->havedead = false;
1346 	btshared->indtuples = 0.0;
1347 	btshared->brokenhotchain = false;
1348 	heap_parallelscan_initialize(&btshared->heapdesc, btspool->heap, snapshot);
1349 
1350 	/*
1351 	 * Store shared tuplesort-private state, for which we reserved space.
1352 	 * Then, initialize opaque state using tuplesort routine.
1353 	 */
1354 	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1355 	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1356 								pcxt->seg);
1357 
1358 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1359 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1360 
1361 	/* Unique case requires a second spool, and associated shared state */
1362 	if (!btspool->isunique)
1363 		sharedsort2 = NULL;
1364 	else
1365 	{
1366 		/*
1367 		 * Store additional shared tuplesort-private state, for which we
1368 		 * reserved space.  Then, initialize opaque state using tuplesort
1369 		 * routine.
1370 		 */
1371 		sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1372 		tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1373 									pcxt->seg);
1374 
1375 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1376 	}
1377 
1378 	/* Store query string for workers */
1379 	if (debug_query_string)
1380 	{
1381 		char	   *sharedquery;
1382 
1383 		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1384 		memcpy(sharedquery, debug_query_string, querylen + 1);
1385 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1386 	}
1387 
1388 	/* Allocate space for each worker's BufferUsage; no need to initialize */
1389 	bufferusage = shm_toc_allocate(pcxt->toc,
1390 								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
1391 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1392 
1393 	/* Launch workers, saving status for leader/caller */
1394 	LaunchParallelWorkers(pcxt);
1395 	btleader->pcxt = pcxt;
1396 	btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1397 	if (leaderparticipates)
1398 		btleader->nparticipanttuplesorts++;
1399 	btleader->btshared = btshared;
1400 	btleader->sharedsort = sharedsort;
1401 	btleader->sharedsort2 = sharedsort2;
1402 	btleader->snapshot = snapshot;
1403 	btleader->bufferusage = bufferusage;
1404 
1405 	/* If no workers were successfully launched, back out (do serial build) */
1406 	if (pcxt->nworkers_launched == 0)
1407 	{
1408 		_bt_end_parallel(btleader);
1409 		return;
1410 	}
1411 
1412 	/* Save leader state now that it's clear build will be parallel */
1413 	buildstate->btleader = btleader;
1414 
1415 	/* Join heap scan ourselves */
1416 	if (leaderparticipates)
1417 		_bt_leader_participate_as_worker(buildstate);
1418 
1419 	/*
1420 	 * Caller needs to wait for all launched workers when we return.  Make
1421 	 * sure that the failure-to-start case will not hang forever.
1422 	 */
1423 	WaitForParallelWorkersToAttach(pcxt);
1424 }
1425 
1426 /*
1427  * Shut down workers, destroy parallel context, and end parallel mode.
1428  */
1429 static void
1430 _bt_end_parallel(BTLeader *btleader)
1431 {
1432 	int			i;
1433 
1434 	/* Shutdown worker processes */
1435 	WaitForParallelWorkersToFinish(btleader->pcxt);
1436 
1437 	/*
1438 	 * Next, accumulate buffer usage.  (This must wait for the workers to
1439 	 * finish, or we might get incomplete data.)
1440 	 */
1441 	for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1442 		InstrAccumParallelQuery(&btleader->bufferusage[i]);
1443 
1444 	/* Free last reference to MVCC snapshot, if one was used */
1445 	if (IsMVCCSnapshot(btleader->snapshot))
1446 		UnregisterSnapshot(btleader->snapshot);
1447 	DestroyParallelContext(btleader->pcxt);
1448 	ExitParallelMode();
1449 }
1450 
1451 /*
1452  * Returns size of shared memory required to store state for a parallel
1453  * btree index build based on the snapshot its parallel scan will use.
1454  */
1455 static Size
1456 _bt_parallel_estimate_shared(Snapshot snapshot)
1457 {
1458 	if (!IsMVCCSnapshot(snapshot))
1459 	{
1460 		Assert(snapshot == SnapshotAny);
1461 		return sizeof(BTShared);
1462 	}
1463 
1464 	return add_size(offsetof(BTShared, heapdesc) +
1465 					offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1466 					EstimateSnapshotSpace(snapshot));
1467 }
1468 
1469 /*
1470  * Within leader, wait for end of heap scan.
1471  *
1472  * When called, parallel heap scan started by _bt_begin_parallel() will
1473  * already be underway within worker processes (when leader participates
1474  * as a worker, we should end up here just as workers are finishing).
1475  *
1476  * Fills in fields needed for ambuild statistics, and lets caller set
1477  * field indicating that some worker encountered a broken HOT chain.
1478  *
1479  * Returns the total number of heap tuples scanned.
1480  */
1481 static double
1482 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1483 {
1484 	BTShared   *btshared = buildstate->btleader->btshared;
1485 	int			nparticipanttuplesorts;
1486 	double		reltuples;
1487 
1488 	nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1489 	for (;;)
1490 	{
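		/* check, under the spinlock, whether every participant has finished */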
1491 		SpinLockAcquire(&btshared->mutex);
1492 		if (btshared->nparticipantsdone == nparticipanttuplesorts)
1493 		{
1494 			buildstate->havedead = btshared->havedead;
1495 			buildstate->indtuples = btshared->indtuples;
1496 			*brokenhotchain = btshared->brokenhotchain;
1497 			reltuples = btshared->reltuples;
1498 			SpinLockRelease(&btshared->mutex);
1499 			break;
1500 		}
1501 		SpinLockRelease(&btshared->mutex);
1502 
1503 		ConditionVariableSleep(&btshared->workersdonecv,
1504 							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1505 	}
1506 
1507 	ConditionVariableCancelSleep();
1508 
1509 	return reltuples;
1510 }
1511 
1512 /*
1513  * Within leader, participate as a parallel worker.
1514  */
1515 static void
1516 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1517 {
1518 	BTLeader   *btleader = buildstate->btleader;
1519 	BTSpool    *leaderworker;
1520 	BTSpool    *leaderworker2;
1521 	int			sortmem;
1522 
1523 	/* Allocate memory and initialize private spool */
1524 	leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1525 	leaderworker->heap = buildstate->spool->heap;
1526 	leaderworker->index = buildstate->spool->index;
1527 	leaderworker->isunique = buildstate->spool->isunique;
1528 
1529 	/* Initialize second spool, if required */
1530 	if (!btleader->btshared->isunique)
1531 		leaderworker2 = NULL;
1532 	else
1533 	{
1534 		/* Allocate memory for worker's own private secondary spool */
1535 		leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1536 
1537 		/* Initialize worker's own secondary spool */
1538 		leaderworker2->heap = leaderworker->heap;
1539 		leaderworker2->index = leaderworker->index;
1540 		leaderworker2->isunique = false;
1541 	}
1542 
1543 	/*
1544 	 * Might as well use a reliable figure when doling out maintenance_work_mem
1545 	 * (when the requested number of workers was not launched, this will be
1546 	 * somewhat higher than it is for other workers).
	 */
	sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;

	/* Perform work common to all participants */
	_bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
							   btleader->sharedsort, btleader->sharedsort2,
							   sortmem);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */
}

/*
 * Perform work within a launched parallel process.
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	BTSpool    *btspool;
	BTSpool    *btspool2;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	BufferUsage *bufferusage;
	int			sortmem;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up nbtree shared state */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
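	/*
	 * An ordinary build holds ShareLock on the heap and AccessExclusiveLock
	 * on the index; CREATE INDEX CONCURRENTLY (isconcurrent) holds only
	 * ShareUpdateExclusiveLock and RowExclusiveLock, so match whichever
	 * pair the leader is using.
	 */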
	if (!btshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = heap_open(btshared->heaprelid, heapLockmode);
	indexRel = index_open(btshared->indexrelid, indexLockmode);

	/* Initialize worker's own spool */
	btspool = (BTSpool *) palloc0(sizeof(BTSpool));
	btspool->heap = heapRel;
	btspool->index = indexRel;
	btspool->isunique = btshared->isunique;

	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (!btshared->isunique)
	{
		btspool2 = NULL;
		sharedsort2 = NULL;
	}
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));

		/* Initialize worker's own secondary spool */
		btspool2->heap = btspool->heap;
		btspool2->index = btspool->index;
		btspool2->isunique = false;
		/* Look up shared state private to tuplesort.c */
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/* Perform sorting of spool, and possibly a spool2 */
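	/*
	 * Each participating tuplesort gets an even share of
	 * maintenance_work_mem; scantuplesortstates counts every participating
	 * tuplesort, including the leader's own when it participates as a
	 * worker.
	 */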
	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
							   sharedsort2, sortmem);

	/* Report buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber]);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	index_close(indexRel, indexLockmode);
	heap_close(heapRel, heapLockmode);
}

/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for the passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e., for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
						   BTShared *btshared, Sharedsort *sharedsort,
						   Sharedsort *sharedsort2, int sortmem)
{
	SortCoordinate coordinate;
	BTBuildState buildstate;
	HeapScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state */
	coordinate = palloc0(sizeof(SortCoordinateData));
	coordinate->isWorker = true;
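	/* nParticipants is only meaningful in the leader; workers pass -1 */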
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/* Begin "partial" tuplesort */
	btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
													 btspool->index,
													 btspool->isunique,
													 sortmem, coordinate,
													 false);

	/*
	 * Just as in the serial case, there may be a second spool.  If so, a
	 * second, dedicated spool2 partial tuplesort is required.
	 */
	if (btspool2)
	{
		SortCoordinate coordinate2;

		/*
		 * We expect that the second one (for dead tuples) won't get very
		 * full, so we give it only work_mem (unless sortmem is less for this
		 * worker).  Worker processes are generally permitted to allocate
		 * work_mem independently.
		 */
		coordinate2 = palloc0(sizeof(SortCoordinateData));
		coordinate2->isWorker = true;
		coordinate2->nParticipants = -1;
		coordinate2->sharedsort = sharedsort2;
		btspool2->sortstate =
			tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
										Min(sortmem, work_mem), coordinate2,
										false);
	}

	/* Fill in buildstate for _bt_build_callback() */
	buildstate.isunique = btshared->isunique;
	buildstate.havedead = false;
	buildstate.heap = btspool->heap;
	buildstate.spool = btspool;
	buildstate.spool2 = btspool2;
	buildstate.indtuples = 0;
	buildstate.btleader = NULL;

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(btspool->index);
	indexInfo->ii_Concurrent = btshared->isconcurrent;
	scan = heap_beginscan_parallel(btspool->heap, &btshared->heapdesc);
	reltuples = IndexBuildHeapScan(btspool->heap, btspool->index, indexInfo,
								   true, _bt_build_callback,
								   (void *) &buildstate, scan);
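	/*
	 * reltuples covers only the portion of the heap this participant
	 * scanned; each participant adds its count to btshared->reltuples
	 * below, and the leader reads the total in _bt_parallel_heapscan().
	 */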

	/*
	 * Execute this worker's part of the sort.
	 *
	 * Unlike the leader and serial cases, we cannot skip calling
	 * tuplesort_performsort() for spool2 even if it ends up containing no
	 * dead tuples (skipping it is disallowed for workers by tuplesort).
	 */
	tuplesort_performsort(btspool->sortstate);
	if (btspool2)
		tuplesort_performsort(btspool2->sortstate);

	/*
	 * Done.  Record ambuild statistics, and whether we encountered a broken
	 * HOT chain.
	 */
	SpinLockAcquire(&btshared->mutex);
	btshared->nparticipantsdone++;
	btshared->reltuples += reltuples;
	if (buildstate.havedead)
		btshared->havedead = true;
	btshared->indtuples += buildstate.indtuples;
	if (indexInfo->ii_BrokenHotChain)
		btshared->brokenhotchain = true;
	SpinLockRelease(&btshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&btshared->workersdonecv);

	/* We can end tuplesorts immediately */
	tuplesort_end(btspool->sortstate);
	if (btspool2)
		tuplesort_end(btspool2->sortstate);
}