1 /*-------------------------------------------------------------------------
2  *
3  * nbtsort.c
4  *		Build a btree from sorted input by loading leaf pages sequentially.
5  *
6  * NOTES
7  *
8  * We use tuplesort.c to sort the given index tuples into order.
9  * Then we scan the index tuples in order and build the btree pages
10  * for each level.  We load source tuples into leaf-level pages.
11  * Whenever we fill a page at one level, we add a link to it in its
12  * parent level (starting a new parent level if necessary).  When
13  * done, we write out each final page on each level, adding it to
14  * its parent level.  When we have only one page on a level, it must be
15  * the root -- it can be attached to the btree metapage and we are done.
16  *
17  * It is not wise to pack the pages entirely full, since then *any*
18  * insertion would cause a split (and not only of the leaf page; the need
19  * for a split would cascade right up the tree).  The steady-state load
20  * factor for btrees is usually estimated at 70%.  We choose to pack leaf
21  * pages to the user-controllable fill factor (default 90%) while upper pages
22  * are always packed to 70%.  This gives us reasonable density (there aren't
23  * many upper pages if the keys are reasonable-size) without risking a lot of
24  * cascading splits during early insertions.
25  *
26  * Formerly the index pages being built were kept in shared buffers, but
27  * that is of no value (since other backends have no interest in them yet)
28  * and it created locking problems for CHECKPOINT, because the upper-level
29  * pages were held exclusive-locked for long periods.  Now we just build
30  * the pages in local memory and smgrwrite or smgrextend them as we finish
31  * them.  They will need to be re-read into shared buffers on first use after
32  * the build finishes.
33  *
34  * This code isn't concerned with the FSM at all. The caller is responsible
35  * for initializing that.
36  *
37  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
38  * Portions Copyright (c) 1994, Regents of the University of California
39  *
40  * IDENTIFICATION
41  *	  src/backend/access/nbtree/nbtsort.c
42  *
43  *-------------------------------------------------------------------------
44  */
45 
46 #include "postgres.h"
47 
48 #include "access/nbtree.h"
49 #include "access/parallel.h"
50 #include "access/relscan.h"
51 #include "access/table.h"
52 #include "access/xact.h"
53 #include "access/xlog.h"
54 #include "access/xloginsert.h"
55 #include "catalog/index.h"
56 #include "commands/progress.h"
57 #include "executor/instrument.h"
58 #include "miscadmin.h"
59 #include "pgstat.h"
60 #include "storage/smgr.h"
61 #include "tcop/tcopprot.h"		/* pgrminclude ignore */
62 #include "utils/rel.h"
63 #include "utils/sortsupport.h"
64 #include "utils/tuplesort.h"
65 
66 
67 /* Magic numbers for parallel state sharing */
68 #define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
69 #define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
70 #define PARALLEL_KEY_TUPLESORT_SPOOL2	UINT64CONST(0xA000000000000003)
71 #define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xA000000000000004)
72 #define PARALLEL_KEY_WAL_USAGE			UINT64CONST(0xA000000000000005)
73 #define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xA000000000000006)
74 
75 /*
76  * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
77  * parallel index builds.  This may be useful as a debugging aid.
78 #undef DISABLE_LEADER_PARTICIPATION
79  */
80 
81 /*
82  * Status record for spooling/sorting phase.  (Note we may have two of
83  * these due to the special requirements for uniqueness-checking with
84  * dead tuples.)
85  */
86 typedef struct BTSpool
87 {
88 	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
89 	Relation	heap;
90 	Relation	index;
91 	bool		isunique;
92 } BTSpool;
93 
94 /*
95  * Status for index builds performed in parallel.  This is allocated in a
96  * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
97  * entry, private to tuplesort.c but allocated by this module on its behalf.
98  */
99 typedef struct BTShared
100 {
101 	/*
102 	 * These fields are not modified during the sort.  They primarily exist
103 	 * for the benefit of worker processes that need to create BTSpool state
104 	 * corresponding to that used by the leader.
105 	 */
106 	Oid			heaprelid;
107 	Oid			indexrelid;
108 	bool		isunique;
109 	bool		isconcurrent;
110 	int			scantuplesortstates;
111 
112 	/*
113 	 * workersdonecv is used to monitor the progress of workers.  All parallel
114 	 * participants must indicate that they are done before leader can use
115 	 * mutable state that workers maintain during scan (and before leader can
116 	 * proceed to tuplesort_performsort()).
117 	 */
118 	ConditionVariable workersdonecv;
119 
120 	/*
121 	 * mutex protects the mutable state fields that follow in this struct.
122 	 *
123 	 * These fields contain status information of interest to B-Tree index
124 	 * builds that must work just the same when an index is built in parallel.
125 	 */
126 	slock_t		mutex;
127 
128 	/*
129 	 * Mutable state that is maintained by workers, and reported back to
130 	 * leader at end of parallel scan.
131 	 *
132 	 * nparticipantsdone is number of worker processes finished.
133 	 *
134 	 * reltuples is the total number of input heap tuples.
135 	 *
136 	 * havedead indicates if RECENTLY_DEAD tuples were encountered during
137 	 * build.
138 	 *
139 	 * indtuples is the total number of tuples that made it into the index.
140 	 *
141 	 * brokenhotchain indicates if any worker detected a broken HOT chain
142 	 * during build.
143 	 */
144 	int			nparticipantsdone;
145 	double		reltuples;
146 	bool		havedead;
147 	double		indtuples;
148 	bool		brokenhotchain;
149 
150 	/*
151 	 * ParallelTableScanDescData data follows. Can't directly embed here, as
152 	 * implementations of the parallel table scan desc interface might need
153 	 * stronger alignment.
154 	 */
155 } BTShared;
156 
157 /*
158  * Return pointer to a BTShared's parallel table scan.
159  *
160  * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
161  * MAXALIGN.
162  */
163 #define ParallelTableScanFromBTShared(shared) \
164 	(ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
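
/*
 * Illustrative sketch only (not a verbatim quote of code in this file): a
 * parallel participant that has attached to the shared BTShared state would
 * begin its share of the heap scan roughly like
 *
 *		scan = table_beginscan_parallel(heap,
 *										ParallelTableScanFromBTShared(btshared));
 *
 * where "heap" is that participant's own opened heap relation.
 */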
165 
166 /*
167  * Status for leader in parallel index build.
168  */
169 typedef struct BTLeader
170 {
171 	/* parallel context itself */
172 	ParallelContext *pcxt;
173 
174 	/*
175 	 * nparticipanttuplesorts is the exact number of worker processes
176 	 * successfully launched, plus one leader process if it participates as a
177 	 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
178 	 * participating as a worker).
179 	 */
180 	int			nparticipanttuplesorts;
181 
182 	/*
183 	 * Leader process convenience pointers to shared state (leader avoids TOC
184 	 * lookups).
185 	 *
186 	 * btshared is the shared state for entire build.  sharedsort is the
187 	 * shared, tuplesort-managed state passed to each process tuplesort.
188 	 * sharedsort2 is the corresponding btspool2 shared state, used only when
189 	 * building unique indexes.  snapshot is the snapshot used by the scan iff
190 	 * an MVCC snapshot is required.
191 	 */
192 	BTShared   *btshared;
193 	Sharedsort *sharedsort;
194 	Sharedsort *sharedsort2;
195 	Snapshot	snapshot;
196 	WalUsage   *walusage;
197 	BufferUsage *bufferusage;
198 } BTLeader;
199 
200 /*
201  * Working state for btbuild and its callback.
202  *
203  * When parallel CREATE INDEX is used, there is a BTBuildState for each
204  * participant.
205  */
206 typedef struct BTBuildState
207 {
208 	bool		isunique;
209 	bool		havedead;
210 	Relation	heap;
211 	BTSpool    *spool;
212 
213 	/*
214 	 * spool2 is needed only when the index is a unique index. Dead tuples are
215 	 * put into spool2 instead of spool in order to avoid the uniqueness check.
216 	 */
217 	BTSpool    *spool2;
218 	double		indtuples;
219 
220 	/*
221 	 * btleader is only present when a parallel index build is performed, and
222 	 * only in the leader process. (Actually, only the leader has a
223 	 * BTBuildState.  Workers have their own spool and spool2, though.)
224 	 */
225 	BTLeader   *btleader;
226 } BTBuildState;
227 
228 /*
229  * Status record for a btree page being built.  We have one of these
230  * for each active tree level.
231  */
232 typedef struct BTPageState
233 {
234 	Page		btps_page;		/* workspace for page building */
235 	BlockNumber btps_blkno;		/* block # to write this page at */
236 	IndexTuple	btps_lowkey;	/* page's strict lower bound pivot tuple */
237 	OffsetNumber btps_lastoff;	/* last item offset loaded */
238 	Size		btps_lastextra; /* last item's extra posting list space */
239 	uint32		btps_level;		/* tree level (0 = leaf) */
240 	Size		btps_full;		/* "full" if less than this much free space */
241 	struct BTPageState *btps_next;	/* link to parent level, if any */
242 } BTPageState;
243 
244 /*
245  * Overall status record for index writing phase.
246  */
247 typedef struct BTWriteState
248 {
249 	Relation	heap;
250 	Relation	index;
251 	BTScanInsert inskey;		/* generic insertion scankey */
252 	bool		btws_use_wal;	/* dump pages to WAL? */
253 	BlockNumber btws_pages_alloced; /* # pages allocated */
254 	BlockNumber btws_pages_written; /* # pages written out */
255 	Page		btws_zeropage;	/* workspace for filling zeroes */
256 } BTWriteState;
257 
258 
259 static double _bt_spools_heapscan(Relation heap, Relation index,
260 								  BTBuildState *buildstate, IndexInfo *indexInfo);
261 static void _bt_spooldestroy(BTSpool *btspool);
262 static void _bt_spool(BTSpool *btspool, ItemPointer self,
263 					  Datum *values, bool *isnull);
264 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
265 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
266 							   bool *isnull, bool tupleIsAlive, void *state);
267 static Page _bt_blnewpage(uint32 level);
268 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
269 static void _bt_slideleft(Page rightmostpage);
270 static void _bt_sortaddtup(Page page, Size itemsize,
271 						   IndexTuple itup, OffsetNumber itup_off,
272 						   bool newfirstdataitem);
273 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
274 						 IndexTuple itup, Size truncextra);
275 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
276 										  BTPageState *state,
277 										  BTDedupState dstate);
278 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
279 static void _bt_load(BTWriteState *wstate,
280 					 BTSpool *btspool, BTSpool *btspool2);
281 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
282 							   int request);
283 static void _bt_end_parallel(BTLeader *btleader);
284 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
285 static double _bt_parallel_heapscan(BTBuildState *buildstate,
286 									bool *brokenhotchain);
287 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
288 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
289 									   BTShared *btshared, Sharedsort *sharedsort,
290 									   Sharedsort *sharedsort2, int sortmem,
291 									   bool progress);
292 
293 
294 /*
295  *	btbuild() -- build a new btree index.
296  */
297 IndexBuildResult *
298 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
299 {
300 	IndexBuildResult *result;
301 	BTBuildState buildstate;
302 	double		reltuples;
303 
304 #ifdef BTREE_BUILD_STATS
305 	if (log_btree_build_stats)
306 		ResetUsage();
307 #endif							/* BTREE_BUILD_STATS */
308 
309 	buildstate.isunique = indexInfo->ii_Unique;
310 	buildstate.havedead = false;
311 	buildstate.heap = heap;
312 	buildstate.spool = NULL;
313 	buildstate.spool2 = NULL;
314 	buildstate.indtuples = 0;
315 	buildstate.btleader = NULL;
316 
317 	/*
318 	 * We expect to be called exactly once for any index relation. If that's
319 	 * not the case, big trouble's what we have.
320 	 */
321 	if (RelationGetNumberOfBlocks(index) != 0)
322 		elog(ERROR, "index \"%s\" already contains data",
323 			 RelationGetRelationName(index));
324 
325 	reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
326 
327 	/*
328 	 * Finish the build by (1) completing the sort of the spool file, (2)
329 	 * inserting the sorted tuples into btree pages and (3) building the upper
330 	 * levels.  Finally, it may also be necessary to end use of parallelism.
331 	 */
332 	_bt_leafbuild(buildstate.spool, buildstate.spool2);
333 	_bt_spooldestroy(buildstate.spool);
334 	if (buildstate.spool2)
335 		_bt_spooldestroy(buildstate.spool2);
336 	if (buildstate.btleader)
337 		_bt_end_parallel(buildstate.btleader);
338 
339 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
340 
341 	result->heap_tuples = reltuples;
342 	result->index_tuples = buildstate.indtuples;
343 
344 #ifdef BTREE_BUILD_STATS
345 	if (log_btree_build_stats)
346 	{
347 		ShowUsage("BTREE BUILD STATS");
348 		ResetUsage();
349 	}
350 #endif							/* BTREE_BUILD_STATS */
351 
352 	return result;
353 }
354 
355 /*
356  * Create and initialize one or two spool structures, and save them in caller's
357  * buildstate argument.  May also fill in fields within indexInfo used by index
358  * builds.
359  *
360  * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
361  * routine encapsulates all aspects of managing parallelism.  Caller need only
362  * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
363  *
364  * Returns the total number of heap tuples scanned.
365  */
366 static double
367 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
368 					IndexInfo *indexInfo)
369 {
370 	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
371 	SortCoordinate coordinate = NULL;
372 	double		reltuples = 0;
373 
374 	/*
375 	 * We size the sort area as maintenance_work_mem rather than work_mem to
376 	 * speed index creation.  This should be OK since a single backend can't
377 	 * run multiple index creations in parallel (see also: notes on
378 	 * parallelism and maintenance_work_mem below).
379 	 */
380 	btspool->heap = heap;
381 	btspool->index = index;
382 	btspool->isunique = indexInfo->ii_Unique;
383 
384 	/* Save as primary spool */
385 	buildstate->spool = btspool;
386 
387 	/* Report table scan phase started */
388 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
389 								 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
390 
391 	/* Attempt to launch parallel worker scan when required */
392 	if (indexInfo->ii_ParallelWorkers > 0)
393 		_bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
394 						   indexInfo->ii_ParallelWorkers);
395 
396 	/*
397 	 * If parallel build requested and at least one worker process was
398 	 * successfully launched, set up coordination state
399 	 */
400 	if (buildstate->btleader)
401 	{
402 		coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
403 		coordinate->isWorker = false;
404 		coordinate->nParticipants =
405 			buildstate->btleader->nparticipanttuplesorts;
406 		coordinate->sharedsort = buildstate->btleader->sharedsort;
407 	}
408 
409 	/*
410 	 * Begin serial/leader tuplesort.
411 	 *
412 	 * In cases where parallelism is involved, the leader receives the same
413 	 * share of maintenance_work_mem as a serial sort (it is generally treated
414 	 * in the same way as a serial sort once we return).  Parallel worker
415 	 * Tuplesortstates will have received only a fraction of
416 	 * maintenance_work_mem, though.
417 	 *
418 	 * We rely on the lifetime of the Leader Tuplesortstate almost not
419 	 * overlapping with any worker Tuplesortstate's lifetime.  There may be
420 	 * some small overlap, but that's okay because we rely on leader
421 	 * Tuplesortstate only allocating a small, fixed amount of memory here.
422 	 * When its tuplesort_performsort() is called (by our caller), and
423 	 * significant amounts of memory are likely to be used, all workers must
424 	 * have already freed almost all memory held by their Tuplesortstates
425 	 * (they are about to go away completely, too).  The overall effect is
426 	 * that maintenance_work_mem always represents an absolute high watermark
427 	 * on the amount of memory used by a CREATE INDEX operation, regardless of
428 	 * the use of parallelism or any other factor.
429 	 */
430 	buildstate->spool->sortstate =
431 		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
432 									maintenance_work_mem, coordinate,
433 									false);
434 
435 	/*
436 	 * If building a unique index, put dead tuples in a second spool to keep
437 	 * them out of the uniqueness check.  We expect that the second spool (for
438 	 * dead tuples) won't get very full, so we give it only work_mem.
439 	 */
440 	if (indexInfo->ii_Unique)
441 	{
442 		BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
443 		SortCoordinate coordinate2 = NULL;
444 
445 		/* Initialize secondary spool */
446 		btspool2->heap = heap;
447 		btspool2->index = index;
448 		btspool2->isunique = false;
449 		/* Save as secondary spool */
450 		buildstate->spool2 = btspool2;
451 
452 		if (buildstate->btleader)
453 		{
454 			/*
455 			 * Set up non-private state that is passed to
456 			 * tuplesort_begin_index_btree() about the basic high level
457 			 * coordination of a parallel sort.
458 			 */
459 			coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
460 			coordinate2->isWorker = false;
461 			coordinate2->nParticipants =
462 				buildstate->btleader->nparticipanttuplesorts;
463 			coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464 		}
465 
466 		/*
467 		 * We expect that the second one (for dead tuples) won't get very
468 		 * full, so we give it only work_mem
469 		 */
470 		buildstate->spool2->sortstate =
471 			tuplesort_begin_index_btree(heap, index, false, work_mem,
472 										coordinate2, false);
473 	}
474 
475 	/* Fill spool using either serial or parallel heap scan */
476 	if (!buildstate->btleader)
477 		reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
478 										   _bt_build_callback, (void *) buildstate,
479 										   NULL);
480 	else
481 		reltuples = _bt_parallel_heapscan(buildstate,
482 										  &indexInfo->ii_BrokenHotChain);
483 
484 	/*
485 	 * Set the progress target for the next phase.  Reset the block number
486 	 * values set by table_index_build_scan.
487 	 */
488 	{
489 		const int	progress_index[] = {
490 			PROGRESS_CREATEIDX_TUPLES_TOTAL,
491 			PROGRESS_SCAN_BLOCKS_TOTAL,
492 			PROGRESS_SCAN_BLOCKS_DONE
493 		};
494 		const int64 progress_vals[] = {
495 			buildstate->indtuples,
496 			0, 0
497 		};
498 
499 		pgstat_progress_update_multi_param(3, progress_index, progress_vals);
500 	}
501 
502 	/* okay, all heap tuples are spooled */
503 	if (buildstate->spool2 && !buildstate->havedead)
504 	{
505 		/* spool2 turns out to be unnecessary */
506 		_bt_spooldestroy(buildstate->spool2);
507 		buildstate->spool2 = NULL;
508 	}
509 
510 	return reltuples;
511 }
512 
513 /*
514  * clean up a spool structure and its substructures.
515  */
516 static void
517 _bt_spooldestroy(BTSpool *btspool)
518 {
519 	tuplesort_end(btspool->sortstate);
520 	pfree(btspool);
521 }
522 
523 /*
524  * spool an index entry into the sort file.
525  */
526 static void
527 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
528 {
529 	tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
530 								  self, values, isnull);
531 }
532 
533 /*
534  * given a spool loaded by successive calls to _bt_spool,
535  * create an entire btree.
536  */
537 static void
538 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
539 {
540 	BTWriteState wstate;
541 
542 #ifdef BTREE_BUILD_STATS
543 	if (log_btree_build_stats)
544 	{
545 		ShowUsage("BTREE BUILD (Spool) STATISTICS");
546 		ResetUsage();
547 	}
548 #endif							/* BTREE_BUILD_STATS */
549 
550 	/* Execute the sort */
551 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
552 								 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
553 	tuplesort_performsort(btspool->sortstate);
554 	if (btspool2)
555 	{
556 		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
557 									 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
558 		tuplesort_performsort(btspool2->sortstate);
559 	}
560 
561 	wstate.heap = btspool->heap;
562 	wstate.index = btspool->index;
563 	wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564 	/* _bt_mkscankey() won't set allequalimage without metapage */
565 	wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566 	wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
567 
568 	/* reserve the metapage */
569 	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
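	/* (BTREE_METAPAGE is block 0, so the first data page will be block 1) */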
570 	wstate.btws_pages_written = 0;
571 	wstate.btws_zeropage = NULL;	/* until needed */
572 
573 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
574 								 PROGRESS_BTREE_PHASE_LEAF_LOAD);
575 	_bt_load(&wstate, btspool, btspool2);
576 }
577 
578 /*
579  * Per-tuple callback for table_index_build_scan
580  */
581 static void
582 _bt_build_callback(Relation index,
583 				   ItemPointer tid,
584 				   Datum *values,
585 				   bool *isnull,
586 				   bool tupleIsAlive,
587 				   void *state)
588 {
589 	BTBuildState *buildstate = (BTBuildState *) state;
590 
591 	/*
592 	 * insert the index tuple into the appropriate spool file for subsequent
593 	 * processing
594 	 */
595 	if (tupleIsAlive || buildstate->spool2 == NULL)
596 		_bt_spool(buildstate->spool, tid, values, isnull);
597 	else
598 	{
599 		/* dead tuples are put into spool2 */
600 		buildstate->havedead = true;
601 		_bt_spool(buildstate->spool2, tid, values, isnull);
602 	}
603 
604 	buildstate->indtuples += 1;
605 }
606 
607 /*
608  * allocate workspace for a new, clean btree page, not linked to any siblings.
609  */
610 static Page
611 _bt_blnewpage(uint32 level)
612 {
613 	Page		page;
614 	BTPageOpaque opaque;
615 
616 	page = (Page) palloc(BLCKSZ);
617 
618 	/* Zero the page and set up standard page header info */
619 	_bt_pageinit(page, BLCKSZ);
620 
621 	/* Initialize BT opaque state */
622 	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
623 	opaque->btpo_prev = opaque->btpo_next = P_NONE;
624 	opaque->btpo_level = level;
625 	opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
626 	opaque->btpo_cycleid = 0;
627 
628 	/* Make the P_HIKEY line pointer appear allocated */
629 	((PageHeader) page)->pd_lower += sizeof(ItemIdData);
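	/*
	 * (The line pointer itself is left unset for now.  _bt_buildadd() installs
	 * a real high key there when the page is finished off, and _bt_slideleft()
	 * reclaims the slot on the level's rightmost page, which gets no high key.)
	 */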
630 
631 	return page;
632 }
633 
634 /*
635  * emit a completed btree page, and release the working storage.
636  */
637 static void
638 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
639 {
640 	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
641 	RelationOpenSmgr(wstate->index);
642 
643 	/* XLOG stuff */
644 	if (wstate->btws_use_wal)
645 	{
646 		/* We use the XLOG_FPI record type for this */
647 		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
648 	}
649 
650 	/*
651 	 * If we have to write pages nonsequentially, fill in the space with
652 	 * zeroes until we come back and overwrite.  This is not logically
653 	 * necessary on standard Unix filesystems (unwritten space will read as
654 	 * zeroes anyway), but it should help to avoid fragmentation. The dummy
655 	 * pages aren't WAL-logged though.
656 	 */
657 	while (blkno > wstate->btws_pages_written)
658 	{
659 		if (!wstate->btws_zeropage)
660 			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
661 		/* don't set checksum for all-zero page */
662 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
663 				   wstate->btws_pages_written++,
664 				   (char *) wstate->btws_zeropage,
665 				   true);
666 	}
667 
668 	PageSetChecksumInplace(page, blkno);
669 
670 	/*
671 	 * Now write the page.  There's no need for smgr to schedule an fsync for
672 	 * this write; we'll do it ourselves before ending the build.
673 	 */
674 	if (blkno == wstate->btws_pages_written)
675 	{
676 		/* extending the file... */
677 		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
678 				   (char *) page, true);
679 		wstate->btws_pages_written++;
680 	}
681 	else
682 	{
683 		/* overwriting a block we zero-filled before */
684 		smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
685 				  (char *) page, true);
686 	}
687 
688 	pfree(page);
689 }
690 
691 /*
692  * allocate and initialize a new BTPageState.  the returned structure
693  * is suitable for immediate use by _bt_buildadd.
694  */
695 static BTPageState *
696 _bt_pagestate(BTWriteState *wstate, uint32 level)
697 {
698 	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
699 
700 	/* create initial page for level */
701 	state->btps_page = _bt_blnewpage(level);
702 
703 	/* and assign it a page position */
704 	state->btps_blkno = wstate->btws_pages_alloced++;
705 
706 	state->btps_lowkey = NULL;
707 	/* initialize lastoff so first item goes into P_FIRSTKEY */
708 	state->btps_lastoff = P_HIKEY;
709 	state->btps_lastextra = 0;
710 	state->btps_level = level;
711 	/* set "full" threshold based on level.  See notes at head of file. */
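	/*
	 * For example, assuming the default 8192-byte BLCKSZ: non-leaf pages are
	 * treated as full once less than 8192 * (100 - 70) / 100 = 2457 bytes of
	 * free space remain, while leaf pages use the index's fillfactor setting
	 * (90 by default), as described in the notes at the head of this file.
	 */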
712 	if (level > 0)
713 		state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
714 	else
715 		state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
716 
717 	/* no parent level, yet */
718 	state->btps_next = NULL;
719 
720 	return state;
721 }
722 
723 /*
724  * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
725  * P_HIKEY, overwriting P_HIKEY).
726  *
727  * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
728  * rightmost page on its level is not supposed to get a high key.  Now that
729  * it's clear that this page is a rightmost page, remove the unneeded empty
730  * P_HIKEY line pointer space.
731  */
732 static void
733 _bt_slideleft(Page rightmostpage)
734 {
735 	OffsetNumber off;
736 	OffsetNumber maxoff;
737 	ItemId		previi;
738 
739 	maxoff = PageGetMaxOffsetNumber(rightmostpage);
740 	Assert(maxoff >= P_FIRSTKEY);
741 	previi = PageGetItemId(rightmostpage, P_HIKEY);
742 	for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
743 	{
744 		ItemId		thisii = PageGetItemId(rightmostpage, off);
745 
746 		*previi = *thisii;
747 		previi = thisii;
748 	}
749 	((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
750 }
751 
752 /*
753  * Add an item to a page being built.
754  *
755  * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
756  * raises an error directly.
757  *
758  * Note that our nbtsort.c caller does not know yet if the page will be
759  * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
760  * caller.  A page that turns out to be the rightmost on its level is fixed by
761  * calling _bt_slideleft().
762  */
763 static void
764 _bt_sortaddtup(Page page,
765 			   Size itemsize,
766 			   IndexTuple itup,
767 			   OffsetNumber itup_off,
768 			   bool newfirstdataitem)
769 {
770 	IndexTupleData trunctuple;
771 
772 	if (newfirstdataitem)
773 	{
774 		trunctuple = *itup;
775 		trunctuple.t_info = sizeof(IndexTupleData);
776 		BTreeTupleSetNAtts(&trunctuple, 0, false);
777 		itup = &trunctuple;
778 		itemsize = sizeof(IndexTupleData);
779 	}
780 
781 	if (PageAddItem(page, (Item) itup, itemsize, itup_off,
782 					false, false) == InvalidOffsetNumber)
783 		elog(ERROR, "failed to add item to the index page");
784 }
785 
786 /*----------
787  * Add an item to a disk page from the sort output (or add a posting list
788  * item formed from the sort output).
789  *
790  * We must be careful to observe the page layout conventions of nbtsearch.c:
791  * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
792  * - on non-leaf pages, the key portion of the first item need not be
793  *	 stored, we should store only the link.
794  *
795  * A leaf page being built looks like:
796  *
797  * +----------------+---------------------------------+
798  * | PageHeaderData | linp0 linp1 linp2 ...           |
799  * +-----------+----+---------------------------------+
800  * | ... linpN |									  |
801  * +-----------+--------------------------------------+
802  * |	 ^ last										  |
803  * |												  |
804  * +-------------+------------------------------------+
805  * |			 | itemN ...                          |
806  * +-------------+------------------+-----------------+
807  * |		  ... item3 item2 item1 | "special space" |
808  * +--------------------------------+-----------------+
809  *
810  * Contrast this with the diagram in bufpage.h; note the mismatch
811  * between linps and items.  This is because we reserve linp0 as a
812  * placeholder for the pointer to the "high key" item; when we have
813  * filled up the page, we will set linp0 to point to itemN and clear
814  * linpN.  On the other hand, if we find this is the last (rightmost)
815  * page, we leave the items alone and slide the linp array over.  If
816  * the high key is to be truncated, offset 1 is deleted, and we insert
817  * the truncated high key at offset 1.
818  *
819  * 'last' pointer indicates the last offset added to the page.
820  *
821  * 'truncextra' is the size of the posting list in itup, if any.  This
822  * information is stashed for the next call here, when we may benefit
823  * from considering the impact of truncating away the posting list on
824  * the page before deciding to finish the page off.  Posting lists are
825  * often relatively large, so it is worth going to the trouble of
826  * accounting for the saving from truncating away the posting list of
827  * the tuple that becomes the high key (that may be the only way to
828  * get close to target free space on the page).  Note that this is
829  * only used for the soft fillfactor-wise limit, not the critical hard
830  * limit.
831  *----------
832  */
833 static void
834 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
835 			 Size truncextra)
836 {
837 	Page		npage;
838 	BlockNumber nblkno;
839 	OffsetNumber last_off;
840 	Size		last_truncextra;
841 	Size		pgspc;
842 	Size		itupsz;
843 	bool		isleaf;
844 
845 	/*
846 	 * This is a handy place to check for cancel interrupts during the btree
847 	 * load phase of index creation.
848 	 */
849 	CHECK_FOR_INTERRUPTS();
850 
851 	npage = state->btps_page;
852 	nblkno = state->btps_blkno;
853 	last_off = state->btps_lastoff;
854 	last_truncextra = state->btps_lastextra;
855 	state->btps_lastextra = truncextra;
856 
857 	pgspc = PageGetFreeSpace(npage);
858 	itupsz = IndexTupleSize(itup);
859 	itupsz = MAXALIGN(itupsz);
860 	/* Leaf case has slightly different rules due to suffix truncation */
861 	isleaf = (state->btps_level == 0);
862 
863 	/*
864 	 * Check whether the new item can fit on a btree page on current level at
865 	 * all.
866 	 *
867 	 * Every newly built index will treat heap TID as part of the keyspace,
868 	 * which imposes the requirement that new high keys must occasionally have
869 	 * a heap TID appended within _bt_truncate().  That may leave a new pivot
870 	 * tuple one or two MAXALIGN() quantums larger than the original
871 	 * firstright tuple it's derived from.  v4 deals with the problem by
872 	 * decreasing the limit on the size of tuples inserted on the leaf level
873 	 * by the same small amount.  Enforce the new v4+ limit on the leaf level,
874 	 * and the old limit on internal levels, since pivot tuples may need to
875 	 * make use of the reserved space.  This should never fail on internal
876 	 * pages.
877 	 */
878 	if (unlikely(itupsz > BTMaxItemSize(npage)))
879 		_bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
880 							 itup);
881 
882 	/*
883 	 * Check to see if current page will fit new item, with space left over to
884 	 * append a heap TID during suffix truncation when page is a leaf page.
885 	 *
886 	 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
887 	 * high key with heap TID when finishing off a leaf page, since we rely on
888 	 * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
889 	 * internal pages we can always fit 3 pivot tuples with larger internal
890 	 * page tuple limit (includes page high key).
891 	 *
892 	 * Most of the time, a page is only "full" in the sense that the soft
893 	 * fillfactor-wise limit has been exceeded.  However, we must always leave
894 	 * at least two items plus a high key on each page before starting a new
895 	 * page.  Disregard fillfactor and insert on "full" current page if we
896 	 * don't have the minimum number of items yet.  (Note that we deliberately
897 	 * assume that suffix truncation neither enlarges nor shrinks new high key
898 	 * when applying soft limit, except when last tuple has a posting list.)
899 	 */
900 	Assert(last_truncextra == 0 || isleaf);
901 	if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
902 		(pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
903 	{
904 		/*
905 		 * Finish off the page and write it out.
906 		 */
907 		Page		opage = npage;
908 		BlockNumber oblkno = nblkno;
909 		ItemId		ii;
910 		ItemId		hii;
911 		IndexTuple	oitup;
912 
913 		/* Create new page of same level */
914 		npage = _bt_blnewpage(state->btps_level);
915 
916 		/* and assign it a page position */
917 		nblkno = wstate->btws_pages_alloced++;
918 
919 		/*
920 		 * We copy the last item on the page into the new page, and then
921 		 * rearrange the old page so that the 'last item' becomes its high key
922 		 * rather than a true data item.  There had better be at least two
923 		 * items on the page already, else the page would be empty of useful
924 		 * data.
925 		 */
926 		Assert(last_off > P_FIRSTKEY);
927 		ii = PageGetItemId(opage, last_off);
928 		oitup = (IndexTuple) PageGetItem(opage, ii);
929 		_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
930 					   !isleaf);
931 
932 		/*
933 		 * Move 'last' into the high key position on opage.  _bt_blnewpage()
934 		 * allocated empty space for a line pointer when opage was first
935 		 * created, so this is a matter of rearranging already-allocated space
936 		 * on page, and initializing high key line pointer. (Actually, leaf
937 		 * pages must also swap oitup with a truncated version of oitup, which
938 		 * is sometimes larger than oitup, though never by more than the space
939 		 * needed to append a heap TID.)
940 		 */
941 		hii = PageGetItemId(opage, P_HIKEY);
942 		*hii = *ii;
943 		ItemIdSetUnused(ii);	/* redundant */
944 		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
945 
946 		if (isleaf)
947 		{
948 			IndexTuple	lastleft;
949 			IndexTuple	truncated;
950 
951 			/*
952 			 * Truncate away any unneeded attributes from high key on leaf
953 			 * level.  This is only done at the leaf level because downlinks
954 			 * in internal pages are either negative infinity items, or get
955 			 * their contents from copying from one level down.  See also:
956 			 * _bt_split().
957 			 *
958 			 * We don't try to bias our choice of split point to make it more
959 			 * likely that _bt_truncate() can truncate away more attributes,
960 			 * whereas the split point used within _bt_split() is chosen much
961 			 * more delicately.  Even still, the lastleft and firstright
962 			 * tuples passed to _bt_truncate() here are at least not fully
963 			 * equal to each other when deduplication is used, unless there is
964 			 * a large group of duplicates (also, unique index builds usually
965 			 * have few or no spool2 duplicates).  When the split point is
966 			 * between two unequal tuples, _bt_truncate() will avoid including
967 			 * a heap TID in the new high key, which is the most important
968 			 * benefit of suffix truncation.
969 			 *
970 			 * Overwrite the old item with new truncated high key directly.
971 			 * oitup is already located at the physical beginning of tuple
972 			 * space, so this should directly reuse the existing tuple space.
973 			 */
974 			ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
975 			lastleft = (IndexTuple) PageGetItem(opage, ii);
976 
977 			Assert(IndexTupleSize(oitup) > last_truncextra);
978 			truncated = _bt_truncate(wstate->index, lastleft, oitup,
979 									 wstate->inskey);
980 			if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
981 										 IndexTupleSize(truncated)))
982 				elog(ERROR, "failed to add high key to the index page");
983 			pfree(truncated);
984 
985 			/* oitup should continue to point to the page's high key */
986 			hii = PageGetItemId(opage, P_HIKEY);
987 			oitup = (IndexTuple) PageGetItem(opage, hii);
988 		}
989 
990 		/*
991 		 * Link the old page into its parent, using its low key.  If we don't
992 		 * have a parent, we have to create one; this adds a new btree level.
993 		 */
994 		if (state->btps_next == NULL)
995 			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
996 
997 		Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
998 				IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
999 				BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
1000 			   P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1001 		Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1002 			   !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1003 		BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1004 		_bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1005 		pfree(state->btps_lowkey);
1006 
1007 		/*
1008 		 * Save a copy of the high key from the old page.  It is also the low
1009 		 * key for the new page.
1010 		 */
1011 		state->btps_lowkey = CopyIndexTuple(oitup);
1012 
1013 		/*
1014 		 * Set the sibling links for both pages.
1015 		 */
1016 		{
1017 			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1018 			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1019 
1020 			oopaque->btpo_next = nblkno;
1021 			nopaque->btpo_prev = oblkno;
1022 			nopaque->btpo_next = P_NONE;	/* redundant */
1023 		}
1024 
1025 		/*
1026 		 * Write out the old page.  We never need to touch it again, so we can
1027 		 * free the opage workspace too.
1028 		 */
1029 		_bt_blwritepage(wstate, opage, oblkno);
1030 
1031 		/*
1032 		 * Reset last_off to point to new page
1033 		 */
1034 		last_off = P_FIRSTKEY;
1035 	}
1036 
1037 	/*
1038 	 * By here, either original page is still the current page, or a new page
1039 	 * was created that became the current page.  Either way, the current page
1040 	 * definitely has space for new item.
1041 	 *
1042 	 * If the new item is the first for its page, it must also be the first
1043 	 * item on its entire level.  On later same-level pages, a low key for a
1044 	 * page will be copied from the prior page in the code above.  Generate a
1045 	 * minus infinity low key here instead.
1046 	 */
1047 	if (last_off == P_HIKEY)
1048 	{
1049 		Assert(state->btps_lowkey == NULL);
1050 		state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1051 		state->btps_lowkey->t_info = sizeof(IndexTupleData);
1052 		BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1053 	}
1054 
1055 	/*
1056 	 * Add the new item into the current page.
1057 	 */
1058 	last_off = OffsetNumberNext(last_off);
1059 	_bt_sortaddtup(npage, itupsz, itup, last_off,
1060 				   !isleaf && last_off == P_FIRSTKEY);
1061 
1062 	state->btps_page = npage;
1063 	state->btps_blkno = nblkno;
1064 	state->btps_lastoff = last_off;
1065 }
1066 
1067 /*
1068  * Finalize pending posting list tuple, and add it to the index.  Final tuple
1069  * is based on saved base tuple, and saved list of heap TIDs.
1070  *
1071  * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1072  * using _bt_buildadd().
1073  */
1074 static void
1075 _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1076 							  BTDedupState dstate)
1077 {
1078 	Assert(dstate->nitems > 0);
1079 
1080 	if (dstate->nitems == 1)
1081 		_bt_buildadd(wstate, state, dstate->base, 0);
1082 	else
1083 	{
1084 		IndexTuple	postingtuple;
1085 		Size		truncextra;
1086 
1087 		/* form a tuple with a posting list */
1088 		postingtuple = _bt_form_posting(dstate->base,
1089 										dstate->htids,
1090 										dstate->nhtids);
1091 		/* Calculate posting list overhead */
1092 		truncextra = IndexTupleSize(postingtuple) -
1093 			BTreeTupleGetPostingOffset(postingtuple);
1094 
1095 		_bt_buildadd(wstate, state, postingtuple, truncextra);
1096 		pfree(postingtuple);
1097 	}
1098 
1099 	dstate->nmaxitems = 0;
1100 	dstate->nhtids = 0;
1101 	dstate->nitems = 0;
1102 	dstate->phystupsize = 0;
1103 }
1104 
1105 /*
1106  * Finish writing out the completed btree.
1107  */
1108 static void
1109 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1110 {
1111 	BTPageState *s;
1112 	BlockNumber rootblkno = P_NONE;
1113 	uint32		rootlevel = 0;
1114 	Page		metapage;
1115 
1116 	/*
1117 	 * Each iteration of this loop completes one more level of the tree.
1118 	 */
1119 	for (s = state; s != NULL; s = s->btps_next)
1120 	{
1121 		BlockNumber blkno;
1122 		BTPageOpaque opaque;
1123 
1124 		blkno = s->btps_blkno;
1125 		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1126 
1127 		/*
1128 		 * We have to link the last page on this level somewhere.
1129 		 *
1130 		 * If we're at the top, it's the root, so attach it to the metapage.
1131 		 * Otherwise, add an entry for it to its parent using its low key.
1132 		 * This may cause the last page of the parent level to split, but
1133 		 * that's not a problem -- we haven't gotten to it yet.
1134 		 */
1135 		if (s->btps_next == NULL)
1136 		{
1137 			opaque->btpo_flags |= BTP_ROOT;
1138 			rootblkno = blkno;
1139 			rootlevel = s->btps_level;
1140 		}
1141 		else
1142 		{
1143 			Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1144 					IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1145 					BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1146 				   P_LEFTMOST(opaque));
1147 			Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1148 				   !P_LEFTMOST(opaque));
1149 			BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1150 			_bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1151 			pfree(s->btps_lowkey);
1152 			s->btps_lowkey = NULL;
1153 		}
1154 
1155 		/*
1156 		 * This is the rightmost page, so the ItemId array needs to be slid
1157 		 * back one slot.  Then we can dump out the page.
1158 		 */
1159 		_bt_slideleft(s->btps_page);
1160 		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1161 		s->btps_page = NULL;	/* writepage freed the workspace */
1162 	}
1163 
1164 	/*
1165 	 * As the last step in the process, construct the metapage and make it
1166 	 * point to the new root (unless we had no data at all, in which case it's
1167 	 * set to point to "P_NONE").  This changes the index to the "valid" state
1168 	 * by filling in a valid magic number in the metapage.
1169 	 */
1170 	metapage = (Page) palloc(BLCKSZ);
1171 	_bt_initmetapage(metapage, rootblkno, rootlevel,
1172 					 wstate->inskey->allequalimage);
1173 	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1174 }
1175 
1176 /*
1177  * Read tuples in correct sort order from tuplesort, and load them into
1178  * btree leaves.
1179  */
1180 static void
1181 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1182 {
1183 	BTPageState *state = NULL;
1184 	bool		merge = (btspool2 != NULL);
1185 	IndexTuple	itup,
1186 				itup2 = NULL;
1187 	bool		load1;
1188 	TupleDesc	tupdes = RelationGetDescr(wstate->index);
1189 	int			i,
1190 				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1191 	SortSupport sortKeys;
1192 	int64		tuples_done = 0;
1193 	bool		deduplicate;
1194 
1195 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1196 		BTGetDeduplicateItems(wstate->index);
1197 
1198 	if (merge)
1199 	{
1200 		/*
1201 		 * Another BTSpool for dead tuples exists. Now we have to merge
1202 		 * btspool and btspool2.
1203 		 */
1204 
1205 		/* the preparation of merge */
1206 		itup = tuplesort_getindextuple(btspool->sortstate, true);
1207 		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1208 
1209 		/* Prepare SortSupport data for each column */
1210 		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1211 
1212 		for (i = 0; i < keysz; i++)
1213 		{
1214 			SortSupport sortKey = sortKeys + i;
1215 			ScanKey		scanKey = wstate->inskey->scankeys + i;
1216 			int16		strategy;
1217 
1218 			sortKey->ssup_cxt = CurrentMemoryContext;
1219 			sortKey->ssup_collation = scanKey->sk_collation;
1220 			sortKey->ssup_nulls_first =
1221 				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1222 			sortKey->ssup_attno = scanKey->sk_attno;
1223 			/* Abbreviation is not supported here */
1224 			sortKey->abbreviate = false;
1225 
1226 			AssertState(sortKey->ssup_attno != 0);
1227 
1228 			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1229 				BTGreaterStrategyNumber : BTLessStrategyNumber;
1230 
1231 			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1232 		}
1233 
1234 		for (;;)
1235 		{
1236 			load1 = true;		/* load BTSpool next ? */
1237 			if (itup2 == NULL)
1238 			{
1239 				if (itup == NULL)
1240 					break;
1241 			}
1242 			else if (itup != NULL)
1243 			{
1244 				int32		compare = 0;
1245 
1246 				for (i = 1; i <= keysz; i++)
1247 				{
1248 					SortSupport entry;
1249 					Datum		attrDatum1,
1250 								attrDatum2;
1251 					bool		isNull1,
1252 								isNull2;
1253 
1254 					entry = sortKeys + i - 1;
1255 					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1256 					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1257 
1258 					compare = ApplySortComparator(attrDatum1, isNull1,
1259 												  attrDatum2, isNull2,
1260 												  entry);
1261 					if (compare > 0)
1262 					{
1263 						load1 = false;
1264 						break;
1265 					}
1266 					else if (compare < 0)
1267 						break;
1268 				}
1269 
1270 				/*
1271 				 * If key values are equal, we sort on ItemPointer.  This is
1272 				 * required for btree indexes, since heap TID is treated as an
1273 				 * implicit last key attribute in order to ensure that all
1274 				 * keys in the index are physically unique.
1275 				 */
1276 				if (compare == 0)
1277 				{
1278 					compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1279 					Assert(compare != 0);
1280 					if (compare > 0)
1281 						load1 = false;
1282 				}
1283 			}
1284 			else
1285 				load1 = false;
1286 
1287 			/* When we see first tuple, create first index page */
1288 			if (state == NULL)
1289 				state = _bt_pagestate(wstate, 0);
1290 
1291 			if (load1)
1292 			{
1293 				_bt_buildadd(wstate, state, itup, 0);
1294 				itup = tuplesort_getindextuple(btspool->sortstate, true);
1295 			}
1296 			else
1297 			{
1298 				_bt_buildadd(wstate, state, itup2, 0);
1299 				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1300 			}
1301 
1302 			/* Report progress */
1303 			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1304 										 ++tuples_done);
1305 		}
1306 		pfree(sortKeys);
1307 	}
1308 	else if (deduplicate)
1309 	{
1310 		/* merge is unnecessary, deduplicate into posting lists */
1311 		BTDedupState dstate;
1312 
1313 		dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1314 		dstate->deduplicate = true; /* unused */
1315 		dstate->nmaxitems = 0;	/* unused */
1316 		dstate->maxpostingsize = 0; /* set later */
1317 		/* Metadata about base tuple of current pending posting list */
1318 		dstate->base = NULL;
1319 		dstate->baseoff = InvalidOffsetNumber;	/* unused */
1320 		dstate->basetupsize = 0;
1321 		/* Metadata about current pending posting list TIDs */
1322 		dstate->htids = NULL;
1323 		dstate->nhtids = 0;
1324 		dstate->nitems = 0;
1325 		dstate->phystupsize = 0;	/* unused */
1326 		dstate->nintervals = 0; /* unused */
1327 
1328 		while ((itup = tuplesort_getindextuple(btspool->sortstate,
1329 											   true)) != NULL)
1330 		{
1331 			/* When we see first tuple, create first index page */
1332 			if (state == NULL)
1333 			{
1334 				state = _bt_pagestate(wstate, 0);
1335 
1336 				/*
1337 				 * Limit size of posting list tuples to 1/10 space we want to
1338 				 * leave behind on the page, plus space for final item's line
1339 				 * pointer.  This is equal to the space that we'd like to
1340 				 * leave behind on each leaf page when fillfactor is 90,
1341 				 * allowing us to get close to fillfactor% space utilization
1342 				 * when there happen to be a great many duplicates.  (This
1343 				 * makes higher leaf fillfactor settings ineffective when
1344 				 * building indexes that have many duplicates, but packing
1345 				 * leaf pages full with few very large tuples doesn't seem
1346 				 * like a useful goal.)
1347 				 */
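				/*
				 * (With the usual 8 KB BLCKSZ and 8-byte MAXALIGN this comes
				 * to MAXALIGN_DOWN(819) - sizeof(ItemIdData) = 816 - 4 = 812
				 * bytes.)
				 */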
1348 				dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1349 					sizeof(ItemIdData);
1350 				Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1351 					   dstate->maxpostingsize <= INDEX_SIZE_MASK);
1352 				dstate->htids = palloc(dstate->maxpostingsize);
1353 
1354 				/* start new pending posting list with itup copy */
1355 				_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1356 										InvalidOffsetNumber);
1357 			}
1358 			else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1359 										 itup) > keysz &&
1360 					 _bt_dedup_save_htid(dstate, itup))
1361 			{
1362 				/*
1363 				 * Tuple is equal to base tuple of pending posting list.  Heap
1364 				 * TID from itup has been saved in state.
1365 				 */
1366 			}
1367 			else
1368 			{
1369 				/*
1370 				 * Tuple is not equal to pending posting list tuple, or
1371 				 * _bt_dedup_save_htid() opted to not merge current item into
1372 				 * pending posting list.
1373 				 */
1374 				_bt_sort_dedup_finish_pending(wstate, state, dstate);
1375 				pfree(dstate->base);
1376 
1377 				/* start new pending posting list with itup copy */
1378 				_bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1379 										InvalidOffsetNumber);
1380 			}
1381 
1382 			/* Report progress */
1383 			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1384 										 ++tuples_done);
1385 		}
1386 
1387 		if (state)
1388 		{
1389 			/*
1390 			 * Handle the last item (there must be a last item when the
1391 			 * tuplesort returned one or more tuples)
1392 			 */
1393 			_bt_sort_dedup_finish_pending(wstate, state, dstate);
1394 			pfree(dstate->base);
1395 			pfree(dstate->htids);
1396 		}
1397 
1398 		pfree(dstate);
1399 	}
1400 	else
1401 	{
1402 		/* merging and deduplication are both unnecessary */
1403 		while ((itup = tuplesort_getindextuple(btspool->sortstate,
1404 											   true)) != NULL)
1405 		{
1406 			/* When we see first tuple, create first index page */
1407 			if (state == NULL)
1408 				state = _bt_pagestate(wstate, 0);
1409 
1410 			_bt_buildadd(wstate, state, itup, 0);
1411 
1412 			/* Report progress */
1413 			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1414 										 ++tuples_done);
1415 		}
1416 	}
1417 
1418 	/* Close down final pages and write the metapage */
1419 	_bt_uppershutdown(wstate, state);
1420 
1421 	/*
1422 	 * Although we WAL-log index pages, we must nonetheless fsync index files.
1423 	 * Since we're building outside shared buffers, a CHECKPOINT occurring
1424 	 * during the build has no way to flush the previously written data to
1425 	 * disk (indeed it won't know the index even exists).  A crash later on
1426 	 * would replay WAL from the checkpoint, therefore it wouldn't replay our
1427 	 * earlier WAL entries. If we do not fsync those pages here, they might
1428 	 * still not be on disk when the crash occurs.
1429 	 */
1430 	if (wstate->btws_use_wal)
1431 	{
1432 		RelationOpenSmgr(wstate->index);
1433 		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1434 	}
1435 }
1436 
1437 /*
1438  * Create parallel context, and launch workers for leader.
1439  *
1440  * buildstate argument should be initialized (with the exception of the
1441  * tuplesort state in spools, which may later be created based on shared
1442  * state initially set up here).
1443  *
1444  * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1445  *
1446  * request is the target number of parallel worker processes to launch.
1447  *
1448  * Sets buildstate's BTLeader, which caller must use to shut down parallel
1449  * mode by passing it to _bt_end_parallel() at the very end of its index
1450  * build.  If not even a single worker process can be launched, this is
1451  * never set, and caller should proceed with a serial index build.
1452  */
1453 static void
1454 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1455 {
1456 	ParallelContext *pcxt;
1457 	int			scantuplesortstates;
1458 	Snapshot	snapshot;
1459 	Size		estbtshared;
1460 	Size		estsort;
1461 	BTShared   *btshared;
1462 	Sharedsort *sharedsort;
1463 	Sharedsort *sharedsort2;
1464 	BTSpool    *btspool = buildstate->spool;
1465 	BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1466 	WalUsage   *walusage;
1467 	BufferUsage *bufferusage;
1468 	bool		leaderparticipates = true;
1469 	int			querylen;
1470 
1471 #ifdef DISABLE_LEADER_PARTICIPATION
1472 	leaderparticipates = false;
1473 #endif
1474 
1475 	/*
1476 	 * Enter parallel mode, and create context for parallel build of btree
1477 	 * index
1478 	 */
1479 	EnterParallelMode();
1480 	Assert(request > 0);
1481 	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1482 								 request);
1483 
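	/*
	 * One participant tuplesort per requested worker, plus one more for the
	 * leader if it will participate as a worker itself.
	 */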
1484 	scantuplesortstates = leaderparticipates ? request + 1 : request;
1485 
1486 	/*
1487 	 * Prepare for scan of the base relation.  In a normal index build, we use
1488 	 * SnapshotAny because we must retrieve all tuples and do our own time
1489 	 * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
1490 	 * concurrent build, we take a regular MVCC snapshot and index whatever's
1491 	 * live according to that.
1492 	 */
1493 	if (!isconcurrent)
1494 		snapshot = SnapshotAny;
1495 	else
1496 		snapshot = RegisterSnapshot(GetTransactionSnapshot());
1497 
1498 	/*
1499 	 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1500 	 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1501 	 */
1502 	estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1503 	shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1504 	estsort = tuplesort_estimate_shared(scantuplesortstates);
1505 	shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1506 
1507 	/*
1508 	 * Unique case requires a second spool, and so we may have to account for
1509 	 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1510 	 */
1511 	if (!btspool->isunique)
1512 		shm_toc_estimate_keys(&pcxt->estimator, 2);
1513 	else
1514 	{
1515 		shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1516 		shm_toc_estimate_keys(&pcxt->estimator, 3);
1517 	}
1518 
1519 	/*
1520 	 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1521 	 * and PARALLEL_KEY_BUFFER_USAGE.
1522 	 *
1523 	 * If there are no extensions loaded that care, we could skip this.  We
1524 	 * have no way of knowing whether anyone's looking at pgWalUsage or
1525 	 * pgBufferUsage, so do it unconditionally.
1526 	 */
1527 	shm_toc_estimate_chunk(&pcxt->estimator,
1528 						   mul_size(sizeof(WalUsage), pcxt->nworkers));
1529 	shm_toc_estimate_keys(&pcxt->estimator, 1);
1530 	shm_toc_estimate_chunk(&pcxt->estimator,
1531 						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
1532 	shm_toc_estimate_keys(&pcxt->estimator, 1);
1533 
1534 	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1535 	if (debug_query_string)
1536 	{
1537 		querylen = strlen(debug_query_string);
1538 		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1539 		shm_toc_estimate_keys(&pcxt->estimator, 1);
1540 	}
1541 	else
1542 		querylen = 0;			/* keep compiler quiet */
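
	/*
	 * Every TOC entry used here follows the same lifecycle:
	 * shm_toc_estimate_chunk() plus shm_toc_estimate_keys() before
	 * InitializeParallelDSM(), then a matching shm_toc_allocate() and
	 * shm_toc_insert() once the DSM exists, and finally shm_toc_lookup() in
	 * each worker.  A minimal sketch of that lifecycle for a hypothetical
	 * per-worker array (PARALLEL_KEY_EXAMPLE and ExampleEntry are
	 * illustrative only; the real keys are the PARALLEL_KEY_* constants
	 * defined at the top of this file):
	 *
	 *		In the leader, before InitializeParallelDSM():
	 *			shm_toc_estimate_chunk(&pcxt->estimator,
	 *								   mul_size(sizeof(ExampleEntry),
	 *											pcxt->nworkers));
	 *			shm_toc_estimate_keys(&pcxt->estimator, 1);
	 *
	 *		In the leader, after InitializeParallelDSM():
	 *			entries = shm_toc_allocate(pcxt->toc,
	 *									   mul_size(sizeof(ExampleEntry),
	 *												pcxt->nworkers));
	 *			shm_toc_insert(pcxt->toc, PARALLEL_KEY_EXAMPLE, entries);
	 *
	 *		In each worker, within its entry point:
	 *			entries = shm_toc_lookup(toc, PARALLEL_KEY_EXAMPLE, false);
	 *			myentry = &entries[ParallelWorkerNumber];
	 */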
1543 
1544 	/* Everyone's had a chance to ask for space, so now create the DSM */
1545 	InitializeParallelDSM(pcxt);
1546 
1547 	/* If no DSM segment was available, back out (do serial build) */
1548 	if (pcxt->seg == NULL)
1549 	{
1550 		if (IsMVCCSnapshot(snapshot))
1551 			UnregisterSnapshot(snapshot);
1552 		DestroyParallelContext(pcxt);
1553 		ExitParallelMode();
1554 		return;
1555 	}
1556 
1557 	/* Store shared build state, for which we reserved space */
1558 	btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1559 	/* Initialize immutable state */
1560 	btshared->heaprelid = RelationGetRelid(btspool->heap);
1561 	btshared->indexrelid = RelationGetRelid(btspool->index);
1562 	btshared->isunique = btspool->isunique;
1563 	btshared->isconcurrent = isconcurrent;
1564 	btshared->scantuplesortstates = scantuplesortstates;
1565 	ConditionVariableInit(&btshared->workersdonecv);
1566 	SpinLockInit(&btshared->mutex);
1567 	/* Initialize mutable state */
1568 	btshared->nparticipantsdone = 0;
1569 	btshared->reltuples = 0.0;
1570 	btshared->havedead = false;
1571 	btshared->indtuples = 0.0;
1572 	btshared->brokenhotchain = false;
1573 	table_parallelscan_initialize(btspool->heap,
1574 								  ParallelTableScanFromBTShared(btshared),
1575 								  snapshot);
1576 
1577 	/*
1578 	 * Store shared tuplesort-private state, for which we reserved space.
1579 	 * Then, initialize opaque state using tuplesort routine.
1580 	 */
1581 	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1582 	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1583 								pcxt->seg);
1584 
1585 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1586 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1587 
1588 	/* Unique case requires a second spool, and associated shared state */
1589 	if (!btspool->isunique)
1590 		sharedsort2 = NULL;
1591 	else
1592 	{
1593 		/*
1594 		 * Store additional shared tuplesort-private state, for which we
1595 		 * reserved space.  Then, initialize opaque state using tuplesort
1596 		 * routine.
1597 		 */
1598 		sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1599 		tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1600 									pcxt->seg);
1601 
1602 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1603 	}
1604 
1605 	/* Store query string for workers */
1606 	if (debug_query_string)
1607 	{
1608 		char	   *sharedquery;
1609 
1610 		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1611 		memcpy(sharedquery, debug_query_string, querylen + 1);
1612 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1613 	}
1614 
1615 	/*
1616 	 * Allocate space for each worker's WalUsage and BufferUsage; no need to
1617 	 * initialize.
1618 	 */
1619 	walusage = shm_toc_allocate(pcxt->toc,
1620 								mul_size(sizeof(WalUsage), pcxt->nworkers));
1621 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
1622 	bufferusage = shm_toc_allocate(pcxt->toc,
1623 								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
1624 	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1625 
1626 	/* Launch workers, saving status for leader/caller */
1627 	LaunchParallelWorkers(pcxt);
1628 	btleader->pcxt = pcxt;
1629 	btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1630 	if (leaderparticipates)
1631 		btleader->nparticipanttuplesorts++;
1632 	btleader->btshared = btshared;
1633 	btleader->sharedsort = sharedsort;
1634 	btleader->sharedsort2 = sharedsort2;
1635 	btleader->snapshot = snapshot;
1636 	btleader->walusage = walusage;
1637 	btleader->bufferusage = bufferusage;
1638 
1639 	/* If no workers were successfully launched, back out (do serial build) */
1640 	if (pcxt->nworkers_launched == 0)
1641 	{
1642 		_bt_end_parallel(btleader);
1643 		return;
1644 	}
1645 
1646 	/* Save leader state now that it's clear build will be parallel */
1647 	buildstate->btleader = btleader;
1648 
1649 	/* Join heap scan ourselves */
1650 	if (leaderparticipates)
1651 		_bt_leader_participate_as_worker(buildstate);
1652 
1653 	/*
1654 	 * Caller needs to wait for all launched workers when we return.  Make
1655 	 * sure that the failure-to-start case will not hang forever.
1656 	 */
1657 	WaitForParallelWorkersToAttach(pcxt);
1658 }
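
/*
 * The contract of _bt_begin_parallel() above implies a caller pattern along
 * these lines.  This is only a sketch, not the code actually used in this
 * file's build path; "buildstate", "isconcurrent", and "request" stand in
 * for whatever the caller already has:
 *
 *		_bt_begin_parallel(&buildstate, isconcurrent, request);
 *		if (buildstate.btleader == NULL)
 *		{
 *			... proceed with an entirely serial index build ...
 *		}
 *		else
 *		{
 *			... wait for the workers via _bt_parallel_heapscan(), perform
 *			the leader's merge, then call
 *			_bt_end_parallel(buildstate.btleader) at the very end ...
 *		}
 */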
1659 
1660 /*
1661  * Shut down workers, destroy parallel context, and end parallel mode.
1662  */
1663 static void
1664 _bt_end_parallel(BTLeader *btleader)
1665 {
1666 	int			i;
1667 
1668 	/* Shutdown worker processes */
1669 	WaitForParallelWorkersToFinish(btleader->pcxt);
1670 
1671 	/*
1672 	 * Next, accumulate WAL usage.  (This must wait for the workers to finish,
1673 	 * or we might get incomplete data.)
1674 	 */
1675 	for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1676 		InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1677 
1678 	/* Free last reference to MVCC snapshot, if one was used */
1679 	if (IsMVCCSnapshot(btleader->snapshot))
1680 		UnregisterSnapshot(btleader->snapshot);
1681 	DestroyParallelContext(btleader->pcxt);
1682 	ExitParallelMode();
1683 }
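
/*
 * The accumulation loop in _bt_end_parallel() only sees complete numbers
 * because each worker brackets its work with the matching instrumentation
 * calls (as _bt_parallel_build_main() below does).  A minimal sketch of the
 * worker-side pairing, with "do the work" standing in for the scan and sort:
 *
 *		InstrStartParallelQuery();
 *		... do the work ...
 *		InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
 *							  &walusage[ParallelWorkerNumber]);
 *
 * The leader must not read those per-worker slots before
 * WaitForParallelWorkersToFinish() has returned, which is why the loop runs
 * only after that call.
 */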
1684 
1685 /*
1686  * Returns size of shared memory required to store state for a parallel
1687  * btree index build based on the snapshot its parallel scan will use.
1688  */
1689 static Size
1690 _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1691 {
1692 	/* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1693 	return add_size(BUFFERALIGN(sizeof(BTShared)),
1694 					table_parallelscan_estimate(heap, snapshot));
1695 }
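
/*
 * The BUFFERALIGN() above mirrors the layout assumed by
 * ParallelTableScanFromBTShared(): the parallel scan descriptor is expected
 * to start at a buffer-aligned offset immediately after the BTShared struct
 * within the same TOC chunk.  Roughly (a sketch, assuming the macro computes
 * the address this way):
 *
 *		pscan = (ParallelTableScanDesc)
 *			((char *) btshared + BUFFERALIGN(sizeof(BTShared)));
 *
 * so the estimate must cover both pieces plus the padding between them.
 */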
1696 
1697 /*
1698  * Within leader, wait for end of heap scan.
1699  *
1700  * When called, the parallel heap scan started by _bt_begin_parallel() will
1701  * already be underway within worker processes (when the leader participates
1702  * as a worker, we should end up here just as the workers are finishing).
1703  *
1704  * Fills in fields needed for ambuild statistics, and lets caller set
1705  * field indicating that some worker encountered a broken HOT chain.
1706  *
1707  * Returns the total number of heap tuples scanned.
1708  */
1709 static double
1710 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1711 {
1712 	BTShared   *btshared = buildstate->btleader->btshared;
1713 	int			nparticipanttuplesorts;
1714 	double		reltuples;
1715 
1716 	nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1717 	for (;;)
1718 	{
1719 		SpinLockAcquire(&btshared->mutex);
1720 		if (btshared->nparticipantsdone == nparticipanttuplesorts)
1721 		{
1722 			buildstate->havedead = btshared->havedead;
1723 			buildstate->indtuples = btshared->indtuples;
1724 			*brokenhotchain = btshared->brokenhotchain;
1725 			reltuples = btshared->reltuples;
1726 			SpinLockRelease(&btshared->mutex);
1727 			break;
1728 		}
1729 		SpinLockRelease(&btshared->mutex);
1730 
1731 		ConditionVariableSleep(&btshared->workersdonecv,
1732 							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1733 	}
1734 
1735 	ConditionVariableCancelSleep();
1736 
1737 	return reltuples;
1738 }
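
/*
 * The loop above is the usual condition-variable idiom: test the predicate
 * under the spinlock, and sleep only while it does not yet hold.  A minimal
 * sketch of the same shape for a hypothetical shared counter ("shared",
 * "ndone", and "ntotal" are illustrative names):
 *
 *		for (;;)
 *		{
 *			SpinLockAcquire(&shared->mutex);
 *			done = (shared->ndone == ntotal);
 *			SpinLockRelease(&shared->mutex);
 *			if (done)
 *				break;
 *			ConditionVariableSleep(&shared->cv,
 *								   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
 *		}
 *		ConditionVariableCancelSleep();
 *
 * The updating side increments the counter under the same spinlock and then
 * signals the condition variable, as _bt_parallel_scan_and_sort() does below.
 */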
1739 
1740 /*
1741  * Within leader, participate as a parallel worker.
1742  */
1743 static void
1744 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1745 {
1746 	BTLeader   *btleader = buildstate->btleader;
1747 	BTSpool    *leaderworker;
1748 	BTSpool    *leaderworker2;
1749 	int			sortmem;
1750 
1751 	/* Allocate memory and initialize private spool */
1752 	leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1753 	leaderworker->heap = buildstate->spool->heap;
1754 	leaderworker->index = buildstate->spool->index;
1755 	leaderworker->isunique = buildstate->spool->isunique;
1756 
1757 	/* Initialize second spool, if required */
1758 	if (!btleader->btshared->isunique)
1759 		leaderworker2 = NULL;
1760 	else
1761 	{
1762 		/* Allocate memory for worker's own private secondary spool */
1763 		leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1764 
1765 		/* Initialize worker's own secondary spool */
1766 		leaderworker2->heap = leaderworker->heap;
1767 		leaderworker2->index = leaderworker->index;
1768 		leaderworker2->isunique = false;
1769 	}
1770 
1771 	/*
1772 	 * Might as well use a reliable figure when doling out maintenance_work_mem
1773 	 * (when the requested number of workers was not launched, this will be
1774 	 * somewhat higher than it is for the other workers).
1775 	 */
1776 	sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1777 
1778 	/* Perform work common to all participants */
1779 	_bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1780 							   btleader->sharedsort, btleader->sharedsort2,
1781 							   sortmem, true);
1782 
1783 #ifdef BTREE_BUILD_STATS
1784 	if (log_btree_build_stats)
1785 	{
1786 		ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1787 		ResetUsage();
1788 	}
1789 #endif							/* BTREE_BUILD_STATS */
1790 }
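
/*
 * A worked example of the maintenance_work_mem split mentioned above,
 * assuming maintenance_work_mem = 65536 (the GUC is in KB, so 64MB) and
 * request = 3 with a participating leader: scantuplesortstates is 4, so each
 * worker uses 65536 / 4 = 16384 KB.  If only two of the three workers are
 * actually launched, the leader's divisor is nparticipanttuplesorts = 3 (two
 * workers plus itself), so the leader uses 65536 / 3 = 21845 KB -- somewhat
 * more than each worker, exactly as the comment above says.
 */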
1791 
1792 /*
1793  * Perform work within a launched parallel process.
1794  */
1795 void
1796 _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1797 {
1798 	char	   *sharedquery;
1799 	BTSpool    *btspool;
1800 	BTSpool    *btspool2;
1801 	BTShared   *btshared;
1802 	Sharedsort *sharedsort;
1803 	Sharedsort *sharedsort2;
1804 	Relation	heapRel;
1805 	Relation	indexRel;
1806 	LOCKMODE	heapLockmode;
1807 	LOCKMODE	indexLockmode;
1808 	WalUsage   *walusage;
1809 	BufferUsage *bufferusage;
1810 	int			sortmem;
1811 
1812 #ifdef BTREE_BUILD_STATS
1813 	if (log_btree_build_stats)
1814 		ResetUsage();
1815 #endif							/* BTREE_BUILD_STATS */
1816 
1817 	/* Set debug_query_string for individual workers first */
1818 	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1819 	debug_query_string = sharedquery;
1820 
1821 	/* Report the query string from leader */
1822 	pgstat_report_activity(STATE_RUNNING, debug_query_string);
1823 
1824 	/* Look up nbtree shared state */
1825 	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1826 
1827 	/* Open relations using lock modes known to be obtained by index.c */
1828 	if (!btshared->isconcurrent)
1829 	{
1830 		heapLockmode = ShareLock;
1831 		indexLockmode = AccessExclusiveLock;
1832 	}
1833 	else
1834 	{
1835 		heapLockmode = ShareUpdateExclusiveLock;
1836 		indexLockmode = RowExclusiveLock;
1837 	}
1838 
1839 	/* Open relations within worker */
1840 	heapRel = table_open(btshared->heaprelid, heapLockmode);
1841 	indexRel = index_open(btshared->indexrelid, indexLockmode);
1842 
1843 	/* Initialize worker's own spool */
1844 	btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1845 	btspool->heap = heapRel;
1846 	btspool->index = indexRel;
1847 	btspool->isunique = btshared->isunique;
1848 
1849 	/* Look up shared state private to tuplesort.c */
1850 	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1851 	tuplesort_attach_shared(sharedsort, seg);
1852 	if (!btshared->isunique)
1853 	{
1854 		btspool2 = NULL;
1855 		sharedsort2 = NULL;
1856 	}
1857 	else
1858 	{
1859 		/* Allocate memory for worker's own private secondary spool */
1860 		btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1861 
1862 		/* Initialize worker's own secondary spool */
1863 		btspool2->heap = btspool->heap;
1864 		btspool2->index = btspool->index;
1865 		btspool2->isunique = false;
1866 		/* Look up shared state private to tuplesort.c */
1867 		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1868 		tuplesort_attach_shared(sharedsort2, seg);
1869 	}
1870 
1871 	/* Prepare to track buffer usage during parallel execution */
1872 	InstrStartParallelQuery();
1873 
1874 	/* Perform sorting of spool, and possibly a spool2 */
1875 	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1876 	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1877 							   sharedsort2, sortmem, false);
1878 
1879 	/* Report WAL/buffer usage during parallel execution */
1880 	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1881 	walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
1882 	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1883 						  &walusage[ParallelWorkerNumber]);
1884 
1885 #ifdef BTREE_BUILD_STATS
1886 	if (log_btree_build_stats)
1887 	{
1888 		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1889 		ResetUsage();
1890 	}
1891 #endif							/* BTREE_BUILD_STATS */
1892 
1893 	index_close(indexRel, indexLockmode);
1894 	table_close(heapRel, heapLockmode);
1895 }
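
/*
 * This entry point runs in each worker because the leader named it when
 * creating the parallel context -- CreateParallelContext("postgres",
 * "_bt_parallel_build_main", request) in _bt_begin_parallel() -- and the
 * parallel machinery resolves the function from that (library, name) pair in
 * the worker.  A hypothetical extension providing its own parallel operation
 * would pass its shared library's name instead of "postgres", for example:
 *
 *		pcxt = CreateParallelContext("my_extension", "my_parallel_main",
 *									 nworkers);
 */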
1896 
1897 /*
1898  * Perform a worker's portion of a parallel sort.
1899  *
1900  * This generates a tuplesort for the passed btspool, and a second tuplesort
1901  * state if a second btspool is needed (i.e. for unique index builds).  All
1902  * other spool fields should already be set when this is called.
1903  *
1904  * sortmem is the amount of working memory to use within each worker,
1905  * expressed in KBs.
1906  *
1907  * When this returns, workers are done, and need only release resources.
1908  */
1909 static void
1910 _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1911 						   BTShared *btshared, Sharedsort *sharedsort,
1912 						   Sharedsort *sharedsort2, int sortmem, bool progress)
1913 {
1914 	SortCoordinate coordinate;
1915 	BTBuildState buildstate;
1916 	TableScanDesc scan;
1917 	double		reltuples;
1918 	IndexInfo  *indexInfo;
1919 
1920 	/* Initialize local tuplesort coordination state */
1921 	coordinate = palloc0(sizeof(SortCoordinateData));
1922 	coordinate->isWorker = true;
1923 	coordinate->nParticipants = -1;
1924 	coordinate->sharedsort = sharedsort;
1925 
1926 	/* Begin "partial" tuplesort */
1927 	btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1928 													 btspool->index,
1929 													 btspool->isunique,
1930 													 sortmem, coordinate,
1931 													 false);
1932 
1933 	/*
1934 	 * Just as with the serial case, there may be a second spool.  If so, a
1935 	 * second, dedicated spool2 partial tuplesort is required.
1936 	 */
1937 	if (btspool2)
1938 	{
1939 		SortCoordinate coordinate2;
1940 
1941 		/*
1942 		 * We expect that the second one (for dead tuples) won't get very
1943 	 * full, so we give it only work_mem (unless sortmem is less for the
1944 	 * worker).  Worker processes are generally permitted to allocate
1945 		 * work_mem independently.
1946 		 */
1947 		coordinate2 = palloc0(sizeof(SortCoordinateData));
1948 		coordinate2->isWorker = true;
1949 		coordinate2->nParticipants = -1;
1950 		coordinate2->sharedsort = sharedsort2;
1951 		btspool2->sortstate =
1952 			tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1953 										Min(sortmem, work_mem), coordinate2,
1954 										false);
1955 	}
1956 
1957 	/* Fill in buildstate for _bt_build_callback() */
1958 	buildstate.isunique = btshared->isunique;
1959 	buildstate.havedead = false;
1960 	buildstate.heap = btspool->heap;
1961 	buildstate.spool = btspool;
1962 	buildstate.spool2 = btspool2;
1963 	buildstate.indtuples = 0;
1964 	buildstate.btleader = NULL;
1965 
1966 	/* Join parallel scan */
1967 	indexInfo = BuildIndexInfo(btspool->index);
1968 	indexInfo->ii_Concurrent = btshared->isconcurrent;
1969 	scan = table_beginscan_parallel(btspool->heap,
1970 									ParallelTableScanFromBTShared(btshared));
1971 	reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1972 									   true, progress, _bt_build_callback,
1973 									   (void *) &buildstate, scan);
1974 
1975 	/* Execute this worker's part of the sort */
1976 	if (progress)
1977 		pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1978 									 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
1979 	tuplesort_performsort(btspool->sortstate);
1980 	if (btspool2)
1981 	{
1982 		if (progress)
1983 			pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1984 										 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
1985 		tuplesort_performsort(btspool2->sortstate);
1986 	}
1987 
1988 	/*
1989 	 * Done.  Record ambuild statistics, and whether we encountered a broken
1990 	 * HOT chain.
1991 	 */
1992 	SpinLockAcquire(&btshared->mutex);
1993 	btshared->nparticipantsdone++;
1994 	btshared->reltuples += reltuples;
1995 	if (buildstate.havedead)
1996 		btshared->havedead = true;
1997 	btshared->indtuples += buildstate.indtuples;
1998 	if (indexInfo->ii_BrokenHotChain)
1999 		btshared->brokenhotchain = true;
2000 	SpinLockRelease(&btshared->mutex);
2001 
2002 	/* Notify leader */
2003 	ConditionVariableSignal(&btshared->workersdonecv);
2004 
2005 	/* We can end tuplesorts immediately */
2006 	tuplesort_end(btspool->sortstate);
2007 	if (btspool2)
2008 		tuplesort_end(btspool2->sortstate);
2009 }
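
/*
 * For contrast with the worker-side coordination state set up above
 * (isWorker = true, nParticipants = -1), the leader's own tuplesort for the
 * final merge is created with the opposite settings.  A hedged sketch only
 * -- the leader-side setup lives elsewhere in this file, and "btleader" is
 * assumed to be the BTLeader saved in the build state:
 *
 *		coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
 *		coordinate->isWorker = false;
 *		coordinate->nParticipants = btleader->nparticipanttuplesorts;
 *		coordinate->sharedsort = btleader->sharedsort;
 *		sortstate = tuplesort_begin_index_btree(heap, index, isunique,
 *												maintenance_work_mem,
 *												coordinate, false);
 */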
2010