1 /*-------------------------------------------------------------------------
2 *
3 * nbtsort.c
4 * Build a btree from sorted input by loading leaf pages sequentially.
5 *
6 * NOTES
7 *
8 * We use tuplesort.c to sort the given index tuples into order.
9 * Then we scan the index tuples in order and build the btree pages
10 * for each level. We load source tuples into leaf-level pages.
11 * Whenever we fill a page at one level, we add a link to it to its
12 * parent level (starting a new parent level if necessary). When
13 * done, we write out each final page on each level, adding it to
14 * its parent level. When we have only one page on a level, it must be
15 * the root -- it can be attached to the btree metapage and we are done.
16 *
17 * It is not wise to pack the pages entirely full, since then *any*
18 * insertion would cause a split (and not only of the leaf page; the need
19 * for a split would cascade right up the tree). The steady-state load
20 * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 * pages to the user-controllable fill factor (default 90%) while upper pages
22 * are always packed to 70%. This gives us reasonable density (there aren't
23 * many upper pages if the keys are reasonable-size) without risking a lot of
24 * cascading splits during early insertions.
25 *
26 * Formerly the index pages being built were kept in shared buffers, but
27 * that is of no value (since other backends have no interest in them yet)
28 * and it created locking problems for CHECKPOINT, because the upper-level
29 * pages were held exclusive-locked for long periods. Now we just build
30 * the pages in local memory and smgrwrite or smgrextend them as we finish
31 * them. They will need to be re-read into shared buffers on first use after
32 * the build finishes.
33 *
34 * Since the index will never be used unless it is completely built,
35 * from a crash-recovery point of view there is no need to WAL-log the
36 * steps of the build. After completing the index build, we can just sync
37 * the whole file to disk using smgrimmedsync() before exiting this module.
38 * This can be seen to be sufficient for crash recovery by considering that
39 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
40 * just after the index build. However, it is clearly not sufficient if the
41 * DBA is using the WAL log for PITR or replication purposes, since another
42 * machine would not be able to reconstruct the index from WAL. Therefore,
43 * we log the completed index pages to WAL if and only if WAL archiving is
44 * active.
45 *
46 * This code isn't concerned about the FSM at all. The caller is responsible
47 * for initializing that.
48 *
49 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
50 * Portions Copyright (c) 1994, Regents of the University of California
51 *
52 * IDENTIFICATION
53 * src/backend/access/nbtree/nbtsort.c
54 *
55 *-------------------------------------------------------------------------
56 */
57
58 #include "postgres.h"
59
60 #include "access/nbtree.h"
61 #include "access/parallel.h"
62 #include "access/relscan.h"
63 #include "access/xact.h"
64 #include "access/xlog.h"
65 #include "access/xloginsert.h"
66 #include "catalog/index.h"
67 #include "executor/instrument.h"
68 #include "miscadmin.h"
69 #include "pgstat.h"
70 #include "storage/smgr.h"
71 #include "tcop/tcopprot.h" /* pgrminclude ignore */
72 #include "utils/rel.h"
73 #include "utils/sortsupport.h"
74 #include "utils/tuplesort.h"
75
76
77 /* Magic numbers for parallel state sharing */
78 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
79 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
80 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
81 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
82 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000005)
83
84 /*
85 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
86 * parallel index builds. This may be useful as a debugging aid.
87 #undef DISABLE_LEADER_PARTICIPATION
88 */
89
90 /*
91 * Status record for spooling/sorting phase. (Note we may have two of
92 * these due to the special requirements for uniqueness-checking with
93 * dead tuples.)
94 */
95 typedef struct BTSpool
96 {
97 Tuplesortstate *sortstate; /* state data for tuplesort.c */
98 Relation heap;
99 Relation index;
100 bool isunique;
101 } BTSpool;
102
103 /*
104 * Status for index builds performed in parallel. This is allocated in a
105 * dynamic shared memory segment. Note that there is a separate tuplesort TOC
106 * entry, private to tuplesort.c but allocated by this module on its behalf.
107 */
108 typedef struct BTShared
109 {
110 /*
111 * These fields are not modified during the sort. They primarily exist
112 * for the benefit of worker processes that need to create BTSpool state
113 * corresponding to that used by the leader.
114 */
115 Oid heaprelid;
116 Oid indexrelid;
117 bool isunique;
118 bool isconcurrent;
119 int scantuplesortstates;
120
121 /*
122 * workersdonecv is used to monitor the progress of workers. All parallel
123 * participants must indicate that they are done before leader can use
124 * mutable state that workers maintain during scan (and before leader can
125 * proceed to tuplesort_performsort()).
126 */
127 ConditionVariable workersdonecv;
128
129 /*
130 * mutex protects all fields before heapdesc.
131 *
132 * These fields contain status information of interest to B-Tree index
133 * builds that must work just the same when an index is built in parallel.
134 */
135 slock_t mutex;
136
137 /*
138 * Mutable state that is maintained by workers, and reported back to
139 * leader at end of parallel scan.
140 *
141 * nparticipantsdone is number of worker processes finished.
142 *
143 * reltuples is the total number of input heap tuples.
144 *
145 * havedead indicates if RECENTLY_DEAD tuples were encountered during
146 * build.
147 *
148 * indtuples is the total number of tuples that made it into the index.
149 *
150 * brokenhotchain indicates if any worker detected a broken HOT chain
151 * during build.
152 */
153 int nparticipantsdone;
154 double reltuples;
155 bool havedead;
156 double indtuples;
157 bool brokenhotchain;
158
159 /*
160 * This variable-sized field must come last.
161 *
162 * See _bt_parallel_estimate_shared().
163 */
164 ParallelHeapScanDescData heapdesc;
165 } BTShared;
166
167 /*
168 * Status for leader in parallel index build.
169 */
170 typedef struct BTLeader
171 {
172 /* parallel context itself */
173 ParallelContext *pcxt;
174
175 /*
176 * nparticipanttuplesorts is the exact number of worker processes
177 * successfully launched, plus one leader process if it participates as a
178 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
179 * participating as a worker).
180 */
181 int nparticipanttuplesorts;
182
183 /*
184 * Leader process convenience pointers to shared state (leader avoids TOC
185 * lookups).
186 *
187 * btshared is the shared state for entire build. sharedsort is the
188 * shared, tuplesort-managed state passed to each process tuplesort.
189 * sharedsort2 is the corresponding btspool2 shared state, used only when
190 * building unique indexes. snapshot is the snapshot used by the scan iff
191 * an MVCC snapshot is required.
192 */
193 BTShared *btshared;
194 Sharedsort *sharedsort;
195 Sharedsort *sharedsort2;
196 Snapshot snapshot;
197 BufferUsage *bufferusage;
198 } BTLeader;
199
200 /*
201 * Working state for btbuild and its callback.
202 *
203 * When parallel CREATE INDEX is used, there is a BTBuildState for each
204 * participant.
205 */
206 typedef struct BTBuildState
207 {
208 bool isunique;
209 bool havedead;
210 Relation heap;
211 BTSpool *spool;
212
213 /*
214 * spool2 is needed only when the index is a unique index. Dead tuples are
215 * put into spool2 instead of spool in order to avoid the uniqueness check.
216 */
217 BTSpool *spool2;
218 double indtuples;
219
220 /*
221 * btleader is only present when a parallel index build is performed, and
222 * only in the leader process. (Actually, only the leader has a
223 * BTBuildState. Workers have their own spool and spool2, though.)
224 */
225 BTLeader *btleader;
226 } BTBuildState;
227
228 /*
229 * Status record for a btree page being built. We have one of these
230 * for each active tree level.
231 *
232 * The reason we need to store a copy of the minimum key is that we'll
233 * need to propagate it to the parent node when this page is linked
234 * into its parent. However, if the page is not a leaf page, the first
235 * entry on the page doesn't need to contain a key, so we will not have
236 * stored the key itself on the page. (You might think we could skip
237 * copying the minimum key on leaf pages, but actually we must have a
238 * writable copy anyway because we'll poke the page's address into it
239 * before passing it up to the parent...)
240 */
241 typedef struct BTPageState
242 {
243 Page btps_page; /* workspace for page building */
244 BlockNumber btps_blkno; /* block # to write this page at */
245 IndexTuple btps_minkey; /* copy of minimum key (first item) on page */
246 OffsetNumber btps_lastoff; /* last item offset loaded */
247 uint32 btps_level; /* tree level (0 = leaf) */
248 Size btps_full; /* "full" if less than this much free space */
249 struct BTPageState *btps_next; /* link to parent level, if any */
250 } BTPageState;
251
252 /*
253 * Overall status record for index writing phase.
254 */
255 typedef struct BTWriteState
256 {
257 Relation heap;
258 Relation index;
259 bool btws_use_wal; /* dump pages to WAL? */
260 BlockNumber btws_pages_alloced; /* # pages allocated */
261 BlockNumber btws_pages_written; /* # pages written out */
262 Page btws_zeropage; /* workspace for filling zeroes */
263 } BTWriteState;
264
265
266 static double _bt_spools_heapscan(Relation heap, Relation index,
267 BTBuildState *buildstate, IndexInfo *indexInfo);
268 static void _bt_spooldestroy(BTSpool *btspool);
269 static void _bt_spool(BTSpool *btspool, ItemPointer self,
270 Datum *values, bool *isnull);
271 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
272 static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
273 bool *isnull, bool tupleIsAlive, void *state);
274 static Page _bt_blnewpage(uint32 level);
275 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
276 static void _bt_slideleft(Page page);
277 static void _bt_sortaddtup(Page page, Size itemsize,
278 IndexTuple itup, OffsetNumber itup_off);
279 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
280 IndexTuple itup);
281 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
282 static void _bt_load(BTWriteState *wstate,
283 BTSpool *btspool, BTSpool *btspool2);
284 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
285 int request);
286 static void _bt_end_parallel(BTLeader *btleader);
287 static Size _bt_parallel_estimate_shared(Snapshot snapshot);
288 static double _bt_parallel_heapscan(BTBuildState *buildstate,
289 bool *brokenhotchain);
290 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
291 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
292 BTShared *btshared, Sharedsort *sharedsort,
293 Sharedsort *sharedsort2, int sortmem);
294
295
296 /*
297 * btbuild() -- build a new btree index.
298 */
299 IndexBuildResult *
300 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
301 {
302 IndexBuildResult *result;
303 BTBuildState buildstate;
304 double reltuples;
305
306 #ifdef BTREE_BUILD_STATS
307 if (log_btree_build_stats)
308 ResetUsage();
309 #endif /* BTREE_BUILD_STATS */
310
311 buildstate.isunique = indexInfo->ii_Unique;
312 buildstate.havedead = false;
313 buildstate.heap = heap;
314 buildstate.spool = NULL;
315 buildstate.spool2 = NULL;
316 buildstate.indtuples = 0;
317 buildstate.btleader = NULL;
318
319 /*
320 * We expect to be called exactly once for any index relation. If that's
321 * not the case, big trouble's what we have.
322 */
323 if (RelationGetNumberOfBlocks(index) != 0)
324 elog(ERROR, "index \"%s\" already contains data",
325 RelationGetRelationName(index));
326
327 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
328
329 /*
330 * Finish the build by (1) completing the sort of the spool file, (2)
331 * inserting the sorted tuples into btree pages and (3) building the upper
332 * levels. Finally, it may also be necessary to end use of parallelism.
333 */
334 _bt_leafbuild(buildstate.spool, buildstate.spool2);
335 _bt_spooldestroy(buildstate.spool);
336 if (buildstate.spool2)
337 _bt_spooldestroy(buildstate.spool2);
338 if (buildstate.btleader)
339 _bt_end_parallel(buildstate.btleader);
340
341 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
342
343 result->heap_tuples = reltuples;
344 result->index_tuples = buildstate.indtuples;
345
346 #ifdef BTREE_BUILD_STATS
347 if (log_btree_build_stats)
348 {
349 ShowUsage("BTREE BUILD STATS");
350 ResetUsage();
351 }
352 #endif /* BTREE_BUILD_STATS */
353
354 return result;
355 }
356
357 /*
358 * Create and initialize one or two spool structures, and save them in caller's
359 * buildstate argument. May also fill in fields within indexInfo used by index
360 * builds.
361 *
362 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
363 * routine encapsulates all aspects of managing parallelism. Caller need only
364 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
365 *
366 * Returns the total number of heap tuples scanned.
367 */
368 static double
369 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
370 IndexInfo *indexInfo)
371 {
372 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
373 SortCoordinate coordinate = NULL;
374 double reltuples = 0;
375
376 /*
377 * We size the sort area as maintenance_work_mem rather than work_mem to
378 * speed index creation. This should be OK since a single backend can't
379 * run multiple index creations in parallel (see also: notes on
380 * parallelism and maintenance_work_mem below).
381 */
382 btspool->heap = heap;
383 btspool->index = index;
384 btspool->isunique = indexInfo->ii_Unique;
385
386 /* Save as primary spool */
387 buildstate->spool = btspool;
388
389 /* Attempt to launch parallel worker scan when required */
390 if (indexInfo->ii_ParallelWorkers > 0)
391 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
392 indexInfo->ii_ParallelWorkers);
393
394 /*
395 * If parallel build requested and at least one worker process was
396 * successfully launched, set up coordination state
397 */
398 if (buildstate->btleader)
399 {
400 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
401 coordinate->isWorker = false;
402 coordinate->nParticipants =
403 buildstate->btleader->nparticipanttuplesorts;
404 coordinate->sharedsort = buildstate->btleader->sharedsort;
405 }
406
407 /*
408 * Begin serial/leader tuplesort.
409 *
410 * In cases where parallelism is involved, the leader receives the same
411 * share of maintenance_work_mem as a serial sort (it is generally treated
412 * in the same way as a serial sort once we return). Parallel worker
413 * Tuplesortstates will have received only a fraction of
414 * maintenance_work_mem, though.
415 *
416 * We rely on the lifetime of the Leader Tuplesortstate almost not
417 * overlapping with any worker Tuplesortstate's lifetime. There may be
418 * some small overlap, but that's okay because we rely on leader
419 * Tuplesortstate only allocating a small, fixed amount of memory here.
420 * When its tuplesort_performsort() is called (by our caller), and
421 * significant amounts of memory are likely to be used, all workers must
422 * have already freed almost all memory held by their Tuplesortstates
423 * (they are about to go away completely, too). The overall effect is
424 * that maintenance_work_mem always represents an absolute high watermark
425 * on the amount of memory used by a CREATE INDEX operation, regardless of
426 * the use of parallelism or any other factor.
427 */
428 buildstate->spool->sortstate =
429 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
430 maintenance_work_mem, coordinate,
431 false);
432
433 /*
434 * If building a unique index, put dead tuples in a second spool to keep
435 * them out of the uniqueness check. We expect that the second spool (for
436 * dead tuples) won't get very full, so we give it only work_mem.
437 */
438 if (indexInfo->ii_Unique)
439 {
440 BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
441 SortCoordinate coordinate2 = NULL;
442
443 /* Initialize secondary spool */
444 btspool2->heap = heap;
445 btspool2->index = index;
446 btspool2->isunique = false;
447 /* Save as secondary spool */
448 buildstate->spool2 = btspool2;
449
450 if (buildstate->btleader)
451 {
452 /*
453 * Set up non-private state that is passed to
454 * tuplesort_begin_index_btree() about the basic high level
455 * coordination of a parallel sort.
456 */
457 coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
458 coordinate2->isWorker = false;
459 coordinate2->nParticipants =
460 buildstate->btleader->nparticipanttuplesorts;
461 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
462 }
463
464 /*
465 * We expect that the second one (for dead tuples) won't get very
466 * full, so we give it only work_mem
467 */
468 buildstate->spool2->sortstate =
469 tuplesort_begin_index_btree(heap, index, false, work_mem,
470 coordinate2, false);
471 }
472
473 /* Fill spool using either serial or parallel heap scan */
474 if (!buildstate->btleader)
475 reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
476 _bt_build_callback, (void *) buildstate,
477 NULL);
478 else
479 reltuples = _bt_parallel_heapscan(buildstate,
480 &indexInfo->ii_BrokenHotChain);
481
482 /* okay, all heap tuples are spooled */
483 if (buildstate->spool2 && !buildstate->havedead)
484 {
485 /* spool2 turns out to be unnecessary */
486 _bt_spooldestroy(buildstate->spool2);
487 buildstate->spool2 = NULL;
488 }
489
490 return reltuples;
491 }
492
493 /*
494 * clean up a spool structure and its substructures.
495 */
496 static void
497 _bt_spooldestroy(BTSpool *btspool)
498 {
499 tuplesort_end(btspool->sortstate);
500 pfree(btspool);
501 }
502
503 /*
504 * spool an index entry into the sort file.
505 */
506 static void
507 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
508 {
509 tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
510 self, values, isnull);
511 }
512
513 /*
514 * given a spool loaded by successive calls to _bt_spool,
515 * create an entire btree.
516 */
517 static void
518 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
519 {
520 BTWriteState wstate;
521
522 #ifdef BTREE_BUILD_STATS
523 if (log_btree_build_stats)
524 {
525 ShowUsage("BTREE BUILD (Spool) STATISTICS");
526 ResetUsage();
527 }
528 #endif /* BTREE_BUILD_STATS */
529
530 tuplesort_performsort(btspool->sortstate);
531 if (btspool2)
532 tuplesort_performsort(btspool2->sortstate);
533
534 wstate.heap = btspool->heap;
535 wstate.index = btspool->index;
536
537 /*
538 * We need to log index creation in WAL iff WAL archiving/streaming is
539 * enabled UNLESS the index isn't WAL-logged anyway.
540 */
541 wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);
542
543 /* reserve the metapage */
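/* BTREE_METAPAGE is block 0, so the first tree page proper goes at block 1 */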
544 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
545 wstate.btws_pages_written = 0;
546 wstate.btws_zeropage = NULL; /* until needed */
547
548 _bt_load(&wstate, btspool, btspool2);
549 }
550
551 /*
552 * Per-tuple callback from IndexBuildHeapScan
553 */
554 static void
555 _bt_build_callback(Relation index,
556 HeapTuple htup,
557 Datum *values,
558 bool *isnull,
559 bool tupleIsAlive,
560 void *state)
561 {
562 BTBuildState *buildstate = (BTBuildState *) state;
563
564 /*
565 * insert the index tuple into the appropriate spool file for subsequent
566 * processing
567 */
568 if (tupleIsAlive || buildstate->spool2 == NULL)
569 _bt_spool(buildstate->spool, &htup->t_self, values, isnull);
570 else
571 {
572 /* dead tuples are put into spool2 */
573 buildstate->havedead = true;
574 _bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
575 }
576
577 buildstate->indtuples += 1;
578 }
579
580 /*
581 * allocate workspace for a new, clean btree page, not linked to any siblings.
582 */
583 static Page
584 _bt_blnewpage(uint32 level)
585 {
586 Page page;
587 BTPageOpaque opaque;
588
589 page = (Page) palloc(BLCKSZ);
590
591 /* Zero the page and set up standard page header info */
592 _bt_pageinit(page, BLCKSZ);
593
594 /* Initialize BT opaque state */
595 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
596 opaque->btpo_prev = opaque->btpo_next = P_NONE;
597 opaque->btpo.level = level;
598 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
599 opaque->btpo_cycleid = 0;
600
601 /* Make the P_HIKEY line pointer appear allocated */
602 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
603
604 return page;
605 }
606
607 /*
608 * emit a completed btree page, and release the working storage.
609 */
610 static void
611 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
612 {
613 /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
614 RelationOpenSmgr(wstate->index);
615
616 /* XLOG stuff */
617 if (wstate->btws_use_wal)
618 {
619 /* We use the heap NEWPAGE record type for this */
620 log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
621 }
622
623 /*
624 * If we have to write pages nonsequentially, fill in the space with
625 * zeroes until we come back and overwrite. This is not logically
626 * necessary on standard Unix filesystems (unwritten space will read as
627 * zeroes anyway), but it should help to avoid fragmentation. The dummy
628 * pages aren't WAL-logged though.
629 */
630 while (blkno > wstate->btws_pages_written)
631 {
632 if (!wstate->btws_zeropage)
633 wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
634 /* don't set checksum for all-zero page */
635 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
636 wstate->btws_pages_written++,
637 (char *) wstate->btws_zeropage,
638 true);
639 }
640
641 PageSetChecksumInplace(page, blkno);
642
643 /*
644 * Now write the page. There's no need for smgr to schedule an fsync for
645 * this write; we'll do it ourselves before ending the build.
646 */
647 if (blkno == wstate->btws_pages_written)
648 {
649 /* extending the file... */
650 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
651 (char *) page, true);
652 wstate->btws_pages_written++;
653 }
654 else
655 {
656 /* overwriting a block we zero-filled before */
657 smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
658 (char *) page, true);
659 }
660
661 pfree(page);
662 }
663
664 /*
665 * allocate and initialize a new BTPageState. the returned structure
666 * is suitable for immediate use by _bt_buildadd.
667 */
668 static BTPageState *
669 _bt_pagestate(BTWriteState *wstate, uint32 level)
670 {
671 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
672
673 /* create initial page for level */
674 state->btps_page = _bt_blnewpage(level);
675
676 /* and assign it a page position */
677 state->btps_blkno = wstate->btws_pages_alloced++;
678
679 state->btps_minkey = NULL;
680 /* initialize lastoff so first item goes into P_FIRSTKEY */
681 state->btps_lastoff = P_HIKEY;
682 state->btps_level = level;
683 /* set "full" threshold based on level. See notes at head of file. */
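/*
 * For example, with the default 8192-byte BLCKSZ and the 70%
 * BTREE_NONLEAF_FILLFACTOR, an upper page is treated as full once less
 * than 2457 bytes of free space remain.
 */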
684 if (level > 0)
685 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
686 else
687 state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
688 BTREE_DEFAULT_FILLFACTOR);
689 /* no parent level, yet */
690 state->btps_next = NULL;
691
692 return state;
693 }
694
695 /*
696 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
697 * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover
698 * that we have built an ItemId array in what has turned out to be a
699 * P_RIGHTMOST page.
700 */
701 static void
702 _bt_slideleft(Page page)
703 {
704 OffsetNumber off;
705 OffsetNumber maxoff;
706 ItemId previi;
707 ItemId thisii;
708
709 if (!PageIsEmpty(page))
710 {
711 maxoff = PageGetMaxOffsetNumber(page);
712 previi = PageGetItemId(page, P_HIKEY);
713 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
714 {
715 thisii = PageGetItemId(page, off);
716 *previi = *thisii;
717 previi = thisii;
718 }
719 ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
720 }
721 }
722
723 /*
724 * Add an item to a page being built.
725 *
726 * The main difference between this routine and a bare PageAddItem call
727 * is that this code knows that the leftmost data item on a non-leaf
728 * btree page doesn't need to have a key. Therefore, it strips such
729 * items down to just the item header.
730 *
731 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
732 * that because it assumes that P_RIGHTMOST() will return the correct
733 * answer for the page. Here, we don't know yet if the page will be
734 * rightmost. Offset P_FIRSTKEY is always the first data key.
735 */
736 static void
737 _bt_sortaddtup(Page page,
738 Size itemsize,
739 IndexTuple itup,
740 OffsetNumber itup_off)
741 {
742 BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
743 IndexTupleData trunctuple;
744
745 if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
746 {
747 trunctuple = *itup;
748 trunctuple.t_info = sizeof(IndexTupleData);
749 BTreeTupleSetNAtts(&trunctuple, 0);
750 itup = &trunctuple;
751 itemsize = sizeof(IndexTupleData);
752 }
753
754 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
755 false, false) == InvalidOffsetNumber)
756 elog(ERROR, "failed to add item to the index page");
757 }
758
759 /*----------
760 * Add an item to a disk page from the sort output.
761 *
762 * We must be careful to observe the page layout conventions of nbtsearch.c:
763 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
764 * - on non-leaf pages, the key portion of the first item need not be
765 * stored, we should store only the link.
766 *
767 * A leaf page being built looks like:
768 *
769 * +----------------+---------------------------------+
770 * | PageHeaderData | linp0 linp1 linp2 ...           |
771 * +-----------+----+---------------------------------+
772 * | ... linpN |                                      |
773 * +-----------+--------------------------------------+
774 * |     ^ last                                       |
775 * |                                                  |
776 * +-------------+------------------------------------+
777 * |             | itemN ...                          |
778 * +-------------+------------------+-----------------+
779 * |          ... item3 item2 item1 | "special space" |
780 * +--------------------------------+-----------------+
781 *
782 * Contrast this with the diagram in bufpage.h; note the mismatch
783 * between linps and items. This is because we reserve linp0 as a
784 * placeholder for the pointer to the "high key" item; when we have
785 * filled up the page, we will set linp0 to point to itemN and clear
786 * linpN. On the other hand, if we find this is the last (rightmost)
787 * page, we leave the items alone and slide the linp array over. If
788 * the high key is to be truncated, offset 1 is deleted, and we insert
789 * the truncated high key at offset 1.
790 *
791 * 'last' pointer indicates the last offset added to the page.
792 *----------
793 */
794 static void
795 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
796 {
797 Page npage;
798 BlockNumber nblkno;
799 OffsetNumber last_off;
800 Size pgspc;
801 Size itupsz;
802 int indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
803 int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);
804
805 /*
806 * This is a handy place to check for cancel interrupts during the btree
807 * load phase of index creation.
808 */
809 CHECK_FOR_INTERRUPTS();
810
811 npage = state->btps_page;
812 nblkno = state->btps_blkno;
813 last_off = state->btps_lastoff;
814
815 pgspc = PageGetFreeSpace(npage);
816 itupsz = IndexTupleSize(itup);
817 itupsz = MAXALIGN(itupsz);
818
819 /*
820 * Check whether the item can fit on a btree page at all. (Eventually, we
821 * ought to try to apply TOAST methods if not.) We actually need to be
822 * able to fit three items on every page, so restrict any one item to 1/3
823 * the per-page available space. Note that at this point, itupsz doesn't
824 * include the ItemId.
825 *
826 * NOTE: similar code appears in _bt_insertonpg() to defend against
827 * oversize items being inserted into an already-existing index. But
828 * during creation of an index, we don't go through there.
829 */
830 if (itupsz > BTMaxItemSize(npage))
831 ereport(ERROR,
832 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
833 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
834 itupsz, BTMaxItemSize(npage),
835 RelationGetRelationName(wstate->index)),
836 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
837 "Consider a function index of an MD5 hash of the value, "
838 "or use full text indexing."),
839 errtableconstraint(wstate->heap,
840 RelationGetRelationName(wstate->index))));
841
842 /*
843 * Check to see if page is "full". It's definitely full if the item won't
844 * fit. Otherwise, compare to the target freespace derived from the
845 * fillfactor. However, we must put at least two items on each page, so
846 * disregard fillfactor if we don't have that many.
847 */
848 if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
849 {
850 /*
851 * Finish off the page and write it out.
852 */
853 Page opage = npage;
854 BlockNumber oblkno = nblkno;
855 ItemId ii;
856 ItemId hii;
857 IndexTuple oitup;
858 BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);
859
860 /* Create new page of same level */
861 npage = _bt_blnewpage(state->btps_level);
862
863 /* and assign it a page position */
864 nblkno = wstate->btws_pages_alloced++;
865
866 /*
867 * We copy the last item on the page into the new page, and then
868 * rearrange the old page so that the 'last item' becomes its high key
869 * rather than a true data item. There had better be at least two
870 * items on the page already, else the page would be empty of useful
871 * data.
872 */
873 Assert(last_off > P_FIRSTKEY);
874 ii = PageGetItemId(opage, last_off);
875 oitup = (IndexTuple) PageGetItem(opage, ii);
876 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);
877
878 /*
879 * Move 'last' into the high key position on opage
880 */
881 hii = PageGetItemId(opage, P_HIKEY);
882 *hii = *ii;
883 ItemIdSetUnused(ii); /* redundant */
884 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
885
886 if (indnkeyatts != indnatts && P_ISLEAF(opageop))
887 {
888 IndexTuple truncated;
889 Size truncsz;
890
891 /*
892 * Truncate any non-key attributes from high key on leaf level
893 * (i.e. truncate on leaf level if we're building an INCLUDE
894 * index). This is only done at the leaf level because downlinks
895 * in internal pages are either negative infinity items, or get
896 * their contents from copying from one level down. See also:
897 * _bt_split().
898 *
899 * Since the truncated tuple is probably smaller than the
900 * original, it cannot just be copied in place (besides, we want
901 * to actually save space on the leaf page). We delete the
902 * original high key, and add our own truncated high key at the
903 * same offset.
904 *
905 * Note that the page layout won't be changed very much. oitup is
906 * already located at the physical beginning of tuple space, so we
907 * only shift the line pointer array back and forth, and overwrite
908 * the latter portion of the space occupied by the original tuple.
909 * This is fairly cheap.
910 */
911 truncated = _bt_nonkey_truncate(wstate->index, oitup);
912 truncsz = IndexTupleSize(truncated);
913 PageIndexTupleDelete(opage, P_HIKEY);
914 _bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
915 pfree(truncated);
916
917 /* oitup should continue to point to the page's high key */
918 hii = PageGetItemId(opage, P_HIKEY);
919 oitup = (IndexTuple) PageGetItem(opage, hii);
920 }
921
922 /*
923 * Link the old page into its parent, using its minimum key. If we
924 * don't have a parent, we have to create one; this adds a new btree
925 * level.
926 */
927 if (state->btps_next == NULL)
928 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
929
930 Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
931 IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
932 P_LEFTMOST(opageop));
933 Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
934 !P_LEFTMOST(opageop));
935 BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
936 _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
937 pfree(state->btps_minkey);
938
939 /*
940 * Save a copy of the minimum key for the new page. We have to copy
941 * it off the old page, not the new one, in case we are not at leaf
942 * level.
943 */
944 state->btps_minkey = CopyIndexTuple(oitup);
945
946 /*
947 * Set the sibling links for both pages.
948 */
949 {
950 BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
951 BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
952
953 oopaque->btpo_next = nblkno;
954 nopaque->btpo_prev = oblkno;
955 nopaque->btpo_next = P_NONE; /* redundant */
956 }
957
958 /*
959 * Write out the old page. We never need to touch it again, so we can
960 * free the opage workspace too.
961 */
962 _bt_blwritepage(wstate, opage, oblkno);
963
964 /*
965 * Reset last_off to point to new page
966 */
967 last_off = P_FIRSTKEY;
968 }
969
970 /*
971 * If the new item is the first for its page, stash a copy for later. Note
972 * this will only happen for the first item on a level; on later pages,
973 * the first item for a page is copied from the prior page in the code
974 * above. Since the minimum key for an entire level is only used as a
975 * minus infinity downlink, and never as a high key, there is no need to
976 * truncate away non-key attributes at this point.
977 */
978 if (last_off == P_HIKEY)
979 {
980 Assert(state->btps_minkey == NULL);
981 state->btps_minkey = CopyIndexTuple(itup);
982 /* _bt_sortaddtup() will perform full truncation later */
983 BTreeTupleSetNAtts(state->btps_minkey, 0);
984 }
985
986 /*
987 * Add the new item into the current page.
988 */
989 last_off = OffsetNumberNext(last_off);
990 _bt_sortaddtup(npage, itupsz, itup, last_off);
991
992 state->btps_page = npage;
993 state->btps_blkno = nblkno;
994 state->btps_lastoff = last_off;
995 }
996
997 /*
998 * Finish writing out the completed btree.
999 */
1000 static void
1001 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1002 {
1003 BTPageState *s;
1004 BlockNumber rootblkno = P_NONE;
1005 uint32 rootlevel = 0;
1006 Page metapage;
1007
1008 /*
1009 * Each iteration of this loop completes one more level of the tree.
1010 */
1011 for (s = state; s != NULL; s = s->btps_next)
1012 {
1013 BlockNumber blkno;
1014 BTPageOpaque opaque;
1015
1016 blkno = s->btps_blkno;
1017 opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1018
1019 /*
1020 * We have to link the last page on this level to somewhere.
1021 *
1022 * If we're at the top, it's the root, so attach it to the metapage.
1023 * Otherwise, add an entry for it to its parent using its minimum key.
1024 * This may cause the last page of the parent level to split, but
1025 * that's not a problem -- we haven't gotten to it yet.
1026 */
1027 if (s->btps_next == NULL)
1028 {
1029 opaque->btpo_flags |= BTP_ROOT;
1030 rootblkno = blkno;
1031 rootlevel = s->btps_level;
1032 }
1033 else
1034 {
1035 Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
1036 IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
1037 P_LEFTMOST(opaque));
1038 Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
1039 !P_LEFTMOST(opaque));
1040 BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
1041 _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
1042 pfree(s->btps_minkey);
1043 s->btps_minkey = NULL;
1044 }
1045
1046 /*
1047 * This is the rightmost page, so the ItemId array needs to be slid
1048 * back one slot. Then we can dump out the page.
1049 */
1050 _bt_slideleft(s->btps_page);
1051 _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1052 s->btps_page = NULL; /* writepage freed the workspace */
1053 }
1054
1055 /*
1056 * As the last step in the process, construct the metapage and make it
1057 * point to the new root (unless we had no data at all, in which case it's
1058 * set to point to "P_NONE"). This changes the index to the "valid" state
1059 * by filling in a valid magic number in the metapage.
1060 */
1061 metapage = (Page) palloc(BLCKSZ);
1062 _bt_initmetapage(metapage, rootblkno, rootlevel);
1063 _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1064 }
1065
1066 /*
1067 * Read tuples in correct sort order from tuplesort, and load them into
1068 * btree leaves.
1069 */
1070 static void
1071 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1072 {
1073 BTPageState *state = NULL;
1074 bool merge = (btspool2 != NULL);
1075 IndexTuple itup,
1076 itup2 = NULL;
1077 bool load1;
1078 TupleDesc tupdes = RelationGetDescr(wstate->index);
1079 int i,
1080 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1081 ScanKey indexScanKey = NULL;
1082 SortSupport sortKeys;
1083
1084 if (merge)
1085 {
1086 /*
1087 * Another BTSpool for dead tuples exists. Now we have to merge
1088 * btspool and btspool2.
1089 */
1090
1091 /* the preparation of merge */
1092 itup = tuplesort_getindextuple(btspool->sortstate, true);
1093 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1094 indexScanKey = _bt_mkscankey_nodata(wstate->index);
1095
1096 /* Prepare SortSupport data for each column */
1097 sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1098
1099 for (i = 0; i < keysz; i++)
1100 {
1101 SortSupport sortKey = sortKeys + i;
1102 ScanKey scanKey = indexScanKey + i;
1103 int16 strategy;
1104
1105 sortKey->ssup_cxt = CurrentMemoryContext;
1106 sortKey->ssup_collation = scanKey->sk_collation;
1107 sortKey->ssup_nulls_first =
1108 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1109 sortKey->ssup_attno = scanKey->sk_attno;
1110 /* Abbreviation is not supported here */
1111 sortKey->abbreviate = false;
1112
1113 AssertState(sortKey->ssup_attno != 0);
1114
1115 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1116 BTGreaterStrategyNumber : BTLessStrategyNumber;
1117
1118 PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1119 }
1120
1121 _bt_freeskey(indexScanKey);
1122
1123 for (;;)
1124 {
1125 load1 = true; /* load BTSpool next ? */
1126 if (itup2 == NULL)
1127 {
1128 if (itup == NULL)
1129 break;
1130 }
1131 else if (itup != NULL)
1132 {
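/*
 * Compare the two candidate tuples key column by key column. If every
 * key column compares equal, load1 stays true and the tuple from the
 * main spool is taken first.
 */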
1133 for (i = 1; i <= keysz; i++)
1134 {
1135 SortSupport entry;
1136 Datum attrDatum1,
1137 attrDatum2;
1138 bool isNull1,
1139 isNull2;
1140 int32 compare;
1141
1142 entry = sortKeys + i - 1;
1143 attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1144 attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1145
1146 compare = ApplySortComparator(attrDatum1, isNull1,
1147 attrDatum2, isNull2,
1148 entry);
1149 if (compare > 0)
1150 {
1151 load1 = false;
1152 break;
1153 }
1154 else if (compare < 0)
1155 break;
1156 }
1157 }
1158 else
1159 load1 = false;
1160
1161 /* When we see first tuple, create first index page */
1162 if (state == NULL)
1163 state = _bt_pagestate(wstate, 0);
1164
1165 if (load1)
1166 {
1167 _bt_buildadd(wstate, state, itup);
1168 itup = tuplesort_getindextuple(btspool->sortstate, true);
1169 }
1170 else
1171 {
1172 _bt_buildadd(wstate, state, itup2);
1173 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1174 }
1175 }
1176 pfree(sortKeys);
1177 }
1178 else
1179 {
1180 /* merge is unnecessary */
1181 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1182 true)) != NULL)
1183 {
1184 /* When we see first tuple, create first index page */
1185 if (state == NULL)
1186 state = _bt_pagestate(wstate, 0);
1187
1188 _bt_buildadd(wstate, state, itup);
1189 }
1190 }
1191
1192 /* Close down final pages and write the metapage */
1193 _bt_uppershutdown(wstate, state);
1194
1195 /*
1196 * If the index is WAL-logged, we must fsync it down to disk before it's
1197 * safe to commit the transaction. (For a non-WAL-logged index we don't
1198 * care since the index will be uninteresting after a crash anyway.)
1199 *
1200 * It's obvious that we must do this when not WAL-logging the build. It's
1201 * less obvious that we have to do it even if we did WAL-log the index
1202 * pages. The reason is that since we're building outside shared buffers,
1203 * a CHECKPOINT occurring during the build has no way to flush the
1204 * previously written data to disk (indeed it won't know the index even
1205 * exists). A crash later on would replay WAL from the checkpoint,
1206 * therefore it wouldn't replay our earlier WAL entries. If we do not
1207 * fsync those pages here, they might still not be on disk when the crash
1208 * occurs.
1209 */
1210 if (RelationNeedsWAL(wstate->index))
1211 {
1212 RelationOpenSmgr(wstate->index);
1213 smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1214 }
1215 }
1216
1217 /*
1218 * Create parallel context, and launch workers for leader.
1219 *
1220 * buildstate argument should be initialized (with the exception of the
1221 * tuplesort state in spools, which may later be created based on shared
1222 * state initially set up here).
1223 *
1224 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1225 *
1226 * request is the target number of parallel worker processes to launch.
1227 *
1228 * Sets buildstate's BTLeader, which caller must use to shut down parallel
1229 * mode by passing it to _bt_end_parallel() at the very end of its index
1230 * build. If not even a single worker process can be launched, this is
1231 * never set, and caller should proceed with a serial index build.
1232 */
1233 static void
1234 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1235 {
1236 ParallelContext *pcxt;
1237 int scantuplesortstates;
1238 Snapshot snapshot;
1239 Size estbtshared;
1240 Size estsort;
1241 BTShared *btshared;
1242 Sharedsort *sharedsort;
1243 Sharedsort *sharedsort2;
1244 BTSpool *btspool = buildstate->spool;
1245 BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1246 BufferUsage *bufferusage;
1247 bool leaderparticipates = true;
1248 int querylen;
1249
1250 #ifdef DISABLE_LEADER_PARTICIPATION
1251 leaderparticipates = false;
1252 #endif
1253
1254 /*
1255 * Enter parallel mode, and create context for parallel build of btree
1256 * index
1257 */
1258 EnterParallelMode();
1259 Assert(request > 0);
1260 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1261 request, true);
1262
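/*
 * scantuplesortstates is the number of tuplesort states expected across
 * all participants: one per requested worker, plus one for the leader
 * when it participates as a worker.
 */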
1263 scantuplesortstates = leaderparticipates ? request + 1 : request;
1264
1265 /*
1266 * Prepare for scan of the base relation. In a normal index build, we use
1267 * SnapshotAny because we must retrieve all tuples and do our own time
1268 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1269 * concurrent build, we take a regular MVCC snapshot and index whatever's
1270 * live according to that.
1271 */
1272 if (!isconcurrent)
1273 snapshot = SnapshotAny;
1274 else
1275 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1276
1277 /*
1278 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1279 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1280 */
1281 estbtshared = _bt_parallel_estimate_shared(snapshot);
1282 shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1283 estsort = tuplesort_estimate_shared(scantuplesortstates);
1284 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1285
1286 /*
1287 * Unique case requires a second spool, and so we may have to account for
1288 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1289 */
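/*
 * Either way we count keys for PARALLEL_KEY_BTREE_SHARED and
 * PARALLEL_KEY_TUPLESORT; the unique case adds a third key for the
 * second tuplesort workspace.
 */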
1290 if (!btspool->isunique)
1291 shm_toc_estimate_keys(&pcxt->estimator, 2);
1292 else
1293 {
1294 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1295 shm_toc_estimate_keys(&pcxt->estimator, 3);
1296 }
1297
1298 /*
1299 * Estimate space for BufferUsage -- PARALLEL_KEY_BUFFER_USAGE.
1300 *
1301 * If there are no extensions loaded that care, we could skip this. We
1302 * have no way of knowing whether anyone's looking at pgBufferUsage, so do
1303 * it unconditionally.
1304 */
1305 shm_toc_estimate_chunk(&pcxt->estimator,
1306 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1307 shm_toc_estimate_keys(&pcxt->estimator, 1);
1308
1309 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1310 if (debug_query_string)
1311 {
1312 querylen = strlen(debug_query_string);
1313 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1314 shm_toc_estimate_keys(&pcxt->estimator, 1);
1315 }
1316 else
1317 querylen = 0; /* keep compiler quiet */
1318
1319 /* Everyone's had a chance to ask for space, so now create the DSM */
1320 InitializeParallelDSM(pcxt);
1321
1322 /* If no DSM segment was available, back out (do serial build) */
1323 if (pcxt->seg == NULL)
1324 {
1325 if (IsMVCCSnapshot(snapshot))
1326 UnregisterSnapshot(snapshot);
1327 DestroyParallelContext(pcxt);
1328 ExitParallelMode();
1329 return;
1330 }
1331
1332 /* Store shared build state, for which we reserved space */
1333 btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1334 /* Initialize immutable state */
1335 btshared->heaprelid = RelationGetRelid(btspool->heap);
1336 btshared->indexrelid = RelationGetRelid(btspool->index);
1337 btshared->isunique = btspool->isunique;
1338 btshared->isconcurrent = isconcurrent;
1339 btshared->scantuplesortstates = scantuplesortstates;
1340 ConditionVariableInit(&btshared->workersdonecv);
1341 SpinLockInit(&btshared->mutex);
1342 /* Initialize mutable state */
1343 btshared->nparticipantsdone = 0;
1344 btshared->reltuples = 0.0;
1345 btshared->havedead = false;
1346 btshared->indtuples = 0.0;
1347 btshared->brokenhotchain = false;
1348 heap_parallelscan_initialize(&btshared->heapdesc, btspool->heap, snapshot);
1349
1350 /*
1351 * Store shared tuplesort-private state, for which we reserved space.
1352 * Then, initialize opaque state using tuplesort routine.
1353 */
1354 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1355 tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1356 pcxt->seg);
1357
1358 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1359 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1360
1361 /* Unique case requires a second spool, and associated shared state */
1362 if (!btspool->isunique)
1363 sharedsort2 = NULL;
1364 else
1365 {
1366 /*
1367 * Store additional shared tuplesort-private state, for which we
1368 * reserved space. Then, initialize opaque state using tuplesort
1369 * routine.
1370 */
1371 sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1372 tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1373 pcxt->seg);
1374
1375 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1376 }
1377
1378 /* Store query string for workers */
1379 if (debug_query_string)
1380 {
1381 char *sharedquery;
1382
1383 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1384 memcpy(sharedquery, debug_query_string, querylen + 1);
1385 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1386 }
1387
1388 /* Allocate space for each worker's BufferUsage; no need to initialize */
1389 bufferusage = shm_toc_allocate(pcxt->toc,
1390 mul_size(sizeof(BufferUsage), pcxt->nworkers));
1391 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1392
1393 /* Launch workers, saving status for leader/caller */
1394 LaunchParallelWorkers(pcxt);
1395 btleader->pcxt = pcxt;
1396 btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1397 if (leaderparticipates)
1398 btleader->nparticipanttuplesorts++;
1399 btleader->btshared = btshared;
1400 btleader->sharedsort = sharedsort;
1401 btleader->sharedsort2 = sharedsort2;
1402 btleader->snapshot = snapshot;
1403 btleader->bufferusage = bufferusage;
1404
1405 /* If no workers were successfully launched, back out (do serial build) */
1406 if (pcxt->nworkers_launched == 0)
1407 {
1408 _bt_end_parallel(btleader);
1409 return;
1410 }
1411
1412 /* Save leader state now that it's clear build will be parallel */
1413 buildstate->btleader = btleader;
1414
1415 /* Join heap scan ourselves */
1416 if (leaderparticipates)
1417 _bt_leader_participate_as_worker(buildstate);
1418
1419 /*
1420 * Caller needs to wait for all launched workers when we return. Make
1421 * sure that the failure-to-start case will not hang forever.
1422 */
1423 WaitForParallelWorkersToAttach(pcxt);
1424 }
1425
1426 /*
1427 * Shut down workers, destroy parallel context, and end parallel mode.
1428 */
1429 static void
1430 _bt_end_parallel(BTLeader *btleader)
1431 {
1432 int i;
1433
1434 /* Shutdown worker processes */
1435 WaitForParallelWorkersToFinish(btleader->pcxt);
1436
1437 /*
1438 * Next, accumulate buffer usage. (This must wait for the workers to
1439 * finish, or we might get incomplete data.)
1440 */
1441 for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
1442 InstrAccumParallelQuery(&btleader->bufferusage[i]);
1443
1444 /* Free last reference to MVCC snapshot, if one was used */
1445 if (IsMVCCSnapshot(btleader->snapshot))
1446 UnregisterSnapshot(btleader->snapshot);
1447 DestroyParallelContext(btleader->pcxt);
1448 ExitParallelMode();
1449 }
1450
1451 /*
1452 * Returns size of shared memory required to store state for a parallel
1453 * btree index build based on the snapshot its parallel scan will use.
1454 */
1455 static Size
1456 _bt_parallel_estimate_shared(Snapshot snapshot)
1457 {
1458 if (!IsMVCCSnapshot(snapshot))
1459 {
1460 Assert(snapshot == SnapshotAny);
1461 return sizeof(BTShared);
1462 }
1463
1464 return add_size(offsetof(BTShared, heapdesc) +
1465 offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1466 EstimateSnapshotSpace(snapshot));
1467 }
1468
1469 /*
1470 * Within leader, wait for end of heap scan.
1471 *
1472 * When called, parallel heap scan started by _bt_begin_parallel() will
1473 * already be underway within worker processes (when leader participates
1474 * as a worker, we should end up here just as workers are finishing).
1475 *
1476 * Fills in fields needed for ambuild statistics, and lets caller set
1477 * field indicating that some worker encountered a broken HOT chain.
1478 *
1479 * Returns the total number of heap tuples scanned.
1480 */
1481 static double
1482 _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1483 {
1484 BTShared *btshared = buildstate->btleader->btshared;
1485 int nparticipanttuplesorts;
1486 double reltuples;
1487
1488 nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1489 for (;;)
1490 {
1491 SpinLockAcquire(&btshared->mutex);
1492 if (btshared->nparticipantsdone == nparticipanttuplesorts)
1493 {
1494 buildstate->havedead = btshared->havedead;
1495 buildstate->indtuples = btshared->indtuples;
1496 *brokenhotchain = btshared->brokenhotchain;
1497 reltuples = btshared->reltuples;
1498 SpinLockRelease(&btshared->mutex);
1499 break;
1500 }
1501 SpinLockRelease(&btshared->mutex);
1502
1503 ConditionVariableSleep(&btshared->workersdonecv,
1504 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1505 }
1506
1507 ConditionVariableCancelSleep();
1508
1509 return reltuples;
1510 }
1511
1512 /*
1513 * Within leader, participate as a parallel worker.
1514 */
1515 static void
1516 _bt_leader_participate_as_worker(BTBuildState *buildstate)
1517 {
1518 BTLeader *btleader = buildstate->btleader;
1519 BTSpool *leaderworker;
1520 BTSpool *leaderworker2;
1521 int sortmem;
1522
1523 /* Allocate memory and initialize private spool */
1524 leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
1525 leaderworker->heap = buildstate->spool->heap;
1526 leaderworker->index = buildstate->spool->index;
1527 leaderworker->isunique = buildstate->spool->isunique;
1528
1529 /* Initialize second spool, if required */
1530 if (!btleader->btshared->isunique)
1531 leaderworker2 = NULL;
1532 else
1533 {
1534 /* Allocate memory for worker's own private secondary spool */
1535 leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));
1536
1537 /* Initialize worker's own secondary spool */
1538 leaderworker2->heap = leaderworker->heap;
1539 leaderworker2->index = leaderworker->index;
1540 leaderworker2->isunique = false;
1541 }
1542
1543 /*
1544 * Might as well use a reliable figure when doling out maintenance_work_mem
1545 * (when the requested number of workers was not launched, this will be
1546 * somewhat higher than it is for other workers).
1547 */
1548 sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1549
1550 /* Perform work common to all participants */
1551 _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1552 btleader->sharedsort, btleader->sharedsort2,
1553 sortmem);
1554
1555 #ifdef BTREE_BUILD_STATS
1556 if (log_btree_build_stats)
1557 {
1558 ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1559 ResetUsage();
1560 }
1561 #endif /* BTREE_BUILD_STATS */
1562 }
1563
1564 /*
1565 * Perform work within a launched parallel process.
1566 */
1567 void
1568 _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1569 {
1570 char *sharedquery;
1571 BTSpool *btspool;
1572 BTSpool *btspool2;
1573 BTShared *btshared;
1574 Sharedsort *sharedsort;
1575 Sharedsort *sharedsort2;
1576 Relation heapRel;
1577 Relation indexRel;
1578 LOCKMODE heapLockmode;
1579 LOCKMODE indexLockmode;
1580 BufferUsage *bufferusage;
1581 int sortmem;
1582
1583 #ifdef BTREE_BUILD_STATS
1584 if (log_btree_build_stats)
1585 ResetUsage();
1586 #endif /* BTREE_BUILD_STATS */
1587
1588 /* Set debug_query_string for individual workers first */
1589 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
1590 debug_query_string = sharedquery;
1591
1592 /* Report the query string from leader */
1593 pgstat_report_activity(STATE_RUNNING, debug_query_string);
1594
1595 /* Look up nbtree shared state */
1596 btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1597
1598 /* Open relations using lock modes known to be obtained by index.c */
1599 if (!btshared->isconcurrent)
1600 {
1601 heapLockmode = ShareLock;
1602 indexLockmode = AccessExclusiveLock;
1603 }
1604 else
1605 {
1606 heapLockmode = ShareUpdateExclusiveLock;
1607 indexLockmode = RowExclusiveLock;
1608 }
1609
1610 /* Open relations within worker */
1611 heapRel = heap_open(btshared->heaprelid, heapLockmode);
1612 indexRel = index_open(btshared->indexrelid, indexLockmode);
1613
1614 /* Initialize worker's own spool */
1615 btspool = (BTSpool *) palloc0(sizeof(BTSpool));
1616 btspool->heap = heapRel;
1617 btspool->index = indexRel;
1618 btspool->isunique = btshared->isunique;
1619
1620 /* Look up shared state private to tuplesort.c */
1621 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1622 tuplesort_attach_shared(sharedsort, seg);
1623 if (!btshared->isunique)
1624 {
1625 btspool2 = NULL;
1626 sharedsort2 = NULL;
1627 }
1628 else
1629 {
1630 /* Allocate memory for worker's own private secondary spool */
1631 btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
1632
1633 /* Initialize worker's own secondary spool */
1634 btspool2->heap = btspool->heap;
1635 btspool2->index = btspool->index;
1636 btspool2->isunique = false;
1637 /* Look up shared state private to tuplesort.c */
1638 sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1639 tuplesort_attach_shared(sharedsort2, seg);
1640 }
1641
1642 /* Prepare to track buffer usage during parallel execution */
1643 InstrStartParallelQuery();
1644
1645 /* Perform sorting of spool, and possibly a spool2 */
1646 sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1647 _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1648 sharedsort2, sortmem);
1649
1650 /* Report buffer usage during parallel execution */
1651 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
1652 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber]);
1653
1654 #ifdef BTREE_BUILD_STATS
1655 if (log_btree_build_stats)
1656 {
1657 ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1658 ResetUsage();
1659 }
1660 #endif /* BTREE_BUILD_STATS */
1661
1662 index_close(indexRel, indexLockmode);
1663 heap_close(heapRel, heapLockmode);
1664 }
1665
1666 /*
1667 * Perform a worker's portion of a parallel sort.
1668 *
1669 * This generates a tuplesort for passed btspool, and a second tuplesort
1670 * state if a second btspool is needed (i.e. for unique index builds). All
1671 * other spool fields should already be set when this is called.
1672 *
1673 * sortmem is the amount of working memory to use within each worker,
1674 * expressed in KBs.
1675 *
1676 * When this returns, workers are done, and need only release resources.
1677 */
1678 static void
1679 _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1680 BTShared *btshared, Sharedsort *sharedsort,
1681 Sharedsort *sharedsort2, int sortmem)
1682 {
1683 SortCoordinate coordinate;
1684 BTBuildState buildstate;
1685 HeapScanDesc scan;
1686 double reltuples;
1687 IndexInfo *indexInfo;
1688
1689 /* Initialize local tuplesort coordination state */
1690 coordinate = palloc0(sizeof(SortCoordinateData));
1691 coordinate->isWorker = true;
1692 coordinate->nParticipants = -1;
1693 coordinate->sharedsort = sharedsort;
1694
1695 /* Begin "partial" tuplesort */
1696 btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1697 btspool->index,
1698 btspool->isunique,
1699 sortmem, coordinate,
1700 false);
1701
1702 /*
1703 * Just as with the serial case, there may be a second spool. If so, a
1704 * second, dedicated spool2 partial tuplesort is required.
1705 */
1706 if (btspool2)
1707 {
1708 SortCoordinate coordinate2;
1709
1710 /*
1711 * We expect that the second one (for dead tuples) won't get very
1712 * full, so we give it only work_mem (unless sortmem is less for
1713 * worker). Worker processes are generally permitted to allocate
1714 * work_mem independently.
1715 */
1716 coordinate2 = palloc0(sizeof(SortCoordinateData));
1717 coordinate2->isWorker = true;
1718 coordinate2->nParticipants = -1;
1719 coordinate2->sharedsort = sharedsort2;
1720 btspool2->sortstate =
1721 tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
1722 Min(sortmem, work_mem), coordinate2,
1723 false);
1724 }
1725
1726 /* Fill in buildstate for _bt_build_callback() */
1727 buildstate.isunique = btshared->isunique;
1728 buildstate.havedead = false;
1729 buildstate.heap = btspool->heap;
1730 buildstate.spool = btspool;
1731 buildstate.spool2 = btspool2;
1732 buildstate.indtuples = 0;
1733 buildstate.btleader = NULL;
1734
1735 /* Join parallel scan */
1736 indexInfo = BuildIndexInfo(btspool->index);
1737 indexInfo->ii_Concurrent = btshared->isconcurrent;
1738 scan = heap_beginscan_parallel(btspool->heap, &btshared->heapdesc);
1739 reltuples = IndexBuildHeapScan(btspool->heap, btspool->index, indexInfo,
1740 true, _bt_build_callback,
1741 (void *) &buildstate, scan);
1742
1743 /*
1744 * Execute this worker's part of the sort.
1745 *
1746 * Unlike leader and serial cases, we cannot avoid calling
1747 * tuplesort_performsort() for spool2 if it ends up containing no dead
1748 * tuples (this is disallowed for workers by tuplesort).
1749 */
1750 tuplesort_performsort(btspool->sortstate);
1751 if (btspool2)
1752 tuplesort_performsort(btspool2->sortstate);
1753
1754 /*
1755 * Done. Record ambuild statistics, and whether we encountered a broken
1756 * HOT chain.
1757 */
1758 SpinLockAcquire(&btshared->mutex);
1759 btshared->nparticipantsdone++;
1760 btshared->reltuples += reltuples;
1761 if (buildstate.havedead)
1762 btshared->havedead = true;
1763 btshared->indtuples += buildstate.indtuples;
1764 if (indexInfo->ii_BrokenHotChain)
1765 btshared->brokenhotchain = true;
1766 SpinLockRelease(&btshared->mutex);
1767
1768 /* Notify leader */
1769 ConditionVariableSignal(&btshared->workersdonecv);
1770
1771 /* We can end tuplesorts immediately */
1772 tuplesort_end(btspool->sortstate);
1773 if (btspool2)
1774 tuplesort_end(btspool2->sortstate);
1775 }
1776