1 /*-------------------------------------------------------------------------
2 *
3 * nbtsort.c
4 * Build a btree from sorted input by loading leaf pages sequentially.
5 *
6 * NOTES
7 *
8 * We use tuplesort.c to sort the given index tuples into order.
9 * Then we scan the index tuples in order and build the btree pages
10 * for each level. We load source tuples into leaf-level pages.
11 * Whenever we fill a page at one level, we add a link to it to its
12 * parent level (starting a new parent level if necessary). When
13 * done, we write out each final page on each level, adding it to
14 * its parent level. When we have only one page on a level, it must be
15 * the root -- it can be attached to the btree metapage and we are done.
16 *
17 * It is not wise to pack the pages entirely full, since then *any*
18 * insertion would cause a split (and not only of the leaf page; the need
19 * for a split would cascade right up the tree). The steady-state load
20 * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 * pages to the user-controllable fill factor (default 90%) while upper pages
22 * are always packed to 70%. This gives us reasonable density (there aren't
23 * many upper pages if the keys are reasonable-size) without risking a lot of
24 * cascading splits during early insertions.
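 *
 * For example, with the default 8 kB block size, a leaf page built at the
 * default fill factor of 90 is treated as "full" once less than
 * BLCKSZ * 10 / 100 = 819 bytes remain free, while an upper page is treated
 * as full once less than BLCKSZ * 30 / 100 = 2457 bytes remain free (see
 * _bt_pagestate()).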
25 *
26 * Formerly the index pages being built were kept in shared buffers, but
27 * that is of no value (since other backends have no interest in them yet)
28 * and it created locking problems for CHECKPOINT, because the upper-level
29 * pages were held exclusive-locked for long periods. Now we just build
30 * the pages in local memory and smgrwrite or smgrextend them as we finish
31 * them. They will need to be re-read into shared buffers on first use after
32 * the build finishes.
33 *
34 * This code isn't concerned about the FSM at all. The caller is responsible
35 * for initializing that.
36 *
37 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
38 * Portions Copyright (c) 1994, Regents of the University of California
39 *
40 * IDENTIFICATION
41 * src/backend/access/nbtree/nbtsort.c
42 *
43 *-------------------------------------------------------------------------
44 */
45
46 #include "postgres.h"
47
48 #include "access/nbtree.h"
49 #include "access/parallel.h"
50 #include "access/relscan.h"
51 #include "access/table.h"
52 #include "access/xact.h"
53 #include "access/xlog.h"
54 #include "access/xloginsert.h"
55 #include "catalog/index.h"
56 #include "commands/progress.h"
57 #include "executor/instrument.h"
58 #include "miscadmin.h"
59 #include "pgstat.h"
60 #include "storage/smgr.h"
61 #include "tcop/tcopprot.h" /* pgrminclude ignore */
62 #include "utils/rel.h"
63 #include "utils/sortsupport.h"
64 #include "utils/tuplesort.h"
65
66
67 /* Magic numbers for parallel state sharing */
68 #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
69 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
70 #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
71 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
72 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
73 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
74
75 /*
76 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
77 * parallel index builds. This may be useful as a debugging aid.
78 #undef DISABLE_LEADER_PARTICIPATION
79 */
80
81 /*
82 * Status record for spooling/sorting phase. (Note we may have two of
83 * these due to the special requirements for uniqueness-checking with
84 * dead tuples.)
85 */
86 typedef struct BTSpool
87 {
88 Tuplesortstate *sortstate; /* state data for tuplesort.c */
89 Relation heap;
90 Relation index;
91 bool isunique;
92 } BTSpool;
93
94 /*
95 * Status for index builds performed in parallel. This is allocated in a
96 * dynamic shared memory segment. Note that there is a separate tuplesort TOC
97 * entry, private to tuplesort.c but allocated by this module on its behalf.
98 */
99 typedef struct BTShared
100 {
101 /*
102 * These fields are not modified during the sort. They primarily exist
103 * for the benefit of worker processes that need to create BTSpool state
104 * corresponding to that used by the leader.
105 */
106 Oid heaprelid;
107 Oid indexrelid;
108 bool isunique;
109 bool isconcurrent;
110 int scantuplesortstates;
111
112 /*
113 * workersdonecv is used to monitor the progress of workers. All parallel
114 * participants must indicate that they are done before leader can use
115 * mutable state that workers maintain during scan (and before leader can
116 * proceed to tuplesort_performsort()).
117 */
118 ConditionVariable workersdonecv;
119
120 /*
121 * mutex protects the mutable state fields that follow it.
122 *
123 * These fields contain status information of interest to B-Tree index
124 * builds that must work just the same when an index is built in parallel.
125 */
126 slock_t mutex;
127
128 /*
129 * Mutable state that is maintained by workers, and reported back to
130 * leader at end of parallel scan.
131 *
132 * nparticipantsdone is number of worker processes finished.
133 *
134 * reltuples is the total number of input heap tuples.
135 *
136 * havedead indicates if RECENTLY_DEAD tuples were encountered during
137 * build.
138 *
139 * indtuples is the total number of tuples that made it into the index.
140 *
141 * brokenhotchain indicates if any worker detected a broken HOT chain
142 * during build.
143 */
144 int nparticipantsdone;
145 double reltuples;
146 bool havedead;
147 double indtuples;
148 bool brokenhotchain;
149
150 /*
151 * ParallelTableScanDescData data follows. Can't directly embed here, as
152 * implementations of the parallel table scan desc interface might need
153 * stronger alignment.
154 */
155 } BTShared;
156
157 /*
158 * Return pointer to a BTShared's parallel table scan.
159 *
160 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
161 * MAXALIGN.
162 */
163 #define ParallelTableScanFromBTShared(shared) \
164 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
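
/*
 * Illustrative sketch only: a participant that has fetched the BTShared entry
 * from the shm_toc would typically join the parallel scan along the lines of
 *
 *		scan = table_beginscan_parallel(btspool->heap,
 *										ParallelTableScanFromBTShared(btshared));
 *
 * (variable names here are hypothetical).
 */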
165
166 /*
167 * Status for leader in parallel index build.
168 */
169 typedef struct BTLeader
170 {
171 /* parallel context itself */
172 ParallelContext *pcxt;
173
174 /*
175 * nparticipanttuplesorts is the exact number of worker processes
176 * successfully launched, plus one leader process if it participates as a
177 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
178 * participating as a worker).
179 */
180 int nparticipanttuplesorts;
181
182 /*
183 * Leader process convenience pointers to shared state (leader avoids TOC
184 * lookups).
185 *
186 * btshared is the shared state for entire build. sharedsort is the
187 * shared, tuplesort-managed state passed to each process tuplesort.
188 * sharedsort2 is the corresponding btspool2 shared state, used only when
189 * building unique indexes. snapshot is the snapshot used by the scan iff
190 * an MVCC snapshot is required.
191 */
192 BTShared *btshared;
193 Sharedsort *sharedsort;
194 Sharedsort *sharedsort2;
195 Snapshot snapshot;
196 WalUsage *walusage;
197 BufferUsage *bufferusage;
198 } BTLeader;
199
200 /*
201 * Working state for btbuild and its callback.
202 *
203 * When parallel CREATE INDEX is used, there is a BTBuildState for each
204 * participant.
205 */
206 typedef struct BTBuildState
207 {
208 bool isunique;
209 bool havedead;
210 Relation heap;
211 BTSpool *spool;
212
213 /*
214 * spool2 is needed only when the index is a unique index. Dead tuples are
215 * put into spool2 instead of spool in order to avoid the uniqueness check.
216 */
217 BTSpool *spool2;
218 double indtuples;
219
220 /*
221 * btleader is only present when a parallel index build is performed, and
222 * only in the leader process. (Actually, only the leader has a
223 * BTBuildState. Workers have their own spool and spool2, though.)
224 */
225 BTLeader *btleader;
226 } BTBuildState;
227
228 /*
229 * Status record for a btree page being built. We have one of these
230 * for each active tree level.
231 */
232 typedef struct BTPageState
233 {
234 Page btps_page; /* workspace for page building */
235 BlockNumber btps_blkno; /* block # to write this page at */
236 IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
237 OffsetNumber btps_lastoff; /* last item offset loaded */
238 Size btps_lastextra; /* last item's extra posting list space */
239 uint32 btps_level; /* tree level (0 = leaf) */
240 Size btps_full; /* "full" if less than this much free space */
241 struct BTPageState *btps_next; /* link to parent level, if any */
242 } BTPageState;
243
244 /*
245 * Overall status record for index writing phase.
246 */
247 typedef struct BTWriteState
248 {
249 Relation heap;
250 Relation index;
251 BTScanInsert inskey; /* generic insertion scankey */
252 bool btws_use_wal; /* dump pages to WAL? */
253 BlockNumber btws_pages_alloced; /* # pages allocated */
254 BlockNumber btws_pages_written; /* # pages written out */
255 Page btws_zeropage; /* workspace for filling zeroes */
256 } BTWriteState;
257
258
259 static double _bt_spools_heapscan(Relation heap, Relation index,
260 BTBuildState *buildstate, IndexInfo *indexInfo);
261 static void _bt_spooldestroy(BTSpool *btspool);
262 static void _bt_spool(BTSpool *btspool, ItemPointer self,
263 Datum *values, bool *isnull);
264 static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
265 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
266 bool *isnull, bool tupleIsAlive, void *state);
267 static Page _bt_blnewpage(uint32 level);
268 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
269 static void _bt_slideleft(Page rightmostpage);
270 static void _bt_sortaddtup(Page page, Size itemsize,
271 IndexTuple itup, OffsetNumber itup_off,
272 bool newfirstdataitem);
273 static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
274 IndexTuple itup, Size truncextra);
275 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
276 BTPageState *state,
277 BTDedupState dstate);
278 static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
279 static void _bt_load(BTWriteState *wstate,
280 BTSpool *btspool, BTSpool *btspool2);
281 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
282 int request);
283 static void _bt_end_parallel(BTLeader *btleader);
284 static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
285 static double _bt_parallel_heapscan(BTBuildState *buildstate,
286 bool *brokenhotchain);
287 static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
288 static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
289 BTShared *btshared, Sharedsort *sharedsort,
290 Sharedsort *sharedsort2, int sortmem,
291 bool progress);
292
293
294 /*
295 * btbuild() -- build a new btree index.
296 */
297 IndexBuildResult *
298 btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
299 {
300 IndexBuildResult *result;
301 BTBuildState buildstate;
302 double reltuples;
303
304 #ifdef BTREE_BUILD_STATS
305 if (log_btree_build_stats)
306 ResetUsage();
307 #endif /* BTREE_BUILD_STATS */
308
309 buildstate.isunique = indexInfo->ii_Unique;
310 buildstate.havedead = false;
311 buildstate.heap = heap;
312 buildstate.spool = NULL;
313 buildstate.spool2 = NULL;
314 buildstate.indtuples = 0;
315 buildstate.btleader = NULL;
316
317 /*
318 * We expect to be called exactly once for any index relation. If that's
319 * not the case, big trouble's what we have.
320 */
321 if (RelationGetNumberOfBlocks(index) != 0)
322 elog(ERROR, "index \"%s\" already contains data",
323 RelationGetRelationName(index));
324
325 reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
326
327 /*
328 * Finish the build by (1) completing the sort of the spool file, (2)
329 * inserting the sorted tuples into btree pages and (3) building the upper
330 * levels. Finally, it may also be necessary to end use of parallelism.
331 */
332 _bt_leafbuild(buildstate.spool, buildstate.spool2);
333 _bt_spooldestroy(buildstate.spool);
334 if (buildstate.spool2)
335 _bt_spooldestroy(buildstate.spool2);
336 if (buildstate.btleader)
337 _bt_end_parallel(buildstate.btleader);
338
339 result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
340
341 result->heap_tuples = reltuples;
342 result->index_tuples = buildstate.indtuples;
343
344 #ifdef BTREE_BUILD_STATS
345 if (log_btree_build_stats)
346 {
347 ShowUsage("BTREE BUILD STATS");
348 ResetUsage();
349 }
350 #endif /* BTREE_BUILD_STATS */
351
352 return result;
353 }
354
355 /*
356 * Create and initialize one or two spool structures, and save them in caller's
357 * buildstate argument. May also fill in fields within indexInfo used by index
358 * builds.
359 *
360 * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
361 * routine encapsulates all aspects of managing parallelism. Caller need only
362 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
363 *
364 * Returns the total number of heap tuples scanned.
365 */
366 static double
367 _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
368 IndexInfo *indexInfo)
369 {
370 BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
371 SortCoordinate coordinate = NULL;
372 double reltuples = 0;
373
374 /*
375 * We size the sort area as maintenance_work_mem rather than work_mem to
376 * speed index creation. This should be OK since a single backend can't
377 * run multiple index creations in parallel (see also: notes on
378 * parallelism and maintenance_work_mem below).
379 */
380 btspool->heap = heap;
381 btspool->index = index;
382 btspool->isunique = indexInfo->ii_Unique;
383
384 /* Save as primary spool */
385 buildstate->spool = btspool;
386
387 /* Report table scan phase started */
388 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
389 PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
390
391 /* Attempt to launch parallel worker scan when required */
392 if (indexInfo->ii_ParallelWorkers > 0)
393 _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
394 indexInfo->ii_ParallelWorkers);
395
396 /*
397 * If parallel build requested and at least one worker process was
398 * successfully launched, set up coordination state
399 */
400 if (buildstate->btleader)
401 {
402 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
403 coordinate->isWorker = false;
404 coordinate->nParticipants =
405 buildstate->btleader->nparticipanttuplesorts;
406 coordinate->sharedsort = buildstate->btleader->sharedsort;
407 }
408
409 /*
410 * Begin serial/leader tuplesort.
411 *
412 * In cases where parallelism is involved, the leader receives the same
413 * share of maintenance_work_mem as a serial sort (it is generally treated
414 * in the same way as a serial sort once we return). Parallel worker
415 * Tuplesortstates will have received only a fraction of
416 * maintenance_work_mem, though.
417 *
418 * We rely on the lifetime of the Leader Tuplesortstate almost not
419 * overlapping with any worker Tuplesortstate's lifetime. There may be
420 * some small overlap, but that's okay because we rely on leader
421 * Tuplesortstate only allocating a small, fixed amount of memory here.
422 * When its tuplesort_performsort() is called (by our caller), and
423 * significant amounts of memory are likely to be used, all workers must
424 * have already freed almost all memory held by their Tuplesortstates
425 * (they are about to go away completely, too). The overall effect is
426 * that maintenance_work_mem always represents an absolute high watermark
427 * on the amount of memory used by a CREATE INDEX operation, regardless of
428 * the use of parallelism or any other factor.
429 */
430 buildstate->spool->sortstate =
431 tuplesort_begin_index_btree(heap, index, buildstate->isunique,
432 maintenance_work_mem, coordinate,
433 false);
434
435 /*
436 * If building a unique index, put dead tuples in a second spool to keep
437 * them out of the uniqueness check. We expect that the second spool (for
438 * dead tuples) won't get very full, so we give it only work_mem.
439 */
440 if (indexInfo->ii_Unique)
441 {
442 BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
443 SortCoordinate coordinate2 = NULL;
444
445 /* Initialize secondary spool */
446 btspool2->heap = heap;
447 btspool2->index = index;
448 btspool2->isunique = false;
449 /* Save as secondary spool */
450 buildstate->spool2 = btspool2;
451
452 if (buildstate->btleader)
453 {
454 /*
455 * Set up non-private state that is passed to
456 * tuplesort_begin_index_btree() about the basic high level
457 * coordination of a parallel sort.
458 */
459 coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
460 coordinate2->isWorker = false;
461 coordinate2->nParticipants =
462 buildstate->btleader->nparticipanttuplesorts;
463 coordinate2->sharedsort = buildstate->btleader->sharedsort2;
464 }
465
466 /*
467 * We expect that the second one (for dead tuples) won't get very
468 * full, so we give it only work_mem
469 */
470 buildstate->spool2->sortstate =
471 tuplesort_begin_index_btree(heap, index, false, work_mem,
472 coordinate2, false);
473 }
474
475 /* Fill spool using either serial or parallel heap scan */
476 if (!buildstate->btleader)
477 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
478 _bt_build_callback, (void *) buildstate,
479 NULL);
480 else
481 reltuples = _bt_parallel_heapscan(buildstate,
482 &indexInfo->ii_BrokenHotChain);
483
484 /*
485 * Set the progress target for the next phase. Reset the block number
486 * values set by table_index_build_scan
487 */
488 {
489 const int progress_index[] = {
490 PROGRESS_CREATEIDX_TUPLES_TOTAL,
491 PROGRESS_SCAN_BLOCKS_TOTAL,
492 PROGRESS_SCAN_BLOCKS_DONE
493 };
494 const int64 progress_vals[] = {
495 buildstate->indtuples,
496 0, 0
497 };
498
499 pgstat_progress_update_multi_param(3, progress_index, progress_vals);
500 }
501
502 /* okay, all heap tuples are spooled */
503 if (buildstate->spool2 && !buildstate->havedead)
504 {
505 /* spool2 turns out to be unnecessary */
506 _bt_spooldestroy(buildstate->spool2);
507 buildstate->spool2 = NULL;
508 }
509
510 return reltuples;
511 }
512
513 /*
514 * clean up a spool structure and its substructures.
515 */
516 static void
517 _bt_spooldestroy(BTSpool *btspool)
518 {
519 tuplesort_end(btspool->sortstate);
520 pfree(btspool);
521 }
522
523 /*
524 * spool an index entry into the sort file.
525 */
526 static void
527 _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
528 {
529 tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
530 self, values, isnull);
531 }
532
533 /*
534 * given a spool loaded by successive calls to _bt_spool,
535 * create an entire btree.
536 */
537 static void
538 _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
539 {
540 BTWriteState wstate;
541
542 #ifdef BTREE_BUILD_STATS
543 if (log_btree_build_stats)
544 {
545 ShowUsage("BTREE BUILD (Spool) STATISTICS");
546 ResetUsage();
547 }
548 #endif /* BTREE_BUILD_STATS */
549
550 /* Execute the sort */
551 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
552 PROGRESS_BTREE_PHASE_PERFORMSORT_1);
553 tuplesort_performsort(btspool->sortstate);
554 if (btspool2)
555 {
556 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
557 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
558 tuplesort_performsort(btspool2->sortstate);
559 }
560
561 wstate.heap = btspool->heap;
562 wstate.index = btspool->index;
563 wstate.inskey = _bt_mkscankey(wstate.index, NULL);
564 /* _bt_mkscankey() won't set allequalimage without metapage */
565 wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
566 wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
567
568 /* reserve the metapage */
569 wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
570 wstate.btws_pages_written = 0;
571 wstate.btws_zeropage = NULL; /* until needed */
572
573 pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
574 PROGRESS_BTREE_PHASE_LEAF_LOAD);
575 _bt_load(&wstate, btspool, btspool2);
576 }
577
578 /*
579 * Per-tuple callback for table_index_build_scan
580 */
581 static void
582 _bt_build_callback(Relation index,
583 ItemPointer tid,
584 Datum *values,
585 bool *isnull,
586 bool tupleIsAlive,
587 void *state)
588 {
589 BTBuildState *buildstate = (BTBuildState *) state;
590
591 /*
592 * insert the index tuple into the appropriate spool file for subsequent
593 * processing
594 */
595 if (tupleIsAlive || buildstate->spool2 == NULL)
596 _bt_spool(buildstate->spool, tid, values, isnull);
597 else
598 {
599 /* dead tuples are put into spool2 */
600 buildstate->havedead = true;
601 _bt_spool(buildstate->spool2, tid, values, isnull);
602 }
603
604 buildstate->indtuples += 1;
605 }
606
607 /*
608 * allocate workspace for a new, clean btree page, not linked to any siblings.
609 */
610 static Page
611 _bt_blnewpage(uint32 level)
612 {
613 Page page;
614 BTPageOpaque opaque;
615
616 page = (Page) palloc(BLCKSZ);
617
618 /* Zero the page and set up standard page header info */
619 _bt_pageinit(page, BLCKSZ);
620
621 /* Initialize BT opaque state */
622 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
623 opaque->btpo_prev = opaque->btpo_next = P_NONE;
624 opaque->btpo_level = level;
625 opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
626 opaque->btpo_cycleid = 0;
627
628 /* Make the P_HIKEY line pointer appear allocated */
629 ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
630
631 return page;
632 }
633
634 /*
635 * emit a completed btree page, and release the working storage.
636 */
637 static void
638 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
639 {
640 /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
641 RelationOpenSmgr(wstate->index);
642
643 /* XLOG stuff */
644 if (wstate->btws_use_wal)
645 {
646 /* We use the XLOG_FPI record type for this */
647 log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
648 }
649
650 /*
651 * If we have to write pages nonsequentially, fill in the space with
652 * zeroes until we come back and overwrite. This is not logically
653 * necessary on standard Unix filesystems (unwritten space will read as
654 * zeroes anyway), but it should help to avoid fragmentation. The dummy
655 * pages aren't WAL-logged though.
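 *
 * (The metapage is the usual case: block 0 is reserved up front but only
 * written at the very end by _bt_uppershutdown(), so it gets zero-filled
 * here the first time a later block is written, and is overwritten via
 * smgrwrite() afterwards.)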
656 */
657 while (blkno > wstate->btws_pages_written)
658 {
659 if (!wstate->btws_zeropage)
660 wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
661 /* don't set checksum for all-zero page */
662 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
663 wstate->btws_pages_written++,
664 (char *) wstate->btws_zeropage,
665 true);
666 }
667
668 PageSetChecksumInplace(page, blkno);
669
670 /*
671 * Now write the page. There's no need for smgr to schedule an fsync for
672 * this write; we'll do it ourselves before ending the build.
673 */
674 if (blkno == wstate->btws_pages_written)
675 {
676 /* extending the file... */
677 smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
678 (char *) page, true);
679 wstate->btws_pages_written++;
680 }
681 else
682 {
683 /* overwriting a block we zero-filled before */
684 smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
685 (char *) page, true);
686 }
687
688 pfree(page);
689 }
690
691 /*
692 * allocate and initialize a new BTPageState. the returned structure
693 * is suitable for immediate use by _bt_buildadd.
694 */
695 static BTPageState *
696 _bt_pagestate(BTWriteState *wstate, uint32 level)
697 {
698 BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
699
700 /* create initial page for level */
701 state->btps_page = _bt_blnewpage(level);
702
703 /* and assign it a page position */
704 state->btps_blkno = wstate->btws_pages_alloced++;
705
706 state->btps_lowkey = NULL;
707 /* initialize lastoff so first item goes into P_FIRSTKEY */
708 state->btps_lastoff = P_HIKEY;
709 state->btps_lastextra = 0;
710 state->btps_level = level;
711 /* set "full" threshold based on level. See notes at head of file. */
712 if (level > 0)
713 state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
714 else
715 state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
716
717 /* no parent level, yet */
718 state->btps_next = NULL;
719
720 return state;
721 }
722
723 /*
724 * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
725 * P_HIKEY, overwriting P_HIKEY).
726 *
727 * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
728 * rightmost page on its level is not supposed to get a high key. Now that
729 * it's clear that this page is a rightmost page, remove the unneeded empty
730 * P_HIKEY line pointer space.
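 *
 * For example, if the rightmost page's line pointer array is
 * (P_HIKEY, P_FIRSTKEY, ..., maxoff), with P_HIKEY being the unused
 * placeholder, each ItemId is copied one slot to the left so that the first
 * data item lands at P_HIKEY, and pd_lower is then reduced by one ItemIdData
 * to drop the now-redundant last slot.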
731 */
732 static void
733 _bt_slideleft(Page rightmostpage)
734 {
735 OffsetNumber off;
736 OffsetNumber maxoff;
737 ItemId previi;
738
739 maxoff = PageGetMaxOffsetNumber(rightmostpage);
740 Assert(maxoff >= P_FIRSTKEY);
741 previi = PageGetItemId(rightmostpage, P_HIKEY);
742 for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
743 {
744 ItemId thisii = PageGetItemId(rightmostpage, off);
745
746 *previi = *thisii;
747 previi = thisii;
748 }
749 ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
750 }
751
752 /*
753 * Add an item to a page being built.
754 *
755 * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
756 * raises an error directly.
757 *
758 * Note that our nbtsort.c caller does not know yet if the page will be
759 * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
760 * caller. Page that turns out to be the rightmost on its level is fixed by
761 * calling _bt_slideleft().
762 */
763 static void
764 _bt_sortaddtup(Page page,
765 Size itemsize,
766 IndexTuple itup,
767 OffsetNumber itup_off,
768 bool newfirstdataitem)
769 {
770 IndexTupleData trunctuple;
771
772 if (newfirstdataitem)
773 {
774 trunctuple = *itup;
775 trunctuple.t_info = sizeof(IndexTupleData);
776 BTreeTupleSetNAtts(&trunctuple, 0, false);
777 itup = &trunctuple;
778 itemsize = sizeof(IndexTupleData);
779 }
780
781 if (PageAddItem(page, (Item) itup, itemsize, itup_off,
782 false, false) == InvalidOffsetNumber)
783 elog(ERROR, "failed to add item to the index page");
784 }
785
786 /*----------
787 * Add an item to a disk page from the sort output (or add a posting list
788 * item formed from the sort output).
789 *
790 * We must be careful to observe the page layout conventions of nbtsearch.c:
791 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
792 * - on non-leaf pages, the key portion of the first item need not be
793 * stored, we should store only the link.
794 *
795 * A leaf page being built looks like:
796 *
797 * +----------------+---------------------------------+
798 * | PageHeaderData | linp0 linp1 linp2 ... |
799 * +-----------+----+---------------------------------+
800 * | ... linpN | |
801 * +-----------+--------------------------------------+
802 * | ^ last |
803 * | |
804 * +-------------+------------------------------------+
805 * | | itemN ... |
806 * +-------------+------------------+-----------------+
807 * | ... item3 item2 item1 | "special space" |
808 * +--------------------------------+-----------------+
809 *
810 * Contrast this with the diagram in bufpage.h; note the mismatch
811 * between linps and items. This is because we reserve linp0 as a
812 * placeholder for the pointer to the "high key" item; when we have
813 * filled up the page, we will set linp0 to point to itemN and clear
814 * linpN. On the other hand, if we find this is the last (rightmost)
815 * page, we leave the items alone and slide the linp array over. If
816 * the high key is to be truncated, offset 1 is deleted, and we insert
817 * the truncated high key at offset 1.
818 *
819 * 'last' pointer indicates the last offset added to the page.
820 *
821 * 'truncextra' is the size of the posting list in itup, if any. This
822 * information is stashed for the next call here, when we may benefit
823 * from considering the impact of truncating away the posting list on
824 * the page before deciding to finish the page off. Posting lists are
825 * often relatively large, so it is worth going to the trouble of
826 * accounting for the saving from truncating away the posting list of
827 * the tuple that becomes the high key (that may be the only way to
828 * get close to target free space on the page). Note that this is
829 * only used for the soft fillfactor-wise limit, not the critical hard
830 * limit.
831 *----------
832 */
833 static void
834 _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
835 Size truncextra)
836 {
837 Page npage;
838 BlockNumber nblkno;
839 OffsetNumber last_off;
840 Size last_truncextra;
841 Size pgspc;
842 Size itupsz;
843 bool isleaf;
844
845 /*
846 * This is a handy place to check for cancel interrupts during the btree
847 * load phase of index creation.
848 */
849 CHECK_FOR_INTERRUPTS();
850
851 npage = state->btps_page;
852 nblkno = state->btps_blkno;
853 last_off = state->btps_lastoff;
854 last_truncextra = state->btps_lastextra;
855 state->btps_lastextra = truncextra;
856
857 pgspc = PageGetFreeSpace(npage);
858 itupsz = IndexTupleSize(itup);
859 itupsz = MAXALIGN(itupsz);
860 /* Leaf case has slightly different rules due to suffix truncation */
861 isleaf = (state->btps_level == 0);
862
863 /*
864 * Check whether the new item can fit on a btree page on current level at
865 * all.
866 *
867 * Every newly built index will treat heap TID as part of the keyspace,
868 * which imposes the requirement that new high keys must occasionally have
869 * a heap TID appended within _bt_truncate(). That may leave a new pivot
870 * tuple one or two MAXALIGN() quantums larger than the original
871 * firstright tuple it's derived from. v4 deals with the problem by
872 * decreasing the limit on the size of tuples inserted on the leaf level
873 * by the same small amount. Enforce the new v4+ limit on the leaf level,
874 * and the old limit on internal levels, since pivot tuples may need to
875 * make use of the reserved space. This should never fail on internal
876 * pages.
877 */
878 if (unlikely(itupsz > BTMaxItemSize(npage)))
879 _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
880 itup);
881
882 /*
883 * Check to see if current page will fit new item, with space left over to
884 * append a heap TID during suffix truncation when page is a leaf page.
885 *
886 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
887 * high key with heap TID when finishing off a leaf page, since we rely on
888 * _bt_check_third_page() rejecting oversized non-pivot tuples. On
889 * internal pages we can always fit 3 pivot tuples with larger internal
890 * page tuple limit (includes page high key).
891 *
892 * Most of the time, a page is only "full" in the sense that the soft
893 * fillfactor-wise limit has been exceeded. However, we must always leave
894 * at least two items plus a high key on each page before starting a new
895 * page. Disregard fillfactor and insert on "full" current page if we
896 * don't have the minimum number of items yet. (Note that we deliberately
897 * assume that suffix truncation neither enlarges nor shrinks new high key
898 * when applying soft limit, except when last tuple has a posting list.)
899 */
900 Assert(last_truncextra == 0 || isleaf);
901 if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
902 (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
903 {
904 /*
905 * Finish off the page and write it out.
906 */
907 Page opage = npage;
908 BlockNumber oblkno = nblkno;
909 ItemId ii;
910 ItemId hii;
911 IndexTuple oitup;
912
913 /* Create new page of same level */
914 npage = _bt_blnewpage(state->btps_level);
915
916 /* and assign it a page position */
917 nblkno = wstate->btws_pages_alloced++;
918
919 /*
920 * We copy the last item on the page into the new page, and then
921 * rearrange the old page so that the 'last item' becomes its high key
922 * rather than a true data item. There had better be at least two
923 * items on the page already, else the page would be empty of useful
924 * data.
925 */
926 Assert(last_off > P_FIRSTKEY);
927 ii = PageGetItemId(opage, last_off);
928 oitup = (IndexTuple) PageGetItem(opage, ii);
929 _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
930 !isleaf);
931
932 /*
933 * Move 'last' into the high key position on opage. _bt_blnewpage()
934 * allocated empty space for a line pointer when opage was first
935 * created, so this is a matter of rearranging already-allocated space
936 * on page, and initializing high key line pointer. (Actually, leaf
937 * pages must also swap oitup with a truncated version of oitup, which
938 * is sometimes larger than oitup, though never by more than the space
939 * needed to append a heap TID.)
940 */
941 hii = PageGetItemId(opage, P_HIKEY);
942 *hii = *ii;
943 ItemIdSetUnused(ii); /* redundant */
944 ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
945
946 if (isleaf)
947 {
948 IndexTuple lastleft;
949 IndexTuple truncated;
950
951 /*
952 * Truncate away any unneeded attributes from high key on leaf
953 * level. This is only done at the leaf level because downlinks
954 * in internal pages are either negative infinity items, or get
955 * their contents from copying from one level down. See also:
956 * _bt_split().
957 *
958 * We don't try to bias our choice of split point to make it more
959 * likely that _bt_truncate() can truncate away more attributes,
960 * whereas the split point used within _bt_split() is chosen much
961 * more delicately. Even still, the lastleft and firstright
962 * tuples passed to _bt_truncate() here are at least not fully
963 * equal to each other when deduplication is used, unless there is
964 * a large group of duplicates (also, unique index builds usually
965 * have few or no spool2 duplicates). When the split point is
966 * between two unequal tuples, _bt_truncate() will avoid including
967 * a heap TID in the new high key, which is the most important
968 * benefit of suffix truncation.
969 *
970 * Overwrite the old item with new truncated high key directly.
971 * oitup is already located at the physical beginning of tuple
972 * space, so this should directly reuse the existing tuple space.
973 */
974 ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
975 lastleft = (IndexTuple) PageGetItem(opage, ii);
976
977 Assert(IndexTupleSize(oitup) > last_truncextra);
978 truncated = _bt_truncate(wstate->index, lastleft, oitup,
979 wstate->inskey);
980 if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated,
981 IndexTupleSize(truncated)))
982 elog(ERROR, "failed to add high key to the index page");
983 pfree(truncated);
984
985 /* oitup should continue to point to the page's high key */
986 hii = PageGetItemId(opage, P_HIKEY);
987 oitup = (IndexTuple) PageGetItem(opage, hii);
988 }
989
990 /*
991 * Link the old page into its parent, using its low key. If we don't
992 * have a parent, we have to create one; this adds a new btree level.
993 */
994 if (state->btps_next == NULL)
995 state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
996
997 Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
998 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
999 BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
1000 P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1001 Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
1002 !P_LEFTMOST((BTPageOpaque) PageGetSpecialPointer(opage)));
1003 BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
1004 _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
1005 pfree(state->btps_lowkey);
1006
1007 /*
1008 * Save a copy of the high key from the old page. It is also the low
1009 * key for the new page.
1010 */
1011 state->btps_lowkey = CopyIndexTuple(oitup);
1012
1013 /*
1014 * Set the sibling links for both pages.
1015 */
1016 {
1017 BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
1018 BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);
1019
1020 oopaque->btpo_next = nblkno;
1021 nopaque->btpo_prev = oblkno;
1022 nopaque->btpo_next = P_NONE; /* redundant */
1023 }
1024
1025 /*
1026 * Write out the old page. We never need to touch it again, so we can
1027 * free the opage workspace too.
1028 */
1029 _bt_blwritepage(wstate, opage, oblkno);
1030
1031 /*
1032 * Reset last_off to point to new page
1033 */
1034 last_off = P_FIRSTKEY;
1035 }
1036
1037 /*
1038 * By here, either original page is still the current page, or a new page
1039 * was created that became the current page. Either way, the current page
1040 * definitely has space for new item.
1041 *
1042 * If the new item is the first for its page, it must also be the first
1043 * item on its entire level. On later same-level pages, a low key for a
1044 * page will be copied from the prior page in the code above. Generate a
1045 * minus infinity low key here instead.
1046 */
1047 if (last_off == P_HIKEY)
1048 {
1049 Assert(state->btps_lowkey == NULL);
1050 state->btps_lowkey = palloc0(sizeof(IndexTupleData));
1051 state->btps_lowkey->t_info = sizeof(IndexTupleData);
1052 BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1053 }
1054
1055 /*
1056 * Add the new item into the current page.
1057 */
1058 last_off = OffsetNumberNext(last_off);
1059 _bt_sortaddtup(npage, itupsz, itup, last_off,
1060 !isleaf && last_off == P_FIRSTKEY);
1061
1062 state->btps_page = npage;
1063 state->btps_blkno = nblkno;
1064 state->btps_lastoff = last_off;
1065 }
1066
1067 /*
1068 * Finalize pending posting list tuple, and add it to the index. Final tuple
1069 * is based on saved base tuple, and saved list of heap TIDs.
1070 *
1071 * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1072 * using _bt_buildadd().
1073 */
1074 static void
1075 _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1076 BTDedupState dstate)
1077 {
1078 Assert(dstate->nitems > 0);
1079
1080 if (dstate->nitems == 1)
1081 _bt_buildadd(wstate, state, dstate->base, 0);
1082 else
1083 {
1084 IndexTuple postingtuple;
1085 Size truncextra;
1086
1087 /* form a tuple with a posting list */
1088 postingtuple = _bt_form_posting(dstate->base,
1089 dstate->htids,
1090 dstate->nhtids);
1091 /* Calculate posting list overhead */
1092 truncextra = IndexTupleSize(postingtuple) -
1093 BTreeTupleGetPostingOffset(postingtuple);
1094
1095 _bt_buildadd(wstate, state, postingtuple, truncextra);
1096 pfree(postingtuple);
1097 }
1098
1099 dstate->nmaxitems = 0;
1100 dstate->nhtids = 0;
1101 dstate->nitems = 0;
1102 dstate->phystupsize = 0;
1103 }
1104
1105 /*
1106 * Finish writing out the completed btree.
1107 */
1108 static void
1109 _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1110 {
1111 BTPageState *s;
1112 BlockNumber rootblkno = P_NONE;
1113 uint32 rootlevel = 0;
1114 Page metapage;
1115
1116 /*
1117 * Each iteration of this loop completes one more level of the tree.
1118 */
1119 for (s = state; s != NULL; s = s->btps_next)
1120 {
1121 BlockNumber blkno;
1122 BTPageOpaque opaque;
1123
1124 blkno = s->btps_blkno;
1125 opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);
1126
1127 /*
1128 * We have to link the last page on this level to somewhere.
1129 *
1130 * If we're at the top, it's the root, so attach it to the metapage.
1131 * Otherwise, add an entry for it to its parent using its low key.
1132 * This may cause the last page of the parent level to split, but
1133 * that's not a problem -- we haven't gotten to it yet.
1134 */
1135 if (s->btps_next == NULL)
1136 {
1137 opaque->btpo_flags |= BTP_ROOT;
1138 rootblkno = blkno;
1139 rootlevel = s->btps_level;
1140 }
1141 else
1142 {
1143 Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
1144 IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1145 BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1146 P_LEFTMOST(opaque));
1147 Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
1148 !P_LEFTMOST(opaque));
1149 BTreeTupleSetDownLink(s->btps_lowkey, blkno);
1150 _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
1151 pfree(s->btps_lowkey);
1152 s->btps_lowkey = NULL;
1153 }
1154
1155 /*
1156 * This is the rightmost page, so the ItemId array needs to be slid
1157 * back one slot. Then we can dump out the page.
1158 */
1159 _bt_slideleft(s->btps_page);
1160 _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
1161 s->btps_page = NULL; /* writepage freed the workspace */
1162 }
1163
1164 /*
1165 * As the last step in the process, construct the metapage and make it
1166 * point to the new root (unless we had no data at all, in which case it's
1167 * set to point to "P_NONE"). This changes the index to the "valid" state
1168 * by filling in a valid magic number in the metapage.
1169 */
1170 metapage = (Page) palloc(BLCKSZ);
1171 _bt_initmetapage(metapage, rootblkno, rootlevel,
1172 wstate->inskey->allequalimage);
1173 _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
1174 }
1175
1176 /*
1177 * Read tuples in correct sort order from tuplesort, and load them into
1178 * btree leaves.
1179 */
1180 static void
1181 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1182 {
1183 BTPageState *state = NULL;
1184 bool merge = (btspool2 != NULL);
1185 IndexTuple itup,
1186 itup2 = NULL;
1187 bool load1;
1188 TupleDesc tupdes = RelationGetDescr(wstate->index);
1189 int i,
1190 keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1191 SortSupport sortKeys;
1192 int64 tuples_done = 0;
1193 bool deduplicate;
1194
1195 deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
1196 BTGetDeduplicateItems(wstate->index);
1197
1198 if (merge)
1199 {
1200 /*
1201 * Another BTSpool for dead tuples exists. Now we have to merge
1202 * btspool and btspool2.
1203 */
1204
1205 /* the preparation of merge */
1206 itup = tuplesort_getindextuple(btspool->sortstate, true);
1207 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1208
1209 /* Prepare SortSupport data for each column */
1210 sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));
1211
1212 for (i = 0; i < keysz; i++)
1213 {
1214 SortSupport sortKey = sortKeys + i;
1215 ScanKey scanKey = wstate->inskey->scankeys + i;
1216 int16 strategy;
1217
1218 sortKey->ssup_cxt = CurrentMemoryContext;
1219 sortKey->ssup_collation = scanKey->sk_collation;
1220 sortKey->ssup_nulls_first =
1221 (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1222 sortKey->ssup_attno = scanKey->sk_attno;
1223 /* Abbreviation is not supported here */
1224 sortKey->abbreviate = false;
1225
1226 AssertState(sortKey->ssup_attno != 0);
1227
1228 strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
1229 BTGreaterStrategyNumber : BTLessStrategyNumber;
1230
1231 PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
1232 }
1233
1234 for (;;)
1235 {
1236 load1 = true; /* load BTSpool next ? */
1237 if (itup2 == NULL)
1238 {
1239 if (itup == NULL)
1240 break;
1241 }
1242 else if (itup != NULL)
1243 {
1244 int32 compare = 0;
1245
1246 for (i = 1; i <= keysz; i++)
1247 {
1248 SortSupport entry;
1249 Datum attrDatum1,
1250 attrDatum2;
1251 bool isNull1,
1252 isNull2;
1253
1254 entry = sortKeys + i - 1;
1255 attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1256 attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1257
1258 compare = ApplySortComparator(attrDatum1, isNull1,
1259 attrDatum2, isNull2,
1260 entry);
1261 if (compare > 0)
1262 {
1263 load1 = false;
1264 break;
1265 }
1266 else if (compare < 0)
1267 break;
1268 }
1269
1270 /*
1271 * If key values are equal, we sort on ItemPointer. This is
1272 * required for btree indexes, since heap TID is treated as an
1273 * implicit last key attribute in order to ensure that all
1274 * keys in the index are physically unique.
1275 */
1276 if (compare == 0)
1277 {
1278 compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1279 Assert(compare != 0);
1280 if (compare > 0)
1281 load1 = false;
1282 }
1283 }
1284 else
1285 load1 = false;
1286
1287 /* When we see first tuple, create first index page */
1288 if (state == NULL)
1289 state = _bt_pagestate(wstate, 0);
1290
1291 if (load1)
1292 {
1293 _bt_buildadd(wstate, state, itup, 0);
1294 itup = tuplesort_getindextuple(btspool->sortstate, true);
1295 }
1296 else
1297 {
1298 _bt_buildadd(wstate, state, itup2, 0);
1299 itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1300 }
1301
1302 /* Report progress */
1303 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1304 ++tuples_done);
1305 }
1306 pfree(sortKeys);
1307 }
1308 else if (deduplicate)
1309 {
1310 /* merge is unnecessary, deduplicate into posting lists */
1311 BTDedupState dstate;
1312
1313 dstate = (BTDedupState) palloc(sizeof(BTDedupStateData));
1314 dstate->deduplicate = true; /* unused */
1315 dstate->nmaxitems = 0; /* unused */
1316 dstate->maxpostingsize = 0; /* set later */
1317 /* Metadata about base tuple of current pending posting list */
1318 dstate->base = NULL;
1319 dstate->baseoff = InvalidOffsetNumber; /* unused */
1320 dstate->basetupsize = 0;
1321 /* Metadata about current pending posting list TIDs */
1322 dstate->htids = NULL;
1323 dstate->nhtids = 0;
1324 dstate->nitems = 0;
1325 dstate->phystupsize = 0; /* unused */
1326 dstate->nintervals = 0; /* unused */
1327
1328 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1329 true)) != NULL)
1330 {
1331 /* When we see first tuple, create first index page */
1332 if (state == NULL)
1333 {
1334 state = _bt_pagestate(wstate, 0);
1335
1336 /*
1337 * Limit size of posting list tuples to 1/10 space we want to
1338 * leave behind on the page, plus space for final item's line
1339 * pointer. This is equal to the space that we'd like to
1340 * leave behind on each leaf page when fillfactor is 90,
1341 * allowing us to get close to fillfactor% space utilization
1342 * when there happen to be a great many duplicates. (This
1343 * makes higher leaf fillfactor settings ineffective when
1344 * building indexes that have many duplicates, but packing
1345 * leaf pages full with few very large tuples doesn't seem
1346 * like a useful goal.)
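 *
 * (As an illustration: with the default 8 kB block size and 8-byte
 * MAXALIGN, this comes to MAXALIGN_DOWN(819) - sizeof(ItemIdData) =
 * 816 - 4 = 812 bytes.)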
1347 */
1348 dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1349 sizeof(ItemIdData);
1350 Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) &&
1351 dstate->maxpostingsize <= INDEX_SIZE_MASK);
1352 dstate->htids = palloc(dstate->maxpostingsize);
1353
1354 /* start new pending posting list with itup copy */
1355 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1356 InvalidOffsetNumber);
1357 }
1358 else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1359 itup) > keysz &&
1360 _bt_dedup_save_htid(dstate, itup))
1361 {
1362 /*
1363 * Tuple is equal to base tuple of pending posting list. Heap
1364 * TID from itup has been saved in state.
1365 */
1366 }
1367 else
1368 {
1369 /*
1370 * Tuple is not equal to pending posting list tuple, or
1371 * _bt_dedup_save_htid() opted to not merge current item into
1372 * pending posting list.
1373 */
1374 _bt_sort_dedup_finish_pending(wstate, state, dstate);
1375 pfree(dstate->base);
1376
1377 /* start new pending posting list with itup copy */
1378 _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1379 InvalidOffsetNumber);
1380 }
1381
1382 /* Report progress */
1383 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1384 ++tuples_done);
1385 }
1386
1387 if (state)
1388 {
1389 /*
1390 * Handle the last item (there must be a last item when the
1391 * tuplesort returned one or more tuples)
1392 */
1393 _bt_sort_dedup_finish_pending(wstate, state, dstate);
1394 pfree(dstate->base);
1395 pfree(dstate->htids);
1396 }
1397
1398 pfree(dstate);
1399 }
1400 else
1401 {
1402 /* merging and deduplication are both unnecessary */
1403 while ((itup = tuplesort_getindextuple(btspool->sortstate,
1404 true)) != NULL)
1405 {
1406 /* When we see first tuple, create first index page */
1407 if (state == NULL)
1408 state = _bt_pagestate(wstate, 0);
1409
1410 _bt_buildadd(wstate, state, itup, 0);
1411
1412 /* Report progress */
1413 pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1414 ++tuples_done);
1415 }
1416 }
1417
1418 /* Close down final pages and write the metapage */
1419 _bt_uppershutdown(wstate, state);
1420
1421 /*
1422 * Even though we WAL-logged the index pages, we must nonetheless fsync the index files.
1423 * Since we're building outside shared buffers, a CHECKPOINT occurring
1424 * during the build has no way to flush the previously written data to
1425 * disk (indeed it won't know the index even exists). A crash later on
1426 * would replay WAL from the checkpoint, therefore it wouldn't replay our
1427 * earlier WAL entries. If we do not fsync those pages here, they might
1428 * still not be on disk when the crash occurs.
1429 */
1430 if (wstate->btws_use_wal)
1431 {
1432 RelationOpenSmgr(wstate->index);
1433 smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
1434 }
1435 }
1436
1437 /*
1438 * Create parallel context, and launch workers for leader.
1439 *
1440 * buildstate argument should be initialized (with the exception of the
1441 * tuplesort state in spools, which may later be created based on shared
1442 * state initially set up here).
1443 *
1444 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1445 *
1446 * request is the target number of parallel worker processes to launch.
1447 *
1448 * Sets buildstate's BTLeader, which caller must use to shut down parallel
1449 * mode by passing it to _bt_end_parallel() at the very end of its index
1450 * build. If not even a single worker process can be launched, this is
1451 * never set, and caller should proceed with a serial index build.
1452 */
1453 static void
1454 _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1455 {
1456 ParallelContext *pcxt;
1457 int scantuplesortstates;
1458 Snapshot snapshot;
1459 Size estbtshared;
1460 Size estsort;
1461 BTShared *btshared;
1462 Sharedsort *sharedsort;
1463 Sharedsort *sharedsort2;
1464 BTSpool *btspool = buildstate->spool;
1465 BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
1466 WalUsage *walusage;
1467 BufferUsage *bufferusage;
1468 bool leaderparticipates = true;
1469 int querylen;
1470
1471 #ifdef DISABLE_LEADER_PARTICIPATION
1472 leaderparticipates = false;
1473 #endif
1474
1475 /*
1476 * Enter parallel mode, and create context for parallel build of btree
1477 * index
1478 */
1479 EnterParallelMode();
1480 Assert(request > 0);
1481 pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1482 request);
1483
1484 scantuplesortstates = leaderparticipates ? request + 1 : request;
1485
1486 /*
1487 * Prepare for scan of the base relation. In a normal index build, we use
1488 * SnapshotAny because we must retrieve all tuples and do our own time
1489 * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1490 * concurrent build, we take a regular MVCC snapshot and index whatever's
1491 * live according to that.
1492 */
1493 if (!isconcurrent)
1494 snapshot = SnapshotAny;
1495 else
1496 snapshot = RegisterSnapshot(GetTransactionSnapshot());
1497
1498 /*
1499 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1500 * PARALLEL_KEY_TUPLESORT tuplesort workspace
1501 */
1502 estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
1503 shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1504 estsort = tuplesort_estimate_shared(scantuplesortstates);
1505 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1506
1507 /*
1508 * Unique case requires a second spool, and so we may have to account for
1509 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1510 */
1511 if (!btspool->isunique)
1512 shm_toc_estimate_keys(&pcxt->estimator, 2);
1513 else
1514 {
1515 shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1516 shm_toc_estimate_keys(&pcxt->estimator, 3);
1517 }
1518
1519 /*
     * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
     * and PARALLEL_KEY_BUFFER_USAGE.
     *
     * If there are no extensions loaded that care, we could skip this.  We
     * have no way of knowing whether anyone's looking at pgWalUsage or
     * pgBufferUsage, so do it unconditionally.
     */
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);
    shm_toc_estimate_chunk(&pcxt->estimator,
                           mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_estimate_keys(&pcxt->estimator, 1);

    /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
    if (debug_query_string)
    {
        querylen = strlen(debug_query_string);
        shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
        shm_toc_estimate_keys(&pcxt->estimator, 1);
    }
    else
        querylen = 0;           /* keep compiler quiet */

    /* Everyone's had a chance to ask for space, so now create the DSM */
    InitializeParallelDSM(pcxt);

    /* If no DSM segment was available, back out (do serial build) */
    if (pcxt->seg == NULL)
    {
        if (IsMVCCSnapshot(snapshot))
            UnregisterSnapshot(snapshot);
        DestroyParallelContext(pcxt);
        ExitParallelMode();
        return;
    }

    /* Store shared build state, for which we reserved space */
    btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
    /* Initialize immutable state */
    btshared->heaprelid = RelationGetRelid(btspool->heap);
    btshared->indexrelid = RelationGetRelid(btspool->index);
    btshared->isunique = btspool->isunique;
    btshared->isconcurrent = isconcurrent;
    btshared->scantuplesortstates = scantuplesortstates;
    ConditionVariableInit(&btshared->workersdonecv);
    SpinLockInit(&btshared->mutex);
    /* Initialize mutable state */
    btshared->nparticipantsdone = 0;
    btshared->reltuples = 0.0;
    btshared->havedead = false;
    btshared->indtuples = 0.0;
    btshared->brokenhotchain = false;
    table_parallelscan_initialize(btspool->heap,
                                  ParallelTableScanFromBTShared(btshared),
                                  snapshot);

    /*
     * Store shared tuplesort-private state, for which we reserved space.
     * Then, initialize opaque state using tuplesort routine.
     */
    sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
    tuplesort_initialize_shared(sharedsort, scantuplesortstates,
                                pcxt->seg);

    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

    /* Unique case requires a second spool, and associated shared state */
    if (!btspool->isunique)
        sharedsort2 = NULL;
    else
    {
        /*
         * Store additional shared tuplesort-private state, for which we
         * reserved space.  Then, initialize opaque state using tuplesort
         * routine.
         */
        sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
        tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
                                    pcxt->seg);

        shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
    }

    /* Store query string for workers */
    if (debug_query_string)
    {
        char       *sharedquery;

        sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
        memcpy(sharedquery, debug_query_string, querylen + 1);
        shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
    }

    /*
     * Allocate space for each worker's WalUsage and BufferUsage; no need to
     * initialize.
     */
    walusage = shm_toc_allocate(pcxt->toc,
                                mul_size(sizeof(WalUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
    bufferusage = shm_toc_allocate(pcxt->toc,
                                   mul_size(sizeof(BufferUsage), pcxt->nworkers));
    shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

    /* Launch workers, saving status for leader/caller */
    LaunchParallelWorkers(pcxt);
    btleader->pcxt = pcxt;
    btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
    if (leaderparticipates)
        btleader->nparticipanttuplesorts++;
    btleader->btshared = btshared;
    btleader->sharedsort = sharedsort;
    btleader->sharedsort2 = sharedsort2;
    btleader->snapshot = snapshot;
    btleader->walusage = walusage;
    btleader->bufferusage = bufferusage;

    /* If no workers were successfully launched, back out (do serial build) */
    if (pcxt->nworkers_launched == 0)
    {
        _bt_end_parallel(btleader);
        return;
    }

    /* Save leader state now that it's clear build will be parallel */
    buildstate->btleader = btleader;

    /* Join heap scan ourselves */
    if (leaderparticipates)
        _bt_leader_participate_as_worker(buildstate);

    /*
     * Caller needs to wait for all launched workers when we return.  Make
     * sure that the failure-to-start case will not hang forever.
     */
    WaitForParallelWorkersToAttach(pcxt);
}

/*
 * Shut down workers, destroy parallel context, and end parallel mode.
 */
static void
_bt_end_parallel(BTLeader *btleader)
{
    int         i;

    /* Shutdown worker processes */
    WaitForParallelWorkersToFinish(btleader->pcxt);

    /*
     * Next, accumulate WAL and buffer usage.  (This must wait for the
     * workers to finish, or we might get incomplete data.)
     */
    for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
        InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);

    /* Free last reference to MVCC snapshot, if one was used */
    if (IsMVCCSnapshot(btleader->snapshot))
        UnregisterSnapshot(btleader->snapshot);
    DestroyParallelContext(btleader->pcxt);
    ExitParallelMode();
}

/*
 * Returns size of shared memory required to store state for a parallel
 * btree index build based on the snapshot its parallel scan will use.
 */
static Size
_bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
{
    /* cf. shm_toc_allocate as to why BUFFERALIGN is used */
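    /*
     * (shm_toc_allocate rounds each allocation up to a BUFFERALIGN boundary,
     * so the estimate must do the same or it could come up short.)
     */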
    return add_size(BUFFERALIGN(sizeof(BTShared)),
                    table_parallelscan_estimate(heap, snapshot));
}

/*
 * Within leader, wait for end of heap scan.
 *
 * When called, the parallel heap scan started by _bt_begin_parallel() will
 * already be underway within worker processes (when the leader participates
 * as a worker, we should end up here just as workers are finishing).
 *
 * Fills in fields needed for ambuild statistics, and lets caller set
 * field indicating that some worker encountered a broken HOT chain.
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
{
    BTShared   *btshared = buildstate->btleader->btshared;
    int         nparticipanttuplesorts;
    double      reltuples;

    nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
    for (;;)
    {
        SpinLockAcquire(&btshared->mutex);
        if (btshared->nparticipantsdone == nparticipanttuplesorts)
        {
            buildstate->havedead = btshared->havedead;
            buildstate->indtuples = btshared->indtuples;
            *brokenhotchain = btshared->brokenhotchain;
            reltuples = btshared->reltuples;
            SpinLockRelease(&btshared->mutex);
            break;
        }
        SpinLockRelease(&btshared->mutex);

        ConditionVariableSleep(&btshared->workersdonecv,
                               WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
    }

    ConditionVariableCancelSleep();

    return reltuples;
}

/*
 * Within leader, participate as a parallel worker.
 */
static void
_bt_leader_participate_as_worker(BTBuildState *buildstate)
{
    BTLeader   *btleader = buildstate->btleader;
    BTSpool    *leaderworker;
    BTSpool    *leaderworker2;
    int         sortmem;

    /* Allocate memory and initialize private spool */
    leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
    leaderworker->heap = buildstate->spool->heap;
    leaderworker->index = buildstate->spool->index;
    leaderworker->isunique = buildstate->spool->isunique;

    /* Initialize second spool, if required */
    if (!btleader->btshared->isunique)
        leaderworker2 = NULL;
    else
    {
        /* Allocate memory for worker's own private secondary spool */
        leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));

        /* Initialize worker's own secondary spool */
        leaderworker2->heap = leaderworker->heap;
        leaderworker2->index = leaderworker->index;
        leaderworker2->isunique = false;
    }

    /*
     * Might as well use a reliable figure when doling out
     * maintenance_work_mem (when the requested number of workers was not
     * launched, this will be somewhat higher than it is for other workers).
     */
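    /*
     * Purely as an illustration: with maintenance_work_mem = 64MB and three
     * participants (two launched workers plus the leader), each
     * participant's sort gets roughly 21MB.
     */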
    sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;

    /* Perform work common to all participants */
    _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
                               btleader->sharedsort, btleader->sharedsort2,
                               sortmem, true);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
        ResetUsage();
    }
#endif                          /* BTREE_BUILD_STATS */
}

/*
 * Perform work within a launched parallel process.
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
    char       *sharedquery;
    BTSpool    *btspool;
    BTSpool    *btspool2;
    BTShared   *btshared;
    Sharedsort *sharedsort;
    Sharedsort *sharedsort2;
    Relation    heapRel;
    Relation    indexRel;
    LOCKMODE    heapLockmode;
    LOCKMODE    indexLockmode;
    WalUsage   *walusage;
    BufferUsage *bufferusage;
    int         sortmem;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif                          /* BTREE_BUILD_STATS */

    /* Set debug_query_string for individual workers first */
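    /*
     * (The query text key is only present when the leader had a
     * debug_query_string, so this lookup may return NULL.)
     */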
    sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
    debug_query_string = sharedquery;

    /* Report the query string from leader */
    pgstat_report_activity(STATE_RUNNING, debug_query_string);

    /* Look up nbtree shared state */
    btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

    /* Open relations using lock modes known to be obtained by index.c */
    if (!btshared->isconcurrent)
    {
        heapLockmode = ShareLock;
        indexLockmode = AccessExclusiveLock;
    }
    else
    {
        heapLockmode = ShareUpdateExclusiveLock;
        indexLockmode = RowExclusiveLock;
    }

    /* Open relations within worker */
    heapRel = table_open(btshared->heaprelid, heapLockmode);
    indexRel = index_open(btshared->indexrelid, indexLockmode);

    /* Initialize worker's own spool */
    btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    btspool->heap = heapRel;
    btspool->index = indexRel;
    btspool->isunique = btshared->isunique;

    /* Look up shared state private to tuplesort.c */
    sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
    tuplesort_attach_shared(sharedsort, seg);
    if (!btshared->isunique)
    {
        btspool2 = NULL;
        sharedsort2 = NULL;
    }
    else
    {
        /* Allocate memory for worker's own private secondary spool */
        btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));

        /* Initialize worker's own secondary spool */
        btspool2->heap = btspool->heap;
        btspool2->index = btspool->index;
        btspool2->isunique = false;
        /* Look up shared state private to tuplesort.c */
        sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
        tuplesort_attach_shared(sharedsort2, seg);
    }

    /* Prepare to track buffer usage during parallel execution */
    InstrStartParallelQuery();

    /* Perform sorting of spool, and possibly a spool2 */
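    /*
     * Divide maintenance_work_mem evenly among the planned participants
     * (scantuplesortstates); this is conservative if fewer workers were
     * actually launched than were planned.
     */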
    sortmem = maintenance_work_mem / btshared->scantuplesortstates;
    _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
                               sharedsort2, sortmem, false);

    /* Report WAL/buffer usage during parallel execution */
    bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
    walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
    InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
                          &walusage[ParallelWorkerNumber]);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
        ResetUsage();
    }
#endif                          /* BTREE_BUILD_STATS */

    index_close(indexRel, indexLockmode);
    table_close(heapRel, heapLockmode);
}

/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for the passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
                           BTShared *btshared, Sharedsort *sharedsort,
                           Sharedsort *sharedsort2, int sortmem, bool progress)
{
    SortCoordinate coordinate;
    BTBuildState buildstate;
    TableScanDesc scan;
    double      reltuples;
    IndexInfo  *indexInfo;

    /* Initialize local tuplesort coordination state */
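    /*
     * (nParticipants is set to -1 here because the participant count is
     * only meaningful for the leader's coordination state.)
     */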
    coordinate = palloc0(sizeof(SortCoordinateData));
    coordinate->isWorker = true;
    coordinate->nParticipants = -1;
    coordinate->sharedsort = sharedsort;

    /* Begin "partial" tuplesort */
    btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
                                                     btspool->index,
                                                     btspool->isunique,
                                                     sortmem, coordinate,
                                                     false);

    /*
     * Just as with serial case, there may be a second spool.  If so, a
     * second, dedicated spool2 partial tuplesort is required.
     */
    if (btspool2)
    {
        SortCoordinate coordinate2;

        /*
         * We expect that the second one (for dead tuples) won't get very
         * full, so we give it only work_mem (unless sortmem is less for the
         * worker).  Worker processes are generally permitted to allocate
         * work_mem independently.
         */
        coordinate2 = palloc0(sizeof(SortCoordinateData));
        coordinate2->isWorker = true;
        coordinate2->nParticipants = -1;
        coordinate2->sharedsort = sharedsort2;
        btspool2->sortstate =
            tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
                                        Min(sortmem, work_mem), coordinate2,
                                        false);
    }

    /* Fill in buildstate for _bt_build_callback() */
    buildstate.isunique = btshared->isunique;
    buildstate.havedead = false;
    buildstate.heap = btspool->heap;
    buildstate.spool = btspool;
    buildstate.spool2 = btspool2;
    buildstate.indtuples = 0;
    buildstate.btleader = NULL;

    /* Join parallel scan */
    indexInfo = BuildIndexInfo(btspool->index);
    indexInfo->ii_Concurrent = btshared->isconcurrent;
    scan = table_beginscan_parallel(btspool->heap,
                                    ParallelTableScanFromBTShared(btshared));
    reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
                                       true, progress, _bt_build_callback,
                                       (void *) &buildstate, scan);

    /* Execute this worker's part of the sort */
    if (progress)
        pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                                     PROGRESS_BTREE_PHASE_PERFORMSORT_1);
    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
    {
        if (progress)
            pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
                                         PROGRESS_BTREE_PHASE_PERFORMSORT_2);
        tuplesort_performsort(btspool2->sortstate);
    }

    /*
     * Done.  Record ambuild statistics, and whether we encountered a broken
     * HOT chain.
     */
    SpinLockAcquire(&btshared->mutex);
    btshared->nparticipantsdone++;
    btshared->reltuples += reltuples;
    if (buildstate.havedead)
        btshared->havedead = true;
    btshared->indtuples += buildstate.indtuples;
    if (indexInfo->ii_BrokenHotChain)
        btshared->brokenhotchain = true;
    SpinLockRelease(&btshared->mutex);

    /* Notify leader */
    ConditionVariableSignal(&btshared->workersdonecv);

    /* We can end tuplesorts immediately */
    tuplesort_end(btspool->sortstate);
    if (btspool2)
        tuplesort_end(btspool2->sortstate);
}
