/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *		Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
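 *
 * (Illustrative aside, not something this module does itself: the leaf
 * fill factor mentioned above is the ordinary btree "fillfactor" storage
 * parameter, so a command along the lines of
 * CREATE INDEX ... WITH (fillfactor = 70) would make the leaf pages be
 * packed less densely than the 90% default.)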
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 * This code isn't concerned about the FSM at all. The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsort.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/parallel.h"
#include "access/relscan.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"		/* pgrminclude ignore */
#include "utils/rel.h"
#include "utils/sortsupport.h"
#include "utils/tuplesort.h"


/* Magic numbers for parallel state sharing */
#define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2	UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xA000000000000004)
#define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xA000000000000005)

/*
 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
 * parallel index builds.  This may be useful as a debugging aid.
#undef DISABLE_LEADER_PARTICIPATION
 */

/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
typedef struct BTSpool
{
	Tuplesortstate *sortstate;	/* state data for tuplesort.c */
	Relation	heap;
	Relation	index;
	bool		isunique;
} BTSpool;

/*
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort
 * TOC entry, private to tuplesort.c but allocated by this module on its
 * behalf.
 */
typedef struct BTShared
{
	/*
	 * These fields are not modified during the sort.  They primarily exist
	 * for the benefit of worker processes that need to create BTSpool state
	 * corresponding to that used by the leader.
	 */
	Oid			heaprelid;
	Oid			indexrelid;
	bool		isunique;
	bool		isconcurrent;
	int			scantuplesortstates;

	/*
	 * workersdonecv is used to monitor the progress of workers.  All
	 * parallel participants must indicate that they are done before leader
	 * can use mutable state that workers maintain during scan (and before
	 * leader can proceed to tuplesort_performsort()).
	 */
	ConditionVariable workersdonecv;

	/*
	 * mutex protects all fields before heapdesc.
	 *
	 * These fields contain status information of interest to B-Tree index
	 * builds that must work just the same when an index is built in
	 * parallel.
	 */
	slock_t		mutex;

	/*
	 * Mutable state that is maintained by workers, and reported back to
	 * leader at end of parallel scan.
	 *
	 * nparticipantsdone is number of worker processes finished.
	 *
	 * reltuples is the total number of input heap tuples.
	 *
	 * havedead indicates if RECENTLY_DEAD tuples were encountered during
	 * build.
	 *
	 * indtuples is the total number of tuples that made it into the index.
	 *
	 * brokenhotchain indicates if any worker detected a broken HOT chain
	 * during build.
	 */
	int			nparticipantsdone;
	double		reltuples;
	bool		havedead;
	double		indtuples;
	bool		brokenhotchain;

	/*
	 * This variable-sized field must come last.
	 *
	 * See _bt_parallel_estimate_shared().
	 */
	ParallelHeapScanDescData heapdesc;
} BTShared;

/*
 * Status for leader in parallel index build.
 */
typedef struct BTLeader
{
	/* parallel context itself */
	ParallelContext *pcxt;

	/*
	 * nparticipanttuplesorts is the exact number of worker processes
	 * successfully launched, plus one leader process if it participates as
	 * a worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
	 * participating as a worker).
	 */
	int			nparticipanttuplesorts;

	/*
	 * Leader process convenience pointers to shared state (leader avoids
	 * TOC lookups).
	 *
	 * btshared is the shared state for entire build.  sharedsort is the
	 * shared, tuplesort-managed state passed to each process tuplesort.
	 * sharedsort2 is the corresponding btspool2 shared state, used only
	 * when building unique indexes.  snapshot is the snapshot used by the
	 * scan iff an MVCC snapshot is required.
	 */
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Snapshot	snapshot;
	BufferUsage *bufferusage;
} BTLeader;

/*
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 */
typedef struct BTBuildState
{
	bool		isunique;
	bool		havedead;
	Relation	heap;
	BTSpool    *spool;

	/*
	 * spool2 is needed only when the index is a unique index.  Dead tuples
	 * are put into spool2 instead of spool in order to avoid uniqueness
	 * check.
	 */
	BTSpool    *spool2;
	double		indtuples;

	/*
	 * btleader is only present when a parallel index build is performed,
	 * and only in the leader process. (Actually, only the leader has a
	 * BTBuildState.  Workers have their own spool and spool2, though.)
	 */
	BTLeader   *btleader;
} BTBuildState;

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
	Page		btps_page;		/* workspace for page building */
	BlockNumber btps_blkno;		/* block # to write this page at */
	IndexTuple	btps_minkey;	/* copy of minimum key (first item) on page */
	OffsetNumber btps_lastoff;	/* last item offset loaded */
	uint32		btps_level;		/* tree level (0 = leaf) */
	Size		btps_full;		/* "full" if less than this much free space */
	struct BTPageState *btps_next;	/* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
	Relation	heap;
	Relation	index;
	bool		btws_use_wal;	/* dump pages to WAL? */
	BlockNumber btws_pages_alloced; /* # pages allocated */
	BlockNumber btws_pages_written; /* # pages written out */
	Page		btws_zeropage;	/* workspace for filling zeroes */
} BTWriteState;


static double _bt_spools_heapscan(Relation heap, Relation index,
					BTBuildState *buildstate, IndexInfo *indexInfo);
static void _bt_spooldestroy(BTSpool *btspool);
static void _bt_spool(BTSpool *btspool, ItemPointer self,
		  Datum *values, bool *isnull);
static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
static void _bt_build_callback(Relation index, HeapTuple htup, Datum *values,
				   bool *isnull, bool tupleIsAlive, void *state);
static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
			   IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
			 IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
		 BTSpool *btspool, BTSpool *btspool2);
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
				   int request);
static void _bt_end_parallel(BTLeader *btleader);
static Size _bt_parallel_estimate_shared(Snapshot snapshot);
static double _bt_parallel_heapscan(BTBuildState *buildstate,
					  bool *brokenhotchain);
static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
						   BTShared *btshared, Sharedsort *sharedsort,
						   Sharedsort *sharedsort2, int sortmem);


/*
 *	btbuild() -- build a new btree index.
 */
IndexBuildResult *
btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	BTBuildState buildstate;
	double		reltuples;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	buildstate.isunique = indexInfo->ii_Unique;
	buildstate.havedead = false;
	buildstate.heap = heap;
	buildstate.spool = NULL;
	buildstate.spool2 = NULL;
	buildstate.indtuples = 0;
	buildstate.btleader = NULL;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);

	/*
	 * Finish the build by (1) completing the sort of the spool file, (2)
	 * inserting the sorted tuples into btree pages and (3) building the
	 * upper levels.  Finally, it may also be necessary to end use of
	 * parallelism.
	 */
	_bt_leafbuild(buildstate.spool, buildstate.spool2);
	_bt_spooldestroy(buildstate.spool);
	if (buildstate.spool2)
		_bt_spooldestroy(buildstate.spool2);
	if (buildstate.btleader)
		_bt_end_parallel(buildstate.btleader);

	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD STATS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	return result;
}

/*
 * Create and initialize one or two spool structures, and save them in
 * caller's buildstate argument.  May also fill in fields within indexInfo
 * used by index builds.
 *
 * Scans the heap, possibly in parallel, filling spools with IndexTuples.
 * This routine encapsulates all aspects of managing parallelism.  Caller
 * need only call _bt_end_parallel() in parallel case after it is done with
 * spool/spool2.
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
					IndexInfo *indexInfo)
{
	BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
	SortCoordinate coordinate = NULL;
	double		reltuples = 0;

	/*
	 * We size the sort area as maintenance_work_mem rather than work_mem to
	 * speed index creation.  This should be OK since a single backend can't
	 * run multiple index creations in parallel (see also: notes on
	 * parallelism and maintenance_work_mem below).
	 */
	btspool->heap = heap;
	btspool->index = index;
	btspool->isunique = indexInfo->ii_Unique;

	/* Save as primary spool */
	buildstate->spool = btspool;

	/* Attempt to launch parallel worker scan when required */
	if (indexInfo->ii_ParallelWorkers > 0)
		_bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
						   indexInfo->ii_ParallelWorkers);

	/*
	 * If parallel build requested and at least one worker process was
	 * successfully launched, set up coordination state
	 */
	if (buildstate->btleader)
	{
		coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
		coordinate->isWorker = false;
		coordinate->nParticipants =
			buildstate->btleader->nparticipanttuplesorts;
		coordinate->sharedsort = buildstate->btleader->sharedsort;
	}

	/*
	 * Begin serial/leader tuplesort.
	 *
	 * In cases where parallelism is involved, the leader receives the same
	 * share of maintenance_work_mem as a serial sort (it is generally
	 * treated in the same way as a serial sort once we return).  Parallel
	 * worker Tuplesortstates will have received only a fraction of
	 * maintenance_work_mem, though.
	 *
	 * We rely on the lifetime of the Leader Tuplesortstate almost not
	 * overlapping with any worker Tuplesortstate's lifetime.  There may be
	 * some small overlap, but that's okay because we rely on leader
	 * Tuplesortstate only allocating a small, fixed amount of memory here.
	 * When its tuplesort_performsort() is called (by our caller), and
	 * significant amounts of memory are likely to be used, all workers
	 * must have already freed almost all memory held by their
	 * Tuplesortstates (they are about to go away completely, too).
	 * The overall effect is that maintenance_work_mem always represents an
	 * absolute high watermark on the amount of memory used by a CREATE
	 * INDEX operation, regardless of the use of parallelism or any other
	 * factor.
	 */
	buildstate->spool->sortstate =
		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
									maintenance_work_mem, coordinate,
									false);

	/*
	 * If building a unique index, put dead tuples in a second spool to keep
	 * them out of the uniqueness check.  We expect that the second spool
	 * (for dead tuples) won't get very full, so we give it only work_mem.
	 */
	if (indexInfo->ii_Unique)
	{
		BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
		SortCoordinate coordinate2 = NULL;

		/* Initialize secondary spool */
		btspool2->heap = heap;
		btspool2->index = index;
		btspool2->isunique = false;
		/* Save as secondary spool */
		buildstate->spool2 = btspool2;

		if (buildstate->btleader)
		{
			/*
			 * Set up non-private state that is passed to
			 * tuplesort_begin_index_btree() about the basic high level
			 * coordination of a parallel sort.
			 */
			coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
			coordinate2->isWorker = false;
			coordinate2->nParticipants =
				buildstate->btleader->nparticipanttuplesorts;
			coordinate2->sharedsort = buildstate->btleader->sharedsort2;
		}

		/*
		 * We expect that the second one (for dead tuples) won't get very
		 * full, so we give it only work_mem
		 */
		buildstate->spool2->sortstate =
			tuplesort_begin_index_btree(heap, index, false, work_mem,
										coordinate2, false);
	}

	/* Fill spool using either serial or parallel heap scan */
	if (!buildstate->btleader)
		reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
									   _bt_build_callback, (void *) buildstate,
									   NULL);
	else
		reltuples = _bt_parallel_heapscan(buildstate,
										  &indexInfo->ii_BrokenHotChain);

	/* okay, all heap tuples are spooled */
	if (buildstate->spool2 && !buildstate->havedead)
	{
		/* spool2 turns out to be unnecessary */
		_bt_spooldestroy(buildstate->spool2);
		buildstate->spool2 = NULL;
	}

	return reltuples;
}

/*
 * clean up a spool structure and its substructures.
 */
static void
_bt_spooldestroy(BTSpool *btspool)
{
	tuplesort_end(btspool->sortstate);
	pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
static void
_bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
{
	tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
								  self, values, isnull);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
static void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
	BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	tuplesort_performsort(btspool->sortstate);
	if (btspool2)
		tuplesort_performsort(btspool2->sortstate);

	wstate.heap = btspool->heap;
	wstate.index = btspool->index;

	/*
	 * We need to log index creation in WAL iff WAL archiving/streaming is
	 * enabled UNLESS the index isn't WAL-logged anyway.
	 */
	wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);

	/* reserve the metapage */
	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
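	/*
	 * (BTREE_METAPAGE is block number 0, so with this reservation the leaf
	 * and internal pages built below are handed out starting at block 1.)
	 */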
	wstate.btws_pages_written = 0;
	wstate.btws_zeropage = NULL;	/* until needed */

	_bt_load(&wstate, btspool, btspool2);
}

/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
_bt_build_callback(Relation index,
				   HeapTuple htup,
				   Datum *values,
				   bool *isnull,
				   bool tupleIsAlive,
				   void *state)
{
	BTBuildState *buildstate = (BTBuildState *) state;

	/*
	 * insert the index tuple into the appropriate spool file for subsequent
	 * processing
	 */
	if (tupleIsAlive || buildstate->spool2 == NULL)
		_bt_spool(buildstate->spool, &htup->t_self, values, isnull);
	else
	{
		/* dead tuples are put into spool2 */
		buildstate->havedead = true;
		_bt_spool(buildstate->spool2, &htup->t_self, values, isnull);
	}

	buildstate->indtuples += 1;
}

/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
	Page		page;
	BTPageOpaque opaque;

	page = (Page) palloc(BLCKSZ);

	/* Zero the page and set up standard page header info */
	_bt_pageinit(page, BLCKSZ);

	/* Initialize BT opaque state */
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_prev = opaque->btpo_next = P_NONE;
	opaque->btpo.level = level;
	opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
	opaque->btpo_cycleid = 0;

	/* Make the P_HIKEY line pointer appear allocated */
	((PageHeader) page)->pd_lower += sizeof(ItemIdData);

	return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
	/* Ensure rd_smgr is open (could have been closed by relcache flush!) */
	RelationOpenSmgr(wstate->index);

	/* XLOG stuff */
	if (wstate->btws_use_wal)
	{
		/* We use the heap NEWPAGE record type for this */
		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
	}

	/*
	 * If we have to write pages nonsequentially, fill in the space with
	 * zeroes until we come back and overwrite.  This is not logically
	 * necessary on standard Unix filesystems (unwritten space will read as
	 * zeroes anyway), but it should help to avoid fragmentation.  The dummy
	 * pages aren't WAL-logged though.
	 */
	while (blkno > wstate->btws_pages_written)
	{
		if (!wstate->btws_zeropage)
			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
		/* don't set checksum for all-zero page */
		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
				   wstate->btws_pages_written++,
				   (char *) wstate->btws_zeropage,
				   true);
	}

	PageSetChecksumInplace(page, blkno);

	/*
	 * Now write the page.  There's no need for smgr to schedule an fsync
	 * for this write; we'll do it ourselves before ending the build.
	 */
	if (blkno == wstate->btws_pages_written)
	{
		/* extending the file... */
		smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
				   (char *) page, true);
		wstate->btws_pages_written++;
	}
	else
	{
		/* overwriting a block we zero-filled before */
		smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
				  (char *) page, true);
	}

	pfree(page);
}

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

	/* create initial page for level */
	state->btps_page = _bt_blnewpage(level);

	/* and assign it a page position */
	state->btps_blkno = wstate->btws_pages_alloced++;

	state->btps_minkey = NULL;
	/* initialize lastoff so first item goes into P_FIRSTKEY */
	state->btps_lastoff = P_HIKEY;
	state->btps_level = level;
	/* set "full" threshold based on level.  See notes at head of file. */
	if (level > 0)
		state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
	else
		state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
														  BTREE_DEFAULT_FILLFACTOR);
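	/*
	 * (Illustrative arithmetic only: assuming the default 8192-byte BLCKSZ
	 * and the 70% BTREE_NONLEAF_FILLFACTOR noted at the head of this file,
	 * an upper-level page is treated as "full" once less than
	 * 8192 * 30 / 100 = 2457 bytes of free space remain.)
	 */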
	/* no parent level, yet */
	state->btps_next = NULL;

	return state;
}

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
static void
_bt_slideleft(Page page)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	ItemId		previi;
	ItemId		thisii;

	if (!PageIsEmpty(page))
	{
		maxoff = PageGetMaxOffsetNumber(page);
		previi = PageGetItemId(page, P_HIKEY);
		for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
		{
			thisii = PageGetItemId(page, off);
			*previi = *thisii;
			previi = thisii;
		}
		((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
	}
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
			   Size itemsize,
			   IndexTuple itup,
			   OffsetNumber itup_off)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTupleData trunctuple;

	if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
	{
		trunctuple = *itup;
		trunctuple.t_info = sizeof(IndexTupleData);
		BTreeTupleSetNAtts(&trunctuple, 0);
		itup = &trunctuple;
		itemsize = sizeof(IndexTupleData);
	}

	if (PageAddItem(page, (Item) itup, itemsize, itup_off,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "failed to add item to the index page");
}

/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *	 stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.  If
 * the high key is to be truncated, offset 1 is deleted, and we insert
 * the truncated high key at offset 1.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
	Page		npage;
	BlockNumber nblkno;
	OffsetNumber last_off;
	Size		pgspc;
	Size		itupsz;
	int			indnatts = IndexRelationGetNumberOfAttributes(wstate->index);
	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(wstate->index);

	/*
	 * This is a handy place to check for cancel interrupts during the btree
	 * load phase of index creation.
	 */
	CHECK_FOR_INTERRUPTS();

	npage = state->btps_page;
	nblkno = state->btps_blkno;
	last_off = state->btps_lastoff;

	pgspc = PageGetFreeSpace(npage);
	itupsz = IndexTupleSize(itup);
	itupsz = MAXALIGN(itupsz);

	/*
	 * Check whether the item can fit on a btree page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  We actually need to be
	 * able to fit three items on every page, so restrict any one item to
	 * 1/3 the per-page available space.  Note that at this point, itupsz
	 * doesn't include the ItemId.
	 *
	 * NOTE: similar code appears in _bt_insertonpg() to defend against
	 * oversize items being inserted into an already-existing index.  But
	 * during creation of an index, we don't go through there.
	 */
	if (itupsz > BTMaxItemSize(npage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itupsz, BTMaxItemSize(npage),
						RelationGetRelationName(wstate->index)),
				 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
						 "Consider a function index of an MD5 hash of the value, "
						 "or use full text indexing."),
				 errtableconstraint(wstate->heap,
									RelationGetRelationName(wstate->index))));

	/*
	 * Check to see if page is "full".  It's definitely full if the item
	 * won't fit.  Otherwise, compare to the target freespace derived from
	 * the fillfactor.
	 * However, we must put at least two items on each page, so disregard
	 * fillfactor if we don't have that many.
	 */
	if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
	{
		/*
		 * Finish off the page and write it out.
		 */
		Page		opage = npage;
		BlockNumber oblkno = nblkno;
		ItemId		ii;
		ItemId		hii;
		IndexTuple	oitup;
		BTPageOpaque opageop = (BTPageOpaque) PageGetSpecialPointer(opage);

		/* Create new page of same level */
		npage = _bt_blnewpage(state->btps_level);

		/* and assign it a page position */
		nblkno = wstate->btws_pages_alloced++;

		/*
		 * We copy the last item on the page into the new page, and then
		 * rearrange the old page so that the 'last item' becomes its high
		 * key rather than a true data item.  There had better be at least
		 * two items on the page already, else the page would be empty of
		 * useful data.
		 */
		Assert(last_off > P_FIRSTKEY);
		ii = PageGetItemId(opage, last_off);
		oitup = (IndexTuple) PageGetItem(opage, ii);
		_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

		/*
		 * Move 'last' into the high key position on opage
		 */
		hii = PageGetItemId(opage, P_HIKEY);
		*hii = *ii;
		ItemIdSetUnused(ii);	/* redundant */
		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

		if (indnkeyatts != indnatts && P_ISLEAF(opageop))
		{
			IndexTuple	truncated;
			Size		truncsz;

			/*
			 * Truncate any non-key attributes from high key on leaf level
			 * (i.e. truncate on leaf level if we're building an INCLUDE
			 * index).  This is only done at the leaf level because
			 * downlinks in internal pages are either negative infinity
			 * items, or get their contents from copying from one level
			 * down.  See also: _bt_split().
			 *
			 * Since the truncated tuple is probably smaller than the
			 * original, it cannot just be copied in place (besides, we want
			 * to actually save space on the leaf page).  We delete the
			 * original high key, and add our own truncated high key at the
			 * same offset.
			 *
			 * Note that the page layout won't be changed very much.  oitup
			 * is already located at the physical beginning of tuple space,
			 * so we only shift the line pointer array back and forth, and
			 * overwrite the latter portion of the space occupied by the
			 * original tuple.  This is fairly cheap.
			 */
			truncated = _bt_nonkey_truncate(wstate->index, oitup);
			truncsz = IndexTupleSize(truncated);
			PageIndexTupleDelete(opage, P_HIKEY);
			_bt_sortaddtup(opage, truncsz, truncated, P_HIKEY);
			pfree(truncated);

			/* oitup should continue to point to the page's high key */
			hii = PageGetItemId(opage, P_HIKEY);
			oitup = (IndexTuple) PageGetItem(opage, hii);
		}

		/*
		 * Link the old page into its parent, using its minimum key.  If we
		 * don't have a parent, we have to create one; this adds a new btree
		 * level.
		 */
		if (state->btps_next == NULL)
			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) ==
			   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
			   P_LEFTMOST(opageop));
		Assert(BTreeTupleGetNAtts(state->btps_minkey, wstate->index) == 0 ||
			   !P_LEFTMOST(opageop));
		BTreeInnerTupleSetDownLink(state->btps_minkey, oblkno);
		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
		pfree(state->btps_minkey);

		/*
		 * Save a copy of the minimum key for the new page.  We have to copy
		 * it off the old page, not the new one, in case we are not at leaf
		 * level.
		 */
		state->btps_minkey = CopyIndexTuple(oitup);

		/*
		 * Set the sibling links for both pages.
		 */
		{
			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

			oopaque->btpo_next = nblkno;
			nopaque->btpo_prev = oblkno;
			nopaque->btpo_next = P_NONE;	/* redundant */
		}

		/*
		 * Write out the old page.  We never need to touch it again, so we
		 * can free the opage workspace too.
		 */
		_bt_blwritepage(wstate, opage, oblkno);

		/*
		 * Reset last_off to point to new page
		 */
		last_off = P_FIRSTKEY;
	}

	/*
	 * If the new item is the first for its page, stash a copy for later.
	 * Note this will only happen for the first item on a level; on later
	 * pages, the first item for a page is copied from the prior page in the
	 * code above.  Since the minimum key for an entire level is only used
	 * as a minus infinity downlink, and never as a high key, there is no
	 * need to truncate away non-key attributes at this point.
	 */
	if (last_off == P_HIKEY)
	{
		Assert(state->btps_minkey == NULL);
		state->btps_minkey = CopyIndexTuple(itup);
		/* _bt_sortaddtup() will perform full truncation later */
		BTreeTupleSetNAtts(state->btps_minkey, 0);
	}

	/*
	 * Add the new item into the current page.
	 */
	last_off = OffsetNumberNext(last_off);
	_bt_sortaddtup(npage, itupsz, itup, last_off);

	state->btps_page = npage;
	state->btps_blkno = nblkno;
	state->btps_lastoff = last_off;
}

/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
	BTPageState *s;
	BlockNumber rootblkno = P_NONE;
	uint32		rootlevel = 0;
	Page		metapage;

	/*
	 * Each iteration of this loop completes one more level of the tree.
	 */
	for (s = state; s != NULL; s = s->btps_next)
	{
		BlockNumber blkno;
		BTPageOpaque opaque;

		blkno = s->btps_blkno;
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

		/*
		 * We have to link the last page on this level to somewhere.
		 *
		 * If we're at the top, it's the root, so attach it to the metapage.
		 * Otherwise, add an entry for it to its parent using its minimum
		 * key.  This may cause the last page of the parent level to split,
		 * but that's not a problem -- we haven't gotten to it yet.
		 */
		if (s->btps_next == NULL)
		{
			opaque->btpo_flags |= BTP_ROOT;
			rootblkno = blkno;
			rootlevel = s->btps_level;
		}
		else
		{
			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) ==
				   IndexRelationGetNumberOfKeyAttributes(wstate->index) ||
				   P_LEFTMOST(opaque));
			Assert(BTreeTupleGetNAtts(s->btps_minkey, wstate->index) == 0 ||
				   !P_LEFTMOST(opaque));
			BTreeInnerTupleSetDownLink(s->btps_minkey, blkno);
			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
			pfree(s->btps_minkey);
			s->btps_minkey = NULL;
		}

		/*
		 * This is the rightmost page, so the ItemId array needs to be slid
		 * back one slot.  Then we can dump out the page.
		 */
		_bt_slideleft(s->btps_page);
		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
		s->btps_page = NULL;	/* writepage freed the workspace */
	}

	/*
	 * As the last step in the process, construct the metapage and make it
	 * point to the new root (unless we had no data at all, in which case
	 * it's set to point to "P_NONE").  This changes the index to the
	 * "valid" state by filling in a valid magic number in the metapage.
	 */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, rootblkno, rootlevel);
	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}

/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
	BTPageState *state = NULL;
	bool		merge = (btspool2 != NULL);
	IndexTuple	itup,
				itup2 = NULL;
	bool		load1;
	TupleDesc	tupdes = RelationGetDescr(wstate->index);
	int			i,
				keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
	ScanKey		indexScanKey = NULL;
	SortSupport sortKeys;

	if (merge)
	{
		/*
		 * Another BTSpool for dead tuples exists. Now we have to merge
		 * btspool and btspool2.
		 */

		/* the preparation of merge */
		itup = tuplesort_getindextuple(btspool->sortstate, true);
		itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
		indexScanKey = _bt_mkscankey_nodata(wstate->index);

		/* Prepare SortSupport data for each column */
		sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData));

		for (i = 0; i < keysz; i++)
		{
			SortSupport sortKey = sortKeys + i;
			ScanKey		scanKey = indexScanKey + i;
			int16		strategy;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = scanKey->sk_collation;
			sortKey->ssup_nulls_first =
				(scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
			sortKey->ssup_attno = scanKey->sk_attno;
			/* Abbreviation is not supported here */
			sortKey->abbreviate = false;

			AssertState(sortKey->ssup_attno != 0);

			strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ?
				BTGreaterStrategyNumber : BTLessStrategyNumber;

			PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey);
		}

		_bt_freeskey(indexScanKey);

		for (;;)
		{
			load1 = true;		/* load BTSpool next ? */
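			/*
			 * Decide which spool supplies the next tuple: if both spools
			 * still have tuples, they are compared column by column below
			 * and whichever sorts first is loaded (ties go to the primary
			 * spool); otherwise whichever spool is not yet exhausted is
			 * used.
			 */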
			if (itup2 == NULL)
			{
				if (itup == NULL)
					break;
			}
			else if (itup != NULL)
			{
				for (i = 1; i <= keysz; i++)
				{
					SortSupport entry;
					Datum		attrDatum1,
								attrDatum2;
					bool		isNull1,
								isNull2;
					int32		compare;

					entry = sortKeys + i - 1;
					attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
					attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

					compare = ApplySortComparator(attrDatum1, isNull1,
												  attrDatum2, isNull2,
												  entry);
					if (compare > 0)
					{
						load1 = false;
						break;
					}
					else if (compare < 0)
						break;
				}
			}
			else
				load1 = false;

			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			if (load1)
			{
				_bt_buildadd(wstate, state, itup);
				itup = tuplesort_getindextuple(btspool->sortstate, true);
			}
			else
			{
				_bt_buildadd(wstate, state, itup2);
				itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
			}
		}
		pfree(sortKeys);
	}
	else
	{
		/* merge is unnecessary */
		while ((itup = tuplesort_getindextuple(btspool->sortstate,
											   true)) != NULL)
		{
			/* When we see first tuple, create first index page */
			if (state == NULL)
				state = _bt_pagestate(wstate, 0);

			_bt_buildadd(wstate, state, itup);
		}
	}

	/* Close down final pages and write the metapage */
	_bt_uppershutdown(wstate, state);

	/*
	 * If the index is WAL-logged, we must fsync it down to disk before it's
	 * safe to commit the transaction.  (For a non-WAL-logged index we don't
	 * care since the index will be uninteresting after a crash anyway.)
	 *
	 * It's obvious that we must do this when not WAL-logging the build.
	 * It's less obvious that we have to do it even if we did WAL-log the
	 * index pages.  The reason is that since we're building outside shared
	 * buffers, a CHECKPOINT occurring during the build has no way to flush
	 * the previously written data to disk (indeed it won't know the index
	 * even exists).  A crash later on would replay WAL from the checkpoint,
	 * therefore it wouldn't replay our earlier WAL entries.  If we do not
	 * fsync those pages here, they might still not be on disk when the
	 * crash occurs.
	 */
	if (RelationNeedsWAL(wstate->index))
	{
		RelationOpenSmgr(wstate->index);
		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
	}
}

/*
 * Create parallel context, and launch workers for leader.
 *
 * buildstate argument should be initialized (with the exception of the
 * tuplesort state in spools, which may later be created based on shared
 * state initially set up here).
 *
 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 *
 * request is the target number of parallel worker processes to launch.
 *
 * Sets buildstate's BTLeader, which caller must use to shut down parallel
 * mode by passing it to _bt_end_parallel() at the very end of its index
 * build.  If not even a single worker process can be launched, this is
 * never set, and caller should proceed with a serial index build.
 */
static void
_bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
{
	ParallelContext *pcxt;
	int			scantuplesortstates;
	Snapshot	snapshot;
	Size		estbtshared;
	Size		estsort;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	BTSpool    *btspool = buildstate->spool;
	BTLeader   *btleader = (BTLeader *) palloc0(sizeof(BTLeader));
	BufferUsage *bufferusage;
	bool		leaderparticipates = true;
	int			querylen;

#ifdef DISABLE_LEADER_PARTICIPATION
	leaderparticipates = false;
#endif

	/*
	 * Enter parallel mode, and create context for parallel build of btree
	 * index
	 */
	EnterParallelMode();
	Assert(request > 0);
	pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
								 request, true);

	scantuplesortstates = leaderparticipates ? request + 1 : request;

	/*
	 * Prepare for scan of the base relation.  In a normal index build, we
	 * use SnapshotAny because we must retrieve all tuples and do our own
	 * time qual checks (because we have to index RECENTLY_DEAD tuples).  In
	 * a concurrent build, we take a regular MVCC snapshot and index
	 * whatever's live according to that.
	 */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());

	/*
	 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
	 * PARALLEL_KEY_TUPLESORT tuplesort workspace
	 */
	estbtshared = _bt_parallel_estimate_shared(snapshot);
	shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
	estsort = tuplesort_estimate_shared(scantuplesortstates);
	shm_toc_estimate_chunk(&pcxt->estimator, estsort);

	/*
	 * Unique case requires a second spool, and so we may have to account
	 * for another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
	 */
	if (!btspool->isunique)
		shm_toc_estimate_keys(&pcxt->estimator, 2);
	else
	{
		shm_toc_estimate_chunk(&pcxt->estimator, estsort);
		shm_toc_estimate_keys(&pcxt->estimator, 3);
	}

	/*
	 * Estimate space for BufferUsage -- PARALLEL_KEY_BUFFER_USAGE.
	 *
	 * If there are no extensions loaded that care, we could skip this.  We
	 * have no way of knowing whether anyone's looking at pgBufferUsage, so
	 * do it unconditionally.
	 */
	shm_toc_estimate_chunk(&pcxt->estimator,
						   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_estimate_keys(&pcxt->estimator, 1);

	/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
	if (debug_query_string)
	{
		querylen = strlen(debug_query_string);
		shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
		shm_toc_estimate_keys(&pcxt->estimator, 1);
	}
	else
		querylen = 0;			/* keep compiler quiet */

	/* Everyone's had a chance to ask for space, so now create the DSM */
	InitializeParallelDSM(pcxt);

	/* If no DSM segment was available, back out (do serial build) */
	if (pcxt->seg == NULL)
	{
		if (IsMVCCSnapshot(snapshot))
			UnregisterSnapshot(snapshot);
		DestroyParallelContext(pcxt);
		ExitParallelMode();
		return;
	}

	/* Store shared build state, for which we reserved space */
	btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
	/* Initialize immutable state */
	btshared->heaprelid = RelationGetRelid(btspool->heap);
	btshared->indexrelid = RelationGetRelid(btspool->index);
	btshared->isunique = btspool->isunique;
	btshared->isconcurrent = isconcurrent;
	btshared->scantuplesortstates = scantuplesortstates;
	ConditionVariableInit(&btshared->workersdonecv);
	SpinLockInit(&btshared->mutex);
	/* Initialize mutable state */
	btshared->nparticipantsdone = 0;
	btshared->reltuples = 0.0;
	btshared->havedead = false;
	btshared->indtuples = 0.0;
	btshared->brokenhotchain = false;
	heap_parallelscan_initialize(&btshared->heapdesc, btspool->heap, snapshot);

	/*
	 * Store shared tuplesort-private state, for which we reserved space.
	 * Then, initialize opaque state using tuplesort routine.
	 */
	sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
	tuplesort_initialize_shared(sharedsort, scantuplesortstates,
								pcxt->seg);

	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* Unique case requires a second spool, and associated shared state */
	if (!btspool->isunique)
		sharedsort2 = NULL;
	else
	{
		/*
		 * Store additional shared tuplesort-private state, for which we
		 * reserved space.  Then, initialize opaque state using tuplesort
		 * routine.
		 */
		sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
		tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
									pcxt->seg);

		shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
	}

	/* Store query string for workers */
	if (debug_query_string)
	{
		char	   *sharedquery;

		sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
		memcpy(sharedquery, debug_query_string, querylen + 1);
		shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
	}

	/* Allocate space for each worker's BufferUsage; no need to initialize */
	bufferusage = shm_toc_allocate(pcxt->toc,
								   mul_size(sizeof(BufferUsage), pcxt->nworkers));
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

	/* Launch workers, saving status for leader/caller */
	LaunchParallelWorkers(pcxt);
	btleader->pcxt = pcxt;
	btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
	if (leaderparticipates)
		btleader->nparticipanttuplesorts++;
	btleader->btshared = btshared;
	btleader->sharedsort = sharedsort;
	btleader->sharedsort2 = sharedsort2;
	btleader->snapshot = snapshot;
	btleader->bufferusage = bufferusage;

	/* If no workers were successfully launched, back out (do serial build) */
	if (pcxt->nworkers_launched == 0)
	{
		_bt_end_parallel(btleader);
		return;
	}

	/* Save leader state now that it's clear build will be parallel */
	buildstate->btleader = btleader;

	/* Join heap scan ourselves */
	if (leaderparticipates)
		_bt_leader_participate_as_worker(buildstate);

	/*
	 * Caller needs to wait for all launched workers when we return.  Make
	 * sure that the failure-to-start case will not hang forever.
	 */
	WaitForParallelWorkersToAttach(pcxt);
}

/*
 * Shut down workers, destroy parallel context, and end parallel mode.
 */
static void
_bt_end_parallel(BTLeader *btleader)
{
	int			i;

	/* Shutdown worker processes */
	WaitForParallelWorkersToFinish(btleader->pcxt);

	/*
	 * Next, accumulate buffer usage.  (This must wait for the workers to
	 * finish, or we might get incomplete data.)
	 */
	for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
		InstrAccumParallelQuery(&btleader->bufferusage[i]);

	/* Free last reference to MVCC snapshot, if one was used */
	if (IsMVCCSnapshot(btleader->snapshot))
		UnregisterSnapshot(btleader->snapshot);
	DestroyParallelContext(btleader->pcxt);
	ExitParallelMode();
}

/*
 * Returns size of shared memory required to store state for a parallel
 * btree index build based on the snapshot its parallel scan will use.
 */
static Size
_bt_parallel_estimate_shared(Snapshot snapshot)
{
	if (!IsMVCCSnapshot(snapshot))
	{
		Assert(snapshot == SnapshotAny);
		return sizeof(BTShared);
	}

	return add_size(offsetof(BTShared, heapdesc) +
					offsetof(ParallelHeapScanDescData, phs_snapshot_data),
					EstimateSnapshotSpace(snapshot));
}

/*
 * Within leader, wait for end of heap scan.
 *
 * When called, parallel heap scan started by _bt_begin_parallel() will
 * already be underway within worker processes (when leader participates as
 * a worker, we should end up here just as workers are finishing).
 *
 * Fills in fields needed for ambuild statistics, and lets caller set
 * field indicating that some worker encountered a broken HOT chain.
 *
 * Returns the total number of heap tuples scanned.
 */
static double
_bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
{
	BTShared   *btshared = buildstate->btleader->btshared;
	int			nparticipanttuplesorts;
	double		reltuples;

	nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
	for (;;)
	{
		SpinLockAcquire(&btshared->mutex);
		if (btshared->nparticipantsdone == nparticipanttuplesorts)
		{
			buildstate->havedead = btshared->havedead;
			buildstate->indtuples = btshared->indtuples;
			*brokenhotchain = btshared->brokenhotchain;
			reltuples = btshared->reltuples;
			SpinLockRelease(&btshared->mutex);
			break;
		}
		SpinLockRelease(&btshared->mutex);

		ConditionVariableSleep(&btshared->workersdonecv,
							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
	}

	ConditionVariableCancelSleep();

	return reltuples;
}

/*
 * Within leader, participate as a parallel worker.
 */
static void
_bt_leader_participate_as_worker(BTBuildState *buildstate)
{
	BTLeader   *btleader = buildstate->btleader;
	BTSpool    *leaderworker;
	BTSpool    *leaderworker2;
	int			sortmem;

	/* Allocate memory and initialize private spool */
	leaderworker = (BTSpool *) palloc0(sizeof(BTSpool));
	leaderworker->heap = buildstate->spool->heap;
	leaderworker->index = buildstate->spool->index;
	leaderworker->isunique = buildstate->spool->isunique;

	/* Initialize second spool, if required */
	if (!btleader->btshared->isunique)
		leaderworker2 = NULL;
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool));

		/* Initialize worker's own secondary spool */
		leaderworker2->heap = leaderworker->heap;
		leaderworker2->index = leaderworker->index;
		leaderworker2->isunique = false;
	}

	/*
	 * Might as well use reliable figure when doling out maintenance_work_mem
	 * (when requested number of workers were not launched, this will be
	 * somewhat higher than it is for other workers).
	 */
	sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
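	/*
	 * (Illustrative arithmetic only: with maintenance_work_mem set to 64MB
	 * and two launched workers plus the participating leader,
	 * nparticipanttuplesorts is 3 and each participant's tuplesort gets
	 * roughly 21MB to work with.)
	 */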
	/* Perform work common to all participants */
	_bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
							   btleader->sharedsort, btleader->sharedsort2,
							   sortmem);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */
}

/*
 * Perform work within a launched parallel process.
 */
void
_bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
{
	char	   *sharedquery;
	BTSpool    *btspool;
	BTSpool    *btspool2;
	BTShared   *btshared;
	Sharedsort *sharedsort;
	Sharedsort *sharedsort2;
	Relation	heapRel;
	Relation	indexRel;
	LOCKMODE	heapLockmode;
	LOCKMODE	indexLockmode;
	BufferUsage *bufferusage;
	int			sortmem;

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
		ResetUsage();
#endif							/* BTREE_BUILD_STATS */

	/* Set debug_query_string for individual workers first */
	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
	debug_query_string = sharedquery;

	/* Report the query string from leader */
	pgstat_report_activity(STATE_RUNNING, debug_query_string);

	/* Look up nbtree shared state */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);

	/* Open relations using lock modes known to be obtained by index.c */
	if (!btshared->isconcurrent)
	{
		heapLockmode = ShareLock;
		indexLockmode = AccessExclusiveLock;
	}
	else
	{
		heapLockmode = ShareUpdateExclusiveLock;
		indexLockmode = RowExclusiveLock;
	}

	/* Open relations within worker */
	heapRel = heap_open(btshared->heaprelid, heapLockmode);
	indexRel = index_open(btshared->indexrelid, indexLockmode);

	/* Initialize worker's own spool */
	btspool = (BTSpool *) palloc0(sizeof(BTSpool));
	btspool->heap = heapRel;
	btspool->index = indexRel;
	btspool->isunique = btshared->isunique;

	/* Look up shared state private to tuplesort.c */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (!btshared->isunique)
	{
		btspool2 = NULL;
		sharedsort2 = NULL;
	}
	else
	{
		/* Allocate memory for worker's own private secondary spool */
		btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));

		/* Initialize worker's own secondary spool */
		btspool2->heap = btspool->heap;
		btspool2->index = btspool->index;
		btspool2->isunique = false;
		/* Look up shared state private to tuplesort.c */
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}

	/* Prepare to track buffer usage during parallel execution */
	InstrStartParallelQuery();

	/* Perform sorting of spool, and possibly a spool2 */
	sortmem = maintenance_work_mem / btshared->scantuplesortstates;
	_bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
							   sharedsort2, sortmem);

	/* Report buffer usage during parallel execution */
	bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
	InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber]);

#ifdef BTREE_BUILD_STATS
	if (log_btree_build_stats)
	{
		ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
		ResetUsage();
	}
#endif							/* BTREE_BUILD_STATS */

	index_close(indexRel, indexLockmode);
	heap_close(heapRel, heapLockmode);
}

/*
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
 */
static void
_bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
						   BTShared *btshared, Sharedsort *sharedsort,
						   Sharedsort *sharedsort2, int sortmem)
{
	SortCoordinate coordinate;
	BTBuildState buildstate;
	HeapScanDesc scan;
	double		reltuples;
	IndexInfo  *indexInfo;

	/* Initialize local tuplesort coordination state */
	coordinate = palloc0(sizeof(SortCoordinateData));
	coordinate->isWorker = true;
	coordinate->nParticipants = -1;
	coordinate->sharedsort = sharedsort;

	/* Begin "partial" tuplesort */
	btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
													 btspool->index,
													 btspool->isunique,
													 sortmem, coordinate,
													 false);

	/*
	 * Just as with serial case, there may be a second spool.  If so, a
	 * second, dedicated spool2 partial tuplesort is required.
	 */
	if (btspool2)
	{
		SortCoordinate coordinate2;

		/*
		 * We expect that the second one (for dead tuples) won't get very
		 * full, so we give it only work_mem (unless sortmem is less for
		 * worker).  Worker processes are generally permitted to allocate
		 * work_mem independently.
		 */
		coordinate2 = palloc0(sizeof(SortCoordinateData));
		coordinate2->isWorker = true;
		coordinate2->nParticipants = -1;
		coordinate2->sharedsort = sharedsort2;
		btspool2->sortstate =
			tuplesort_begin_index_btree(btspool->heap, btspool->index, false,
										Min(sortmem, work_mem), coordinate2,
										false);
	}

	/* Fill in buildstate for _bt_build_callback() */
	buildstate.isunique = btshared->isunique;
	buildstate.havedead = false;
	buildstate.heap = btspool->heap;
	buildstate.spool = btspool;
	buildstate.spool2 = btspool2;
	buildstate.indtuples = 0;
	buildstate.btleader = NULL;

	/* Join parallel scan */
	indexInfo = BuildIndexInfo(btspool->index);
	indexInfo->ii_Concurrent = btshared->isconcurrent;
	scan = heap_beginscan_parallel(btspool->heap, &btshared->heapdesc);
	reltuples = IndexBuildHeapScan(btspool->heap, btspool->index, indexInfo,
								   true, _bt_build_callback,
								   (void *) &buildstate, scan);

	/*
	 * Execute this worker's part of the sort.
	 *
	 * Unlike leader and serial cases, we cannot avoid calling
	 * tuplesort_performsort() for spool2 if it ends up containing no dead
	 * tuples (this is disallowed for workers by tuplesort).
	 */
	tuplesort_performsort(btspool->sortstate);
	if (btspool2)
		tuplesort_performsort(btspool2->sortstate);

	/*
	 * Done.  Record ambuild statistics, and whether we encountered a broken
	 * HOT chain.
	 */
	SpinLockAcquire(&btshared->mutex);
	btshared->nparticipantsdone++;
	btshared->reltuples += reltuples;
	if (buildstate.havedead)
		btshared->havedead = true;
	btshared->indtuples += buildstate.indtuples;
	if (indexInfo->ii_BrokenHotChain)
		btshared->brokenhotchain = true;
	SpinLockRelease(&btshared->mutex);

	/* Notify leader */
	ConditionVariableSignal(&btshared->workersdonecv);

	/* We can end tuplesorts immediately */
	tuplesort_end(btspool->sortstate);
	if (btspool2)
		tuplesort_end(btspool2->sortstate);
}