/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"


/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;


static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
										  IndexTuple posting,
										  OffsetNumber updatedoffset,
										  int *nremaining);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amoptsprocnum = BTOPTIONS_PROC;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->ambuildphasename = btbuildphasename;
	amroutine->amvalidate = btvalidate;
	amroutine->amadjustmembers = btadjustmembers;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}

/*
 * btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 * btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 bool indexUnchanged,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

	pfree(itup);

	return result;
}

/*
 * btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later.  (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times.  It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxTIDsPerBTreePage * sizeof(int));
				if (so->numKilled < MaxTIDsPerBTreePage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

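			/* Loop over the remaining matching tuples on this and later pages */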
			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 * btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 * btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

/*
 * btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 * btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 * btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
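			/*
			 * Copy the position header plus entries items[0..lastItem] from
			 * markPos into currPos; the items array is the struct's last
			 * member, so this covers everything that is in use.
			 */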
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 * btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

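	/*
	 * Loop, sleeping on the condition variable as needed, until we either
	 * seize the scan ourselves or find that there is nothing left to do for
	 * the current set of scan keys.
	 */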
	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *		keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
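	/*
	 * If the shared scan has finished with the current set of array keys,
	 * reset it so that the next set can be processed, and advance the shared
	 * array key count to match.
	 */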
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	BlockNumber num_delpages;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything (we just maintain
	 * the information used within _bt_vacuum_needs_cleanup() by calling
	 * _bt_set_cleanup_info() below).
	 *
	 * If btbulkdelete was _not_ called, then we have a choice to make: we
	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
	 * now.
	 */
	if (stats == NULL)
	{
		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
		if (!_bt_vacuum_needs_cleanup(info->index))
			return NULL;

		/*
		 * Since we aren't going to actually delete any leaf items, there's no
		 * need to go through all the vacuum-cycle-ID pushups here.
		 *
		 * Posting list tuples are a source of inaccuracy for cleanup-only
		 * scans.  btvacuumscan() will assume that the number of index tuples
		 * from each page can be used as num_index_tuples, even though
		 * num_index_tuples is supposed to represent the number of TIDs in the
		 * index.  This naive approach can underestimate the number of tuples
		 * in the index significantly.
		 *
		 * We handle the problem by making num_index_tuples an estimate in
		 * cleanup-only case.
		 */
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0);
		stats->estimated_count = true;
	}

	/*
	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
	 *
	 * num_delpages is the number of deleted pages now in the index that were
	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
	 * greater than 0 only when _bt_pagedel() actually deleted pages during
	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
	 * have failed to place any newly deleted pages in the FSM just moments
	 * ago.  (Actually, there are edge cases where recycling of the current
	 * VACUUM's newly deleted pages does not even become safe by the time the
	 * next VACUUM comes around.  See nbtree/README.)
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, num_delpages);

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber scanblkno;
	bool		needLock;

	/*
	 * Reset fields that track information about the entire index now.  This
	 * avoids double-counting in the case where a single VACUUM command
	 * requires multiple scans of the index.
	 *
	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
	 * since they track information about the VACUUM command, and so must last
	 * across each call to btvacuumscan().
	 *
	 * (Note that pages_free is treated as state about the whole index, not
	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
	 * calls are idempotent, and get repeated for the same deleted pages in
	 * some scenarios.  The point for us is to track the number of recyclable
	 * pages in the index at the end of the VACUUM command.)
	 */
	stats->num_pages = 0;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;
	stats->pages_free = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
	vstate.bufsize = 0;
	vstate.maxbufsize = 0;
	vstate.pendingpages = NULL;
	vstate.npendingpages = 0;
	/* Consider applying _bt_pendingfsm_finalize optimization */
	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	scanblkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (scanblkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; scanblkno < num_pages; scanblkno++)
		{
			btvacuumpage(&vstate, scanblkno);
			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 scanblkno);
		}
	}

	/* Set statistics num_pages field to final size of index */
	stats->num_pages = num_pages;

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If there were any calls to _bt_pagedel() during scan of the index then
	 * see if any of the resulting pages can be placed in the FSM now.  When
	 * it's not safe we'll have to leave it up to a future VACUUM operation.
	 *
	 * Finally, if we placed any pages in the FSM (either just now or during
	 * the scan), forcibly update the upper-level FSM pages to ensure that
	 * searchers can find them.
	 */
	_bt_pendingfsm_finalize(rel, &vstate);
	if (stats->pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were the scanblkno during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		attempt_pagedel;
	BlockNumber blkno,
				backtrack_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

	blkno = scanblkno;

backtrack:

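	/*
	 * Process block blkno: either the scanblkno we were called with or a page
	 * we have decided to backtrack to.  Reset the per-page state first.
	 */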
	attempt_pagedel = false;
	backtrack_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	_bt_lockbuf(rel, buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = NULL;
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	Assert(blkno <= scanblkno);
	if (blkno != scanblkno)
	{
		/*
		 * We're backtracking.
		 *
		 * We followed a right link to a sibling leaf page (a page that
		 * happens to be from a block located before scanblkno).  The only
		 * case we want to do anything with is a live leaf page having the
		 * current vacuum cycle ID.
		 *
		 * The page had better be in a state that's consistent with what we
		 * expect.  Check for conditions that imply corruption in passing.  It
		 * can't be half-dead because only an interrupted VACUUM process can
		 * leave pages in that state, so we'd definitely have dealt with it
		 * back when the page was the scanblkno page (half-dead pages are
		 * always marked fully deleted by _bt_pagedel()).  This assumes that
		 * there can be only one vacuum process running at a time.
		 */
		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
		{
			Assert(false);
			ereport(LOG,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
									 blkno, scanblkno, RelationGetRelationName(rel))));
			_bt_relbuf(rel, buf);
			return;
		}

		/*
		 * We may have already processed the page in an earlier call, when the
		 * page was scanblkno.  This happens when the leaf page split occurred
		 * after the scan began, but before the right sibling page became the
		 * scanblkno.
		 *
		 * Page may also have been deleted by current btvacuumpage() call,
		 * since _bt_pagedel() sometimes deletes the right sibling page of
		 * scanblkno in passing (it does so after we decided where to
		 * backtrack to).  We don't need to process this page as a deleted
		 * page a second time now (in fact, it would be wrong to count it as a
		 * deleted page in the bulk delete statistics a second time).
		 */
		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
		{
			/* Done with current scanblkno (and all lower split pages) */
			_bt_relbuf(rel, buf);
			return;
		}
	}

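	/*
	 * Decide what to do with the page: recycle it, count it as already
	 * deleted, finish an interrupted deletion, or vacuum its leaf tuples.
	 */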
	if (!opaque || BTPageIsRecyclable(page))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_deleted++;
		stats->pages_free++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
		attempt_pagedel = true;

		/*
		 * _bt_pagedel() will increment both pages_newly_deleted and
		 * pages_deleted stats in all cases (barring corruption)
		 */
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxIndexTuplesPerPage];
		int			ndeletable;
		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
		int			nupdatable;
		OffsetNumber offnum,
					minoff,
					maxoff;
		int			nhtidsdead,
					nhtidslive;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		_bt_upgradelockbufcleanup(rel, buf);

		/*
		 * Check whether we need to backtrack to earlier pages.  What we are
		 * concerned about is a page split that happened since we started the
		 * vacuum scan.  If the split moved tuples on the right half of the
		 * split (i.e. the tuples that sort high) to a block that we already
		 * passed over, then we might have missed the tuples.  We need to
		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
		 * or deleting scanblkno page below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < scanblkno)
			backtrack_to = opaque->btpo_next;

		/*
		 * When each VACUUM begins, it determines an OldestXmin cutoff value.
		 * Tuples before the cutoff are removed by VACUUM.  Scan over all
		 * items to see which ones need to be deleted according to cutoff
		 * point using callback.
		 */
		ndeletable = 0;
		nupdatable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		nhtidsdead = 0;
		nhtidslive = 0;
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				/*
				 * Hot Standby assumes that it's okay that XLOG_BTREE_VACUUM
				 * records do not produce their own conflicts.  This is safe
				 * as long as the callback function only considers whether the
				 * index tuple refers to pre-cutoff heap tuples that were
				 * certainly already pruned away during VACUUM's initial heap
				 * scan by the time we get here.  (heapam's XLOG_HEAP2_PRUNE
				 * records produce conflicts using a latestRemovedXid value
				 * for the pointed-to heap tuples, so there is no need to
				 * produce our own conflict now.)
				 *
				 * Backends with snapshots acquired after a VACUUM starts but
				 * before it finishes could have visibility cutoff with a
				 * later xid than VACUUM's OldestXmin cutoff.  These backends
				 * might happen to opportunistically mark some index tuples
				 * LP_DEAD before we reach them, even though they may be after
				 * our cutoff.  We don't try to kill these "extra" index
				 * tuples in _bt_delitems_vacuum().  This keeps things simple,
				 * and allows us to always avoid generating our own conflicts.
				 */
				Assert(!BTreeTupleIsPivot(itup));
				if (!BTreeTupleIsPosting(itup))
				{
					/* Regular tuple, standard table TID representation */
					if (callback(&itup->t_tid, callback_state))
					{
						deletable[ndeletable++] = offnum;
						nhtidsdead++;
					}
					else
						nhtidslive++;
				}
				else
				{
					BTVacuumPosting vacposting;
					int			nremaining;

					/* Posting list tuple */
					vacposting = btreevacuumposting(vstate, itup, offnum,
													&nremaining);
					if (vacposting == NULL)
					{
						/*
						 * All table TIDs from the posting tuple remain, so no
						 * delete or update required
						 */
						Assert(nremaining == BTreeTupleGetNPosting(itup));
					}
					else if (nremaining > 0)
					{

						/*
						 * Store metadata about posting list tuple in
						 * updatable array for entire page.  Existing tuple
						 * will be updated during the later call to
						 * _bt_delitems_vacuum().
						 */
						Assert(nremaining < BTreeTupleGetNPosting(itup));
						updatable[nupdatable++] = vacposting;
						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
					}
					else
					{
						/*
						 * All table TIDs from the posting list must be
						 * deleted.  We'll delete the index tuple completely
						 * (no update required).
						 */
						Assert(nremaining == 0);
						deletable[ndeletable++] = offnum;
						nhtidsdead += BTreeTupleGetNPosting(itup);
						pfree(vacposting);
					}

					nhtidslive += nremaining;
				}
			}
		}

		/*
		 * Apply any needed deletes or updates.  We issue just one
		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0 || nupdatable > 0)
		{
			Assert(nhtidsdead >= ndeletable + nupdatable);
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
								nupdatable);

			stats->tuples_removed += nhtidsdead;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);

			/* can't leak memory here */
			for (int i = 0; i < nupdatable; i++)
				pfree(updatable[i]);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			Assert(nhtidsdead == 0);
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples (live table TIDs in posting lists are counted as
		 * separate live tuples).  We don't delete when backtracking, though,
		 * since that would require teaching _bt_pagedel() about backtracking
		 * (doesn't seem worth adding more complexity to deal with that).
		 *
		 * We don't count the number of live TIDs during cleanup-only calls to
		 * btvacuumscan (i.e. when callback is not set).  We count the number
		 * of index tuples directly instead.  This avoids the expense of
		 * directly examining all of the tuples on each page.  VACUUM will
		 * treat num_index_tuples as an estimate in cleanup-only case, so it
		 * doesn't matter that this underestimates num_index_tuples
		 * significantly in some cases.
		 */
		if (minoff > maxoff)
			attempt_pagedel = (blkno == scanblkno);
		else if (callback)
			stats->num_index_tuples += nhtidslive;
		else
			stats->num_index_tuples += maxoff - minoff + 1;

		Assert(!attempt_pagedel || nhtidslive == 0);
	}

	if (attempt_pagedel)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * _bt_pagedel maintains the bulk delete stats on our behalf;
		 * pages_newly_deleted and pages_deleted are likely to be incremented
		 * during call
		 */
		Assert(blkno == scanblkno);
		_bt_pagedel(rel, buf, vstate);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	if (backtrack_to != P_NONE)
	{
		blkno = backtrack_to;
		goto backtrack;
	}
}

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build replacement tuple without the TIDs
 * that VACUUM needs to delete.  Returned value is NULL in the common case
 * where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
				   OffsetNumber updatedoffset, int *nremaining)
{
	int			live = 0;
	int			nitem = BTreeTupleGetNPosting(posting);
	ItemPointer items = BTreeTupleGetPosting(posting);
	BTVacuumPosting vacposting = NULL;

	for (int i = 0; i < nitem; i++)
	{
		if (!vstate->callback(items + i, vstate->callback_state))
		{
			/* Live table TID */
			live++;
		}
		else if (vacposting == NULL)
		{
			/*
			 * First dead table TID encountered.
			 *
			 * It's now clear that we need to delete one or more dead table
			 * TIDs, so start maintaining metadata describing how to update
			 * existing posting list tuple.
			 */
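			/* Size the deletetids array for the worst case: every TID dead */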
			vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
								nitem * sizeof(uint16));

			vacposting->itup = posting;
			vacposting->updatedoffset = updatedoffset;
			vacposting->ndeletedtids = 0;
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
		else
		{
			/* Second or subsequent dead table TID */
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
	}

	*nremaining = live;
	return vacposting;
}

/*
 * btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}