/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"

/* Working state needed by btvacuumpage */
typedef struct
{
	IndexVacuumInfo *info;
	IndexBulkDeleteResult *stats;
	IndexBulkDeleteCallback callback;
	void	   *callback_state;
	BTCycleId	cycleid;
	BlockNumber lastBlockVacuumed;	/* highest blkno actually vacuumed */
	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
	BlockNumber totFreePages;	/* true total # of free pages */
	TransactionId oldestBtpoXact;	/* oldest btpo.xact among deleted pages */
	MemoryContext pagedelcontext;
} BTVacState;

/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
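
/*
 * Note: a scan normally cycles NOT_INITIALIZED -> ADVANCING -> IDLE ->
 * ADVANCING -> ... until some participant finds no more pages and moves
 * the state to DONE; with array keys, advancing to the next key
 * combination resets the state to NOT_INITIALIZED again (see
 * _bt_parallel_advance_array_keys below).
 */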

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
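
/*
 * This struct lives in the parallel scan's shared-memory area; each backend
 * locates it by adding ps_offset to the ParallelIndexScanDesc (see the
 * OffsetToPointer() calls below).
 */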


static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
			 BlockNumber orig_blkno);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->amvalidate = btvalidate;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}
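
/*
 * The server finds this handler through the "amhandler" column of the btree
 * row in pg_am; everything else in this file is reached only through the
 * function pointers returned here.
 */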

/*
 *	btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, P_NONE, 0);

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
			  (char *) metapage, true);
	log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
}

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, heapRel);

	pfree(itup);

	return result;
}
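
/*
 * Note: the result is of interest only when checkUnique is
 * UNIQUE_CHECK_PARTIAL; there, true means the new entry is known unique,
 * while false means uniqueness must be rechecked later.
 */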

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times. It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxIndexTuplesPerPage * sizeof(int));
				if (so->numKilled < MaxIndexTuplesPerPage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_ctup.t_self;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}
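
/*
 * Unlike btgettuple, a bitmap scan always runs in the forward direction and
 * gets no kill_prior_tuple feedback from the executor, which is why the
 * loop above is so much simpler.
 */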

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
	 *	  - vadim 05/05/97
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}
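
/*
 * Note: _bt_preprocess_array_keys sets so->numArrayKeys negative if it
 * detects that the array keys can never be satisfied; btgettuple and
 * btgetbitmap test for that and end the scan immediately.
 */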

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
			/* copy the fixed fields plus entries items[0..lastItem] */
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *	btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
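
/*
 * A sketch of the typical call pattern, as used by _bt_first() and the
 * page-stepping code in nbtsearch.c:
 *
 *		if (!_bt_parallel_seize(scan, &blkno))
 *			stop, no more pages for this set of array keys;
 *		read page blkno and note its right sibling;
 *		_bt_parallel_release(scan, next_blkno);	-- or _bt_parallel_done()
 */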

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *		keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}
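
/*
 * Note that only the first process to observe BTPARALLEL_DONE resets the
 * shared state and bumps btps_arrayKeyCount; a process whose local
 * arrayKeyCount lags behind simply fails to seize the scan (see
 * _bt_parallel_seize) until it too has advanced its array keys.
 */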

/*
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
 *
 * Called by btvacuumcleanup when btbulkdelete was never called because no
 * tuples need to be deleted.
 *
 * When we return false, VACUUM can even skip the cleanup-only call to
 * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
 * all).  Otherwise, a cleanup-only btvacuumscan call is required.
 */
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
{
	Buffer		metabuf;
	Page		metapg;
	BTMetaPageData *metad;
	bool		result = false;

	metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metad = BTPageGetMeta(metapg);

	if (metad->btm_version < BTREE_VERSION)
	{
		/*
		 * Do cleanup if metapage needs upgrade, because we don't have
		 * cleanup-related meta-information yet.
		 */
		result = true;
	}
	else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
			 TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
								   RecentGlobalXmin))
	{
		/*
		 * If any oldest btpo.xact from a previously deleted page in the index
		 * is older than RecentGlobalXmin, then at least one deleted page can
		 * be recycled -- don't skip cleanup.
		 */
		result = true;
	}
	else
	{
		StdRdOptions *relopts;
		float8		cleanup_scale_factor;
		float8		prev_num_heap_tuples;

		/*
		 * If the table receives enough insertions and no cleanup is
		 * performed, the index would appear to have stale statistics.  If
		 * the scale factor is set, we avoid that by performing cleanup when
		 * the number of inserted tuples exceeds the
		 * vacuum_cleanup_index_scale_factor fraction of the original tuple
		 * count.
		 */
		relopts = (StdRdOptions *) info->index->rd_options;
		cleanup_scale_factor = (relopts &&
								relopts->vacuum_cleanup_index_scale_factor >= 0)
			? relopts->vacuum_cleanup_index_scale_factor
			: vacuum_cleanup_index_scale_factor;
		prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;

		if (cleanup_scale_factor <= 0 ||
			prev_num_heap_tuples <= 0 ||
			(info->num_heap_tuples - prev_num_heap_tuples) /
			prev_num_heap_tuples >= cleanup_scale_factor)
			result = true;
	}

	_bt_relbuf(info->index, metabuf);
	return result;
}
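
/*
 * For example, with vacuum_cleanup_index_scale_factor = 0.1 and a heap tuple
 * count of 1,000,000 recorded at the last cleanup, a cleanup-only scan is
 * forced once the current count reaches 1,100,000, since
 * (1100000 - 1000000) / 1000000 >= 0.1.
 */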

/*
 *	Bulk deletion of all index entries pointing to a set of heap tuples.
 *	The set of target tuples is specified via a callback routine that tells
 *	whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 *	Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}

/*
 *	Post-VACUUM cleanup.
 *
 *	Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything, just return the
	 * stats from the latest btbulkdelete call.  If it wasn't called, we might
	 * still need to do a pass over the index, to recycle any newly-recyclable
	 * pages or to obtain index statistics.  _bt_vacuum_needs_cleanup
	 * determines if either is needed.
	 *
	 * Since we aren't going to actually delete any leaf items, there's no
	 * need to go through all the vacuum-cycle-ID pushups.
	 */
	if (stats == NULL)
	{
		/* Check if we need a cleanup */
		if (!_bt_vacuum_needs_cleanup(info))
			return NULL;

		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0);
	}

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}
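
/*
 * Note: returning NULL above tells the caller that no index scan took place
 * and that there are no statistics to report, so VACUUM leaves the index's
 * existing pg_class statistics alone.
 */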

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
 * is maintained.
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber blkno;
	bool		needLock;

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	stats->estimated_count = false;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;
	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
	vstate.lastBlockLocked = BTREE_METAPAGE;
	vstate.totFreePages = 0;
	vstate.oldestBtpoXact = InvalidTransactionId;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  (See also vacuumlazy.c, which has the same
	 * issue.)  Also, we need not worry if a page is added immediately after
	 * we look; the page splitting code already has write-lock on the left
	 * page before it adds a right page, so we must already have processed any
	 * tuples due to be moved into such a page.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	blkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
		{
			btvacuumpage(&vstate, blkno, blkno);
		}
	}

	/*
	 * Check to see if we need to issue one final WAL record for this index,
	 * which may be needed for correctness on a hot standby node when non-MVCC
	 * index scans could take place.
	 *
	 * If the WAL is replayed in hot standby, the replay process needs to get
	 * cleanup locks on all index leaf pages, just as we've been doing here.
	 * However, we won't issue any WAL records about pages that have no items
	 * to be deleted.  For pages between pages we've vacuumed, the replay code
	 * will take locks under the direction of the lastBlockVacuumed fields in
	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
	 * against the last leaf page in the index, if that one wasn't vacuumed.
	 */
	if (XLogStandbyInfoActive() &&
		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
	{
		Buffer		buf;

		/*
		 * The page should be valid, but we can't use _bt_getbuf() because we
		 * want to use a nondefault buffer access strategy.  Since we aren't
		 * going to delete any items, getting cleanup lock again is probably
		 * overkill, but for consistency do that anyway.
		 */
		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
								 RBM_NORMAL, info->strategy);
		LockBufferForCleanup(buf);
		_bt_checkpage(rel, buf);
		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
		_bt_relbuf(rel, buf);
	}

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If we found any recyclable pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no recyclable pages exist, we don't bother vacuuming the
	 * FSM at all.
	 */
	if (vstate.totFreePages > 0)
		IndexFreeSpaceMapVacuum(rel);

	/*
	 * Maintain the oldest btpo.xact and a count of the current number of heap
	 * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
	 *
	 * The page with the oldest btpo.xact is typically a page deleted by this
	 * VACUUM operation, since pages deleted by a previous VACUUM operation
	 * tend to be placed in the FSM (by the current VACUUM operation) -- such
	 * pages are not candidates to be the oldest btpo.xact.  (Note that pages
	 * placed in the FSM are reported as deleted pages in the bulk delete
	 * statistics, despite not counting as deleted pages for the purposes of
	 * determining the oldest btpo.xact.)
	 */
	_bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
								 info->num_heap_tuples);

	/* update statistics */
	stats->num_pages = num_pages;
	stats->pages_free = vstate.totFreePages;
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;

		/* Maintain the oldest btpo.xact */
		if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
			TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
			vstate->oldestBtpoXact = opaque->btpo.xact;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/*
		 * Half-dead leaf page.  Try to delete now.  Might update
		 * oldestBtpoXact and pages_deleted below.
		 */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts.  That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan.  When vacuum starts it derives a
				 * value of OldestXmin.  Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends.  The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not.  If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts.  If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records.  This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information the replay code needs to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page.  This ensures that WAL replay locks all leaf pages
			 * at some point, which is important should non-MVCC scans be
			 * requested.  This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples.  We don't delete when recursing, though, to avoid
		 * putting entries into freePages out-of-order (doesn't seem worth any
		 * extra code to handle the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * We trust the _bt_pagedel return value because it does not include
		 * any page that a future call here from btvacuumscan is expected to
		 * count.  There will be no double-counting.
		 */
		stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array).  A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}

/*
 * btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}