1 /*------------------------------------------------------------------------- 2 * 3 * gist_private.h 4 * private declarations for GiST -- declarations related to the 5 * internal implementation of GiST, not the public API 6 * 7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/access/gist_private.h 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #ifndef GIST_PRIVATE_H 15 #define GIST_PRIVATE_H 16 17 #include "access/amapi.h" 18 #include "access/gist.h" 19 #include "access/itup.h" 20 #include "fmgr.h" 21 #include "lib/pairingheap.h" 22 #include "storage/bufmgr.h" 23 #include "storage/buffile.h" 24 #include "utils/hsearch.h" 25 #include "access/genam.h" 26 27 /* 28 * Maximum number of "halves" a page can be split into in one operation. 29 * Typically a split produces 2 halves, but can be more if keys have very 30 * different lengths, or when inserting multiple keys in one operation (as 31 * when inserting downlinks to an internal node). There is no theoretical 32 * limit on this, but in practice if you get more than a handful page halves 33 * in one split, there's something wrong with the opclass implementation. 34 * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some 35 * local arrays used during split. Note that there is also a limit on the 36 * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS, 37 * so if you raise this higher than that limit, you'll just get a different 38 * error. 39 */ 40 #define GIST_MAX_SPLIT_PAGES 75 41 42 /* Buffer lock modes */ 43 #define GIST_SHARE BUFFER_LOCK_SHARE 44 #define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE 45 #define GIST_UNLOCK BUFFER_LOCK_UNLOCK 46 47 typedef struct 48 { 49 BlockNumber prev; 50 uint32 freespace; 51 char tupledata[FLEXIBLE_ARRAY_MEMBER]; 52 } GISTNodeBufferPage; 53 54 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) 55 /* Returns free space in node buffer page */ 56 #define PAGE_FREE_SPACE(nbp) (nbp->freespace) 57 /* Checks if node buffer page is empty */ 58 #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET) 59 /* Checks if node buffers page don't contain sufficient space for index tuple */ 60 #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \ 61 MAXALIGN(IndexTupleSize(itup))) 62 63 /* 64 * GISTSTATE: information needed for any GiST index operation 65 * 66 * This struct retains call info for the index's opclass-specific support 67 * functions (per index column), plus the index's tuple descriptor. 68 * 69 * scanCxt holds the GISTSTATE itself as well as any data that lives for the 70 * lifetime of the index operation. We pass this to the support functions 71 * via fn_mcxt, so that they can store scan-lifespan data in it. The 72 * functions are invoked in tempCxt, which is typically short-lifespan 73 * (that is, it's reset after each tuple). However, tempCxt can be the same 74 * as scanCxt if we're not bothering with per-tuple context resets. 75 */ 76 typedef struct GISTSTATE 77 { 78 MemoryContext scanCxt; /* context for scan-lifespan data */ 79 MemoryContext tempCxt; /* short-term context for calling functions */ 80 81 TupleDesc leafTupdesc; /* index's tuple descriptor */ 82 TupleDesc nonLeafTupdesc; /* truncated tuple descriptor for non-leaf 83 * pages */ 84 TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an 85 * index-only scan */ 86 87 FmgrInfo consistentFn[INDEX_MAX_KEYS]; 88 FmgrInfo unionFn[INDEX_MAX_KEYS]; 89 FmgrInfo compressFn[INDEX_MAX_KEYS]; 90 FmgrInfo decompressFn[INDEX_MAX_KEYS]; 91 FmgrInfo penaltyFn[INDEX_MAX_KEYS]; 92 FmgrInfo picksplitFn[INDEX_MAX_KEYS]; 93 FmgrInfo equalFn[INDEX_MAX_KEYS]; 94 FmgrInfo distanceFn[INDEX_MAX_KEYS]; 95 FmgrInfo fetchFn[INDEX_MAX_KEYS]; 96 97 /* Collations to pass to the support functions */ 98 Oid supportCollation[INDEX_MAX_KEYS]; 99 } GISTSTATE; 100 101 102 /* 103 * During a GiST index search, we must maintain a queue of unvisited items, 104 * which can be either individual heap tuples or whole index pages. If it 105 * is an ordered search, the unvisited items should be visited in distance 106 * order. Unvisited items at the same distance should be visited in 107 * depth-first order, that is heap items first, then lower index pages, then 108 * upper index pages; this rule avoids doing extra work during a search that 109 * ends early due to LIMIT. 110 * 111 * To perform an ordered search, we use a pairing heap to manage the 112 * distance-order queue. In a non-ordered search (no order-by operators), 113 * we use it to return heap tuples before unvisited index pages, to 114 * ensure depth-first order, but all entries are otherwise considered 115 * equal. 116 */ 117 118 /* Individual heap tuple to be visited */ 119 typedef struct GISTSearchHeapItem 120 { 121 ItemPointerData heapPtr; 122 bool recheck; /* T if quals must be rechecked */ 123 bool recheckDistances; /* T if distances must be rechecked */ 124 HeapTuple recontup; /* data reconstructed from the index, used in 125 * index-only scans */ 126 OffsetNumber offnum; /* track offset in page to mark tuple as 127 * LP_DEAD */ 128 } GISTSearchHeapItem; 129 130 /* Unvisited item, either index page or heap tuple */ 131 typedef struct GISTSearchItem 132 { 133 pairingheap_node phNode; 134 BlockNumber blkno; /* index page number, or InvalidBlockNumber */ 135 union 136 { 137 GistNSN parentlsn; /* parent page's LSN, if index page */ 138 /* we must store parentlsn to detect whether a split occurred */ 139 GISTSearchHeapItem heap; /* heap info, if heap tuple */ 140 } data; 141 142 /* numberOfOrderBys entries */ 143 IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; 144 } GISTSearchItem; 145 146 #define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber) 147 148 #define SizeOfGISTSearchItem(n_distances) \ 149 (offsetof(GISTSearchItem, distances) + \ 150 sizeof(IndexOrderByDistance) * (n_distances)) 151 152 /* 153 * GISTScanOpaqueData: private state for a scan of a GiST index 154 */ 155 typedef struct GISTScanOpaqueData 156 { 157 GISTSTATE *giststate; /* index information, see above */ 158 Oid *orderByTypes; /* datatypes of ORDER BY expressions */ 159 160 pairingheap *queue; /* queue of unvisited items */ 161 MemoryContext queueCxt; /* context holding the queue */ 162 bool qual_ok; /* false if qual can never be satisfied */ 163 bool firstCall; /* true until first gistgettuple call */ 164 165 /* pre-allocated workspace arrays */ 166 IndexOrderByDistance *distances; /* output area for gistindex_keytest */ 167 168 /* info about killed items if any (killedItems is NULL if never used) */ 169 OffsetNumber *killedItems; /* offset numbers of killed items */ 170 int numKilled; /* number of currently stored items */ 171 BlockNumber curBlkno; /* current number of block */ 172 GistNSN curPageLSN; /* pos in the WAL stream when page was read */ 173 174 /* In a non-ordered search, returnable heap items are stored here: */ 175 GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; 176 OffsetNumber nPageData; /* number of valid items in array */ 177 OffsetNumber curPageData; /* next item to return */ 178 MemoryContext pageDataCxt; /* context holding the fetched tuples, for 179 * index-only scans */ 180 } GISTScanOpaqueData; 181 182 typedef GISTScanOpaqueData *GISTScanOpaque; 183 184 /* despite the name, gistxlogPage is not part of any xlog record */ 185 typedef struct gistxlogPage 186 { 187 BlockNumber blkno; 188 int num; /* number of index tuples following */ 189 } gistxlogPage; 190 191 /* SplitedPageLayout - gistSplit function result */ 192 typedef struct SplitedPageLayout 193 { 194 gistxlogPage block; 195 IndexTupleData *list; 196 int lenlist; 197 IndexTuple itup; /* union key for page */ 198 Page page; /* to operate */ 199 Buffer buffer; /* to write after all proceed */ 200 201 struct SplitedPageLayout *next; 202 } SplitedPageLayout; 203 204 /* 205 * GISTInsertStack used for locking buffers and transfer arguments during 206 * insertion 207 */ 208 typedef struct GISTInsertStack 209 { 210 /* current page */ 211 BlockNumber blkno; 212 Buffer buffer; 213 Page page; 214 215 /* 216 * log sequence number from page->lsn to recognize page update and compare 217 * it with page's nsn to recognize page split 218 */ 219 GistNSN lsn; 220 221 /* 222 * If set, we split the page while descending the tree to find an 223 * insertion target. It means that we need to retry from the parent, 224 * because the downlink of this page might no longer cover the new key. 225 */ 226 bool retry_from_parent; 227 228 /* offset of the downlink in the parent page, that points to this page */ 229 OffsetNumber downlinkoffnum; 230 231 /* pointer to parent */ 232 struct GISTInsertStack *parent; 233 } GISTInsertStack; 234 235 /* Working state and results for multi-column split logic in gistsplit.c */ 236 typedef struct GistSplitVector 237 { 238 GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */ 239 240 Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in 241 * splitVector.spl_left */ 242 bool spl_lisnull[INDEX_MAX_KEYS]; 243 244 Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in 245 * splitVector.spl_right */ 246 bool spl_risnull[INDEX_MAX_KEYS]; 247 248 bool *spl_dontcare; /* flags tuples which could go to either side 249 * of the split for zero penalty */ 250 } GistSplitVector; 251 252 typedef struct 253 { 254 Relation r; 255 Relation heapRel; 256 Size freespace; /* free space to be left */ 257 bool is_build; 258 259 GISTInsertStack *stack; 260 } GISTInsertState; 261 262 /* root page of a gist index */ 263 #define GIST_ROOT_BLKNO 0 264 265 /* 266 * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on 267 * inner pages to finish crash recovery of incomplete page splits. If a crash 268 * happened in the middle of a page split, so that the downlink pointers were 269 * not yet inserted, crash recovery inserted a special downlink pointer. The 270 * semantics of an invalid tuple was that it if you encounter one in a scan, 271 * it must always be followed, because we don't know if the tuples on the 272 * child page match or not. 273 * 274 * We no longer create such invalid tuples, we now mark the left-half of such 275 * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the 276 * split properly the next time we need to insert on that page. To retain 277 * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as 278 * the offset number of all inner tuples. If we encounter any invalid tuples 279 * with 0xfffe during insertion, we throw an error, though scans still handle 280 * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1 281 * gist index which already has invalid tuples in it because of a crash. That 282 * should be rare, and you are recommended to REINDEX anyway if you have any 283 * invalid tuples in an index, so throwing an error is as far as we go with 284 * supporting that. 285 */ 286 #define TUPLE_IS_VALID 0xffff 287 #define TUPLE_IS_INVALID 0xfffe 288 289 #define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID ) 290 #define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID ) 291 292 293 294 295 /* 296 * A buffer attached to an internal node, used when building an index in 297 * buffering mode. 298 */ 299 typedef struct 300 { 301 BlockNumber nodeBlocknum; /* index block # this buffer is for */ 302 int32 blocksCount; /* current # of blocks occupied by buffer */ 303 304 BlockNumber pageBlocknum; /* temporary file block # */ 305 GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */ 306 307 /* is this buffer queued for emptying? */ 308 bool queuedForEmptying; 309 310 /* is this a temporary copy, not in the hash table? */ 311 bool isTemp; 312 313 int level; /* 0 == leaf */ 314 } GISTNodeBuffer; 315 316 /* 317 * Does specified level have buffers? (Beware of multiple evaluation of 318 * arguments.) 319 */ 320 #define LEVEL_HAS_BUFFERS(nlevel, gfbb) \ 321 ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \ 322 (nlevel) != (gfbb)->rootlevel) 323 324 /* Is specified buffer at least half-filled (should be queued for emptying)? */ 325 #define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \ 326 ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2) 327 328 /* 329 * Is specified buffer full? Our buffers can actually grow indefinitely, 330 * beyond the "maximum" size, so this just means whether the buffer has grown 331 * beyond the nominal maximum size. 332 */ 333 #define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \ 334 ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer) 335 336 /* 337 * Data structure with general information about build buffers. 338 */ 339 typedef struct GISTBuildBuffers 340 { 341 /* Persistent memory context for the buffers and metadata. */ 342 MemoryContext context; 343 344 BufFile *pfile; /* Temporary file to store buffers in */ 345 long nFileBlocks; /* Current size of the temporary file */ 346 347 /* 348 * resizable array of free blocks. 349 */ 350 long *freeBlocks; 351 int nFreeBlocks; /* # of currently free blocks in the array */ 352 int freeBlocksLen; /* current allocated length of the array */ 353 354 /* Hash for buffers by block number */ 355 HTAB *nodeBuffersTab; 356 357 /* List of buffers scheduled for emptying */ 358 List *bufferEmptyingQueue; 359 360 /* 361 * Parameters to the buffering build algorithm. levelStep determines which 362 * levels in the tree have buffers, and pagesPerBuffer determines how 363 * large each buffer is. 364 */ 365 int levelStep; 366 int pagesPerBuffer; 367 368 /* Array of lists of buffers on each level, for final emptying */ 369 List **buffersOnLevels; 370 int buffersOnLevelsLen; 371 372 /* 373 * Dynamically-sized array of buffers that currently have their last page 374 * loaded in main memory. 375 */ 376 GISTNodeBuffer **loadedBuffers; 377 int loadedBuffersCount; /* # of entries in loadedBuffers */ 378 int loadedBuffersLen; /* allocated size of loadedBuffers */ 379 380 /* Level of the current root node (= height of the index tree - 1) */ 381 int rootlevel; 382 } GISTBuildBuffers; 383 384 /* 385 * Storage type for GiST's reloptions 386 */ 387 typedef struct GiSTOptions 388 { 389 int32 vl_len_; /* varlena header (do not touch directly!) */ 390 int fillfactor; /* page fill factor in percent (0..100) */ 391 int bufferingModeOffset; /* use buffering build? */ 392 } GiSTOptions; 393 394 /* gist.c */ 395 extern void gistbuildempty(Relation index); 396 extern bool gistinsert(Relation r, Datum *values, bool *isnull, 397 ItemPointer ht_ctid, Relation heapRel, 398 IndexUniqueCheck checkUnique, 399 struct IndexInfo *indexInfo); 400 extern MemoryContext createTempGistContext(void); 401 extern GISTSTATE *initGISTstate(Relation index); 402 extern void freeGISTstate(GISTSTATE *giststate); 403 extern void gistdoinsert(Relation r, 404 IndexTuple itup, 405 Size freespace, 406 GISTSTATE *GISTstate, 407 Relation heapRel, 408 bool is_build); 409 410 /* A List of these is returned from gistplacetopage() in *splitinfo */ 411 typedef struct 412 { 413 Buffer buf; /* the split page "half" */ 414 IndexTuple downlink; /* downlink for this half. */ 415 } GISTPageSplitInfo; 416 417 extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, 418 Buffer buffer, 419 IndexTuple *itup, int ntup, 420 OffsetNumber oldoffnum, BlockNumber *newblkno, 421 Buffer leftchildbuf, 422 List **splitinfo, 423 bool markleftchild, 424 Relation heapRel, 425 bool is_build); 426 427 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, 428 int len, GISTSTATE *giststate); 429 430 /* gistxlog.c */ 431 extern XLogRecPtr gistXLogPageDelete(Buffer buffer, 432 FullTransactionId xid, Buffer parentBuffer, 433 OffsetNumber downlinkOffset); 434 435 extern void gistXLogPageReuse(Relation rel, BlockNumber blkno, 436 FullTransactionId latestRemovedXid); 437 438 extern XLogRecPtr gistXLogUpdate(Buffer buffer, 439 OffsetNumber *todelete, int ntodelete, 440 IndexTuple *itup, int ntup, 441 Buffer leftchild); 442 443 extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, 444 int ntodelete, TransactionId latestRemovedXid); 445 446 extern XLogRecPtr gistXLogSplit(bool page_is_leaf, 447 SplitedPageLayout *dist, 448 BlockNumber origrlink, GistNSN oldnsn, 449 Buffer leftchild, bool markfollowright); 450 451 /* gistget.c */ 452 extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); 453 extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); 454 extern bool gistcanreturn(Relation index, int attno); 455 456 /* gistvalidate.c */ 457 extern bool gistvalidate(Oid opclassoid); 458 459 /* gistutil.c */ 460 461 #define GiSTPageSize \ 462 ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) 463 464 #define GIST_MIN_FILLFACTOR 10 465 #define GIST_DEFAULT_FILLFACTOR 90 466 467 extern bytea *gistoptions(Datum reloptions, bool validate); 468 extern bool gistproperty(Oid index_oid, int attno, 469 IndexAMProperty prop, const char *propname, 470 bool *res, bool *isnull); 471 extern bool gistfitpage(IndexTuple *itvec, int len); 472 extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace); 473 extern void gistcheckpage(Relation rel, Buffer buf); 474 extern Buffer gistNewBuffer(Relation r); 475 extern bool gistPageRecyclable(Page page); 476 extern void gistfillbuffer(Page page, IndexTuple *itup, int len, 477 OffsetNumber off); 478 extern IndexTuple *gistextractpage(Page page, int *len /* out */ ); 479 extern IndexTuple *gistjoinvector(IndexTuple *itvec, int *len, 480 IndexTuple *additvec, int addlen); 481 extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen); 482 483 extern IndexTuple gistunion(Relation r, IndexTuple *itvec, 484 int len, GISTSTATE *giststate); 485 extern IndexTuple gistgetadjusted(Relation r, 486 IndexTuple oldtup, 487 IndexTuple addtup, 488 GISTSTATE *giststate); 489 extern IndexTuple gistFormTuple(GISTSTATE *giststate, 490 Relation r, Datum *attdata, bool *isnull, bool isleaf); 491 492 extern OffsetNumber gistchoose(Relation r, Page p, 493 IndexTuple it, 494 GISTSTATE *giststate); 495 496 extern void GISTInitBuffer(Buffer b, uint32 f); 497 extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, 498 Datum k, Relation r, Page pg, OffsetNumber o, 499 bool l, bool isNull); 500 501 extern float gistpenalty(GISTSTATE *giststate, int attno, 502 GISTENTRY *key1, bool isNull1, 503 GISTENTRY *key2, bool isNull2); 504 extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, 505 Datum *attr, bool *isnull); 506 extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b); 507 extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, 508 OffsetNumber o, GISTENTRY *attdata, bool *isnull); 509 extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r, 510 IndexTuple tuple); 511 extern void gistMakeUnionKey(GISTSTATE *giststate, int attno, 512 GISTENTRY *entry1, bool isnull1, 513 GISTENTRY *entry2, bool isnull2, 514 Datum *dst, bool *dstisnull); 515 516 extern XLogRecPtr gistGetFakeLSN(Relation rel); 517 518 /* gistvacuum.c */ 519 extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info, 520 IndexBulkDeleteResult *stats, 521 IndexBulkDeleteCallback callback, 522 void *callback_state); 523 extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info, 524 IndexBulkDeleteResult *stats); 525 526 /* gistsplit.c */ 527 extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, 528 int len, GISTSTATE *giststate, 529 GistSplitVector *v, 530 int attno); 531 532 /* gistbuild.c */ 533 extern IndexBuildResult *gistbuild(Relation heap, Relation index, 534 struct IndexInfo *indexInfo); 535 extern void gistValidateBufferingOption(const char *value); 536 537 /* gistbuildbuffers.c */ 538 extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep, 539 int maxLevel); 540 extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb, 541 GISTSTATE *giststate, 542 BlockNumber blkno, int level); 543 extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, 544 GISTNodeBuffer *nodeBuffer, IndexTuple item); 545 extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, 546 GISTNodeBuffer *nodeBuffer, IndexTuple *item); 547 extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb); 548 extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, 549 GISTSTATE *giststate, Relation r, 550 int level, Buffer buffer, 551 List *splitinfo); 552 extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb); 553 554 #endif /* GIST_PRIVATE_H */ 555