1 /*------------------------------------------------------------------------- 2 * 3 * gist_private.h 4 * private declarations for GiST -- declarations related to the 5 * internal implementation of GiST, not the public API 6 * 7 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group 8 * Portions Copyright (c) 1994, Regents of the University of California 9 * 10 * src/include/access/gist_private.h 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #ifndef GIST_PRIVATE_H 15 #define GIST_PRIVATE_H 16 17 #include "access/amapi.h" 18 #include "access/gist.h" 19 #include "access/itup.h" 20 #include "fmgr.h" 21 #include "lib/pairingheap.h" 22 #include "storage/bufmgr.h" 23 #include "storage/buffile.h" 24 #include "utils/hsearch.h" 25 #include "access/genam.h" 26 27 /* 28 * Maximum number of "halves" a page can be split into in one operation. 29 * Typically a split produces 2 halves, but can be more if keys have very 30 * different lengths, or when inserting multiple keys in one operation (as 31 * when inserting downlinks to an internal node). There is no theoretical 32 * limit on this, but in practice if you get more than a handful page halves 33 * in one split, there's something wrong with the opclass implementation. 34 * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some 35 * local arrays used during split. Note that there is also a limit on the 36 * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS, 37 * so if you raise this higher than that limit, you'll just get a different 38 * error. 39 */ 40 #define GIST_MAX_SPLIT_PAGES 75 41 42 /* Buffer lock modes */ 43 #define GIST_SHARE BUFFER_LOCK_SHARE 44 #define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE 45 #define GIST_UNLOCK BUFFER_LOCK_UNLOCK 46 47 typedef struct 48 { 49 BlockNumber prev; 50 uint32 freespace; 51 char tupledata[FLEXIBLE_ARRAY_MEMBER]; 52 } GISTNodeBufferPage; 53 54 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)) 55 /* Returns free space in node buffer page */ 56 #define PAGE_FREE_SPACE(nbp) (nbp->freespace) 57 /* Checks if node buffer page is empty */ 58 #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET) 59 /* Checks if node buffers page don't contain sufficient space for index tuple */ 60 #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \ 61 MAXALIGN(IndexTupleSize(itup))) 62 63 /* 64 * GISTSTATE: information needed for any GiST index operation 65 * 66 * This struct retains call info for the index's opclass-specific support 67 * functions (per index column), plus the index's tuple descriptor. 68 * 69 * scanCxt holds the GISTSTATE itself as well as any data that lives for the 70 * lifetime of the index operation. We pass this to the support functions 71 * via fn_mcxt, so that they can store scan-lifespan data in it. The 72 * functions are invoked in tempCxt, which is typically short-lifespan 73 * (that is, it's reset after each tuple). However, tempCxt can be the same 74 * as scanCxt if we're not bothering with per-tuple context resets. 75 */ 76 typedef struct GISTSTATE 77 { 78 MemoryContext scanCxt; /* context for scan-lifespan data */ 79 MemoryContext tempCxt; /* short-term context for calling functions */ 80 81 TupleDesc tupdesc; /* index's tuple descriptor */ 82 TupleDesc fetchTupdesc; /* tuple descriptor for tuples returned in an 83 * index-only scan */ 84 85 FmgrInfo consistentFn[INDEX_MAX_KEYS]; 86 FmgrInfo unionFn[INDEX_MAX_KEYS]; 87 FmgrInfo compressFn[INDEX_MAX_KEYS]; 88 FmgrInfo decompressFn[INDEX_MAX_KEYS]; 89 FmgrInfo penaltyFn[INDEX_MAX_KEYS]; 90 FmgrInfo picksplitFn[INDEX_MAX_KEYS]; 91 FmgrInfo equalFn[INDEX_MAX_KEYS]; 92 FmgrInfo distanceFn[INDEX_MAX_KEYS]; 93 FmgrInfo fetchFn[INDEX_MAX_KEYS]; 94 95 /* Collations to pass to the support functions */ 96 Oid supportCollation[INDEX_MAX_KEYS]; 97 } GISTSTATE; 98 99 100 /* 101 * During a GiST index search, we must maintain a queue of unvisited items, 102 * which can be either individual heap tuples or whole index pages. If it 103 * is an ordered search, the unvisited items should be visited in distance 104 * order. Unvisited items at the same distance should be visited in 105 * depth-first order, that is heap items first, then lower index pages, then 106 * upper index pages; this rule avoids doing extra work during a search that 107 * ends early due to LIMIT. 108 * 109 * To perform an ordered search, we use a pairing heap to manage the 110 * distance-order queue. In a non-ordered search (no order-by operators), 111 * we use it to return heap tuples before unvisited index pages, to 112 * ensure depth-first order, but all entries are otherwise considered 113 * equal. 114 */ 115 116 /* Individual heap tuple to be visited */ 117 typedef struct GISTSearchHeapItem 118 { 119 ItemPointerData heapPtr; 120 bool recheck; /* T if quals must be rechecked */ 121 bool recheckDistances; /* T if distances must be rechecked */ 122 HeapTuple recontup; /* data reconstructed from the index, used in 123 * index-only scans */ 124 OffsetNumber offnum; /* track offset in page to mark tuple as 125 * LP_DEAD */ 126 } GISTSearchHeapItem; 127 128 /* Unvisited item, either index page or heap tuple */ 129 typedef struct GISTSearchItem 130 { 131 pairingheap_node phNode; 132 BlockNumber blkno; /* index page number, or InvalidBlockNumber */ 133 union 134 { 135 GistNSN parentlsn; /* parent page's LSN, if index page */ 136 /* we must store parentlsn to detect whether a split occurred */ 137 GISTSearchHeapItem heap; /* heap info, if heap tuple */ 138 } data; 139 140 /* numberOfOrderBys entries */ 141 IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; 142 } GISTSearchItem; 143 144 #define GISTSearchItemIsHeap(item) ((item).blkno == InvalidBlockNumber) 145 146 #define SizeOfGISTSearchItem(n_distances) \ 147 (offsetof(GISTSearchItem, distances) + \ 148 sizeof(IndexOrderByDistance) * (n_distances)) 149 150 /* 151 * GISTScanOpaqueData: private state for a scan of a GiST index 152 */ 153 typedef struct GISTScanOpaqueData 154 { 155 GISTSTATE *giststate; /* index information, see above */ 156 Oid *orderByTypes; /* datatypes of ORDER BY expressions */ 157 158 pairingheap *queue; /* queue of unvisited items */ 159 MemoryContext queueCxt; /* context holding the queue */ 160 bool qual_ok; /* false if qual can never be satisfied */ 161 bool firstCall; /* true until first gistgettuple call */ 162 163 /* pre-allocated workspace arrays */ 164 IndexOrderByDistance *distances; /* output area for gistindex_keytest */ 165 166 /* info about killed items if any (killedItems is NULL if never used) */ 167 OffsetNumber *killedItems; /* offset numbers of killed items */ 168 int numKilled; /* number of currently stored items */ 169 BlockNumber curBlkno; /* current number of block */ 170 GistNSN curPageLSN; /* pos in the WAL stream when page was read */ 171 172 /* In a non-ordered search, returnable heap items are stored here: */ 173 GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; 174 OffsetNumber nPageData; /* number of valid items in array */ 175 OffsetNumber curPageData; /* next item to return */ 176 MemoryContext pageDataCxt; /* context holding the fetched tuples, for 177 * index-only scans */ 178 } GISTScanOpaqueData; 179 180 typedef GISTScanOpaqueData *GISTScanOpaque; 181 182 /* despite the name, gistxlogPage is not part of any xlog record */ 183 typedef struct gistxlogPage 184 { 185 BlockNumber blkno; 186 int num; /* number of index tuples following */ 187 } gistxlogPage; 188 189 /* SplitedPageLayout - gistSplit function result */ 190 typedef struct SplitedPageLayout 191 { 192 gistxlogPage block; 193 IndexTupleData *list; 194 int lenlist; 195 IndexTuple itup; /* union key for page */ 196 Page page; /* to operate */ 197 Buffer buffer; /* to write after all proceed */ 198 199 struct SplitedPageLayout *next; 200 } SplitedPageLayout; 201 202 /* 203 * GISTInsertStack used for locking buffers and transfer arguments during 204 * insertion 205 */ 206 typedef struct GISTInsertStack 207 { 208 /* current page */ 209 BlockNumber blkno; 210 Buffer buffer; 211 Page page; 212 213 /* 214 * log sequence number from page->lsn to recognize page update and compare 215 * it with page's nsn to recognize page split 216 */ 217 GistNSN lsn; 218 219 /* offset of the downlink in the parent page, that points to this page */ 220 OffsetNumber downlinkoffnum; 221 222 /* pointer to parent */ 223 struct GISTInsertStack *parent; 224 } GISTInsertStack; 225 226 /* Working state and results for multi-column split logic in gistsplit.c */ 227 typedef struct GistSplitVector 228 { 229 GIST_SPLITVEC splitVector; /* passed to/from user PickSplit method */ 230 231 Datum spl_lattr[INDEX_MAX_KEYS]; /* Union of subkeys in 232 * splitVector.spl_left */ 233 bool spl_lisnull[INDEX_MAX_KEYS]; 234 235 Datum spl_rattr[INDEX_MAX_KEYS]; /* Union of subkeys in 236 * splitVector.spl_right */ 237 bool spl_risnull[INDEX_MAX_KEYS]; 238 239 bool *spl_dontcare; /* flags tuples which could go to either side 240 * of the split for zero penalty */ 241 } GistSplitVector; 242 243 typedef struct 244 { 245 Relation r; 246 Relation heapRel; 247 Size freespace; /* free space to be left */ 248 249 GISTInsertStack *stack; 250 } GISTInsertState; 251 252 /* root page of a gist index */ 253 #define GIST_ROOT_BLKNO 0 254 255 /* 256 * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on 257 * inner pages to finish crash recovery of incomplete page splits. If a crash 258 * happened in the middle of a page split, so that the downlink pointers were 259 * not yet inserted, crash recovery inserted a special downlink pointer. The 260 * semantics of an invalid tuple was that it if you encounter one in a scan, 261 * it must always be followed, because we don't know if the tuples on the 262 * child page match or not. 263 * 264 * We no longer create such invalid tuples, we now mark the left-half of such 265 * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the 266 * split properly the next time we need to insert on that page. To retain 267 * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as 268 * the offset number of all inner tuples. If we encounter any invalid tuples 269 * with 0xfffe during insertion, we throw an error, though scans still handle 270 * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1 271 * gist index which already has invalid tuples in it because of a crash. That 272 * should be rare, and you are recommended to REINDEX anyway if you have any 273 * invalid tuples in an index, so throwing an error is as far as we go with 274 * supporting that. 275 */ 276 #define TUPLE_IS_VALID 0xffff 277 #define TUPLE_IS_INVALID 0xfffe 278 279 #define GistTupleIsInvalid(itup) ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID ) 280 #define GistTupleSetValid(itup) ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID ) 281 282 283 284 285 /* 286 * A buffer attached to an internal node, used when building an index in 287 * buffering mode. 288 */ 289 typedef struct 290 { 291 BlockNumber nodeBlocknum; /* index block # this buffer is for */ 292 int32 blocksCount; /* current # of blocks occupied by buffer */ 293 294 BlockNumber pageBlocknum; /* temporary file block # */ 295 GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */ 296 297 /* is this buffer queued for emptying? */ 298 bool queuedForEmptying; 299 300 /* is this a temporary copy, not in the hash table? */ 301 bool isTemp; 302 303 int level; /* 0 == leaf */ 304 } GISTNodeBuffer; 305 306 /* 307 * Does specified level have buffers? (Beware of multiple evaluation of 308 * arguments.) 309 */ 310 #define LEVEL_HAS_BUFFERS(nlevel, gfbb) \ 311 ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \ 312 (nlevel) != (gfbb)->rootlevel) 313 314 /* Is specified buffer at least half-filled (should be queued for emptying)? */ 315 #define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \ 316 ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2) 317 318 /* 319 * Is specified buffer full? Our buffers can actually grow indefinitely, 320 * beyond the "maximum" size, so this just means whether the buffer has grown 321 * beyond the nominal maximum size. 322 */ 323 #define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \ 324 ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer) 325 326 /* 327 * Data structure with general information about build buffers. 328 */ 329 typedef struct GISTBuildBuffers 330 { 331 /* Persistent memory context for the buffers and metadata. */ 332 MemoryContext context; 333 334 BufFile *pfile; /* Temporary file to store buffers in */ 335 long nFileBlocks; /* Current size of the temporary file */ 336 337 /* 338 * resizable array of free blocks. 339 */ 340 long *freeBlocks; 341 int nFreeBlocks; /* # of currently free blocks in the array */ 342 int freeBlocksLen; /* current allocated length of the array */ 343 344 /* Hash for buffers by block number */ 345 HTAB *nodeBuffersTab; 346 347 /* List of buffers scheduled for emptying */ 348 List *bufferEmptyingQueue; 349 350 /* 351 * Parameters to the buffering build algorithm. levelStep determines which 352 * levels in the tree have buffers, and pagesPerBuffer determines how 353 * large each buffer is. 354 */ 355 int levelStep; 356 int pagesPerBuffer; 357 358 /* Array of lists of buffers on each level, for final emptying */ 359 List **buffersOnLevels; 360 int buffersOnLevelsLen; 361 362 /* 363 * Dynamically-sized array of buffers that currently have their last page 364 * loaded in main memory. 365 */ 366 GISTNodeBuffer **loadedBuffers; 367 int loadedBuffersCount; /* # of entries in loadedBuffers */ 368 int loadedBuffersLen; /* allocated size of loadedBuffers */ 369 370 /* Level of the current root node (= height of the index tree - 1) */ 371 int rootlevel; 372 } GISTBuildBuffers; 373 374 /* 375 * Storage type for GiST's reloptions 376 */ 377 typedef struct GiSTOptions 378 { 379 int32 vl_len_; /* varlena header (do not touch directly!) */ 380 int fillfactor; /* page fill factor in percent (0..100) */ 381 int bufferingModeOffset; /* use buffering build? */ 382 } GiSTOptions; 383 384 /* gist.c */ 385 extern void gistbuildempty(Relation index); 386 extern bool gistinsert(Relation r, Datum *values, bool *isnull, 387 ItemPointer ht_ctid, Relation heapRel, 388 IndexUniqueCheck checkUnique, 389 struct IndexInfo *indexInfo); 390 extern MemoryContext createTempGistContext(void); 391 extern GISTSTATE *initGISTstate(Relation index); 392 extern void freeGISTstate(GISTSTATE *giststate); 393 extern void gistdoinsert(Relation r, 394 IndexTuple itup, 395 Size freespace, 396 GISTSTATE *GISTstate, 397 Relation heapRel); 398 399 /* A List of these is returned from gistplacetopage() in *splitinfo */ 400 typedef struct 401 { 402 Buffer buf; /* the split page "half" */ 403 IndexTuple downlink; /* downlink for this half. */ 404 } GISTPageSplitInfo; 405 406 extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, 407 Buffer buffer, 408 IndexTuple *itup, int ntup, 409 OffsetNumber oldoffnum, BlockNumber *newblkno, 410 Buffer leftchildbuf, 411 List **splitinfo, 412 bool markleftchild, 413 Relation heapRel); 414 415 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup, 416 int len, GISTSTATE *giststate); 417 418 extern XLogRecPtr gistXLogUpdate(Buffer buffer, 419 OffsetNumber *todelete, int ntodelete, 420 IndexTuple *itup, int ntup, 421 Buffer leftchild, RelFileNode *hnode); 422 423 extern XLogRecPtr gistXLogSplit(bool page_is_leaf, 424 SplitedPageLayout *dist, 425 BlockNumber origrlink, GistNSN oldnsn, 426 Buffer leftchild, bool markfollowright); 427 428 /* gistget.c */ 429 extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); 430 extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); 431 extern bool gistcanreturn(Relation index, int attno); 432 433 /* gistvalidate.c */ 434 extern bool gistvalidate(Oid opclassoid); 435 436 /* gistutil.c */ 437 438 #define GiSTPageSize \ 439 ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) ) 440 441 #define GIST_MIN_FILLFACTOR 10 442 #define GIST_DEFAULT_FILLFACTOR 90 443 444 extern bytea *gistoptions(Datum reloptions, bool validate); 445 extern bool gistproperty(Oid index_oid, int attno, 446 IndexAMProperty prop, const char *propname, 447 bool *res, bool *isnull); 448 extern bool gistfitpage(IndexTuple *itvec, int len); 449 extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace); 450 extern void gistcheckpage(Relation rel, Buffer buf); 451 extern Buffer gistNewBuffer(Relation r); 452 extern void gistfillbuffer(Page page, IndexTuple *itup, int len, 453 OffsetNumber off); 454 extern IndexTuple *gistextractpage(Page page, int *len /* out */ ); 455 extern IndexTuple *gistjoinvector( 456 IndexTuple *itvec, int *len, 457 IndexTuple *additvec, int addlen); 458 extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen); 459 460 extern IndexTuple gistunion(Relation r, IndexTuple *itvec, 461 int len, GISTSTATE *giststate); 462 extern IndexTuple gistgetadjusted(Relation r, 463 IndexTuple oldtup, 464 IndexTuple addtup, 465 GISTSTATE *giststate); 466 extern IndexTuple gistFormTuple(GISTSTATE *giststate, 467 Relation r, Datum *attdata, bool *isnull, bool isleaf); 468 469 extern OffsetNumber gistchoose(Relation r, Page p, 470 IndexTuple it, 471 GISTSTATE *giststate); 472 473 extern void GISTInitBuffer(Buffer b, uint32 f); 474 extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e, 475 Datum k, Relation r, Page pg, OffsetNumber o, 476 bool l, bool isNull); 477 478 extern float gistpenalty(GISTSTATE *giststate, int attno, 479 GISTENTRY *key1, bool isNull1, 480 GISTENTRY *key2, bool isNull2); 481 extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, 482 Datum *attr, bool *isnull); 483 extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b); 484 extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p, 485 OffsetNumber o, GISTENTRY *attdata, bool *isnull); 486 extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r, 487 IndexTuple tuple); 488 extern void gistMakeUnionKey(GISTSTATE *giststate, int attno, 489 GISTENTRY *entry1, bool isnull1, 490 GISTENTRY *entry2, bool isnull2, 491 Datum *dst, bool *dstisnull); 492 493 extern XLogRecPtr gistGetFakeLSN(Relation rel); 494 495 /* gistvacuum.c */ 496 extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info, 497 IndexBulkDeleteResult *stats, 498 IndexBulkDeleteCallback callback, 499 void *callback_state); 500 extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info, 501 IndexBulkDeleteResult *stats); 502 503 /* gistsplit.c */ 504 extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup, 505 int len, GISTSTATE *giststate, 506 GistSplitVector *v, 507 int attno); 508 509 /* gistbuild.c */ 510 extern IndexBuildResult *gistbuild(Relation heap, Relation index, 511 struct IndexInfo *indexInfo); 512 extern void gistValidateBufferingOption(char *value); 513 514 /* gistbuildbuffers.c */ 515 extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep, 516 int maxLevel); 517 extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb, 518 GISTSTATE *giststate, 519 BlockNumber blkno, int level); 520 extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, 521 GISTNodeBuffer *nodeBuffer, IndexTuple item); 522 extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, 523 GISTNodeBuffer *nodeBuffer, IndexTuple *item); 524 extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb); 525 extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, 526 GISTSTATE *giststate, Relation r, 527 int level, Buffer buffer, 528 List *splitinfo); 529 extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb); 530 531 #endif /* GIST_PRIVATE_H */ 532