1 /*-------------------------------------------------------------------------
2  *
3  * gist_private.h
4  *	  private declarations for GiST -- declarations related to the
5  *	  internal implementation of GiST, not the public API
6  *
7  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * src/include/access/gist_private.h
11  *
12  *-------------------------------------------------------------------------
13  */
14 #ifndef GIST_PRIVATE_H
15 #define GIST_PRIVATE_H
16 
17 #include "access/amapi.h"
18 #include "access/gist.h"
19 #include "access/itup.h"
20 #include "fmgr.h"
21 #include "lib/pairingheap.h"
22 #include "storage/bufmgr.h"
23 #include "storage/buffile.h"
24 #include "utils/hsearch.h"
25 #include "access/genam.h"
26 
27 /*
28  * Maximum number of "halves" a page can be split into in one operation.
29  * Typically a split produces 2 halves, but can be more if keys have very
30  * different lengths, or when inserting multiple keys in one operation (as
31  * when inserting downlinks to an internal node).  There is no theoretical
32  * limit on this, but in practice if you get more than a handful page halves
33  * in one split, there's something wrong with the opclass implementation.
34  * GIST_MAX_SPLIT_PAGES is an arbitrary limit on that, used to size some
35  * local arrays used during split.  Note that there is also a limit on the
36  * number of buffers that can be held locked at a time, MAX_SIMUL_LWLOCKS,
37  * so if you raise this higher than that limit, you'll just get a different
38  * error.
39  */
40 #define GIST_MAX_SPLIT_PAGES		75
41 
42 /* Buffer lock modes */
43 #define GIST_SHARE	BUFFER_LOCK_SHARE
44 #define GIST_EXCLUSIVE	BUFFER_LOCK_EXCLUSIVE
45 #define GIST_UNLOCK BUFFER_LOCK_UNLOCK
46 
47 typedef struct
48 {
49 	BlockNumber prev;
50 	uint32		freespace;
51 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
52 } GISTNodeBufferPage;
53 
54 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
55 /* Returns free space in node buffer page */
56 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
57 /* Checks if node buffer page is empty */
58 #define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
59 /* Checks if node buffers page don't contain sufficient space for index tuple */
60 #define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
61 										MAXALIGN(IndexTupleSize(itup)))
62 
63 /*
64  * GISTSTATE: information needed for any GiST index operation
65  *
66  * This struct retains call info for the index's opclass-specific support
67  * functions (per index column), plus the index's tuple descriptor.
68  *
69  * scanCxt holds the GISTSTATE itself as well as any data that lives for the
70  * lifetime of the index operation.  We pass this to the support functions
71  * via fn_mcxt, so that they can store scan-lifespan data in it.  The
72  * functions are invoked in tempCxt, which is typically short-lifespan
73  * (that is, it's reset after each tuple).  However, tempCxt can be the same
74  * as scanCxt if we're not bothering with per-tuple context resets.
75  */
76 typedef struct GISTSTATE
77 {
78 	MemoryContext scanCxt;		/* context for scan-lifespan data */
79 	MemoryContext tempCxt;		/* short-term context for calling functions */
80 
81 	TupleDesc	tupdesc;		/* index's tuple descriptor */
82 	TupleDesc	fetchTupdesc;	/* tuple descriptor for tuples returned in an
83 								 * index-only scan */
84 
85 	FmgrInfo	consistentFn[INDEX_MAX_KEYS];
86 	FmgrInfo	unionFn[INDEX_MAX_KEYS];
87 	FmgrInfo	compressFn[INDEX_MAX_KEYS];
88 	FmgrInfo	decompressFn[INDEX_MAX_KEYS];
89 	FmgrInfo	penaltyFn[INDEX_MAX_KEYS];
90 	FmgrInfo	picksplitFn[INDEX_MAX_KEYS];
91 	FmgrInfo	equalFn[INDEX_MAX_KEYS];
92 	FmgrInfo	distanceFn[INDEX_MAX_KEYS];
93 	FmgrInfo	fetchFn[INDEX_MAX_KEYS];
94 
95 	/* Collations to pass to the support functions */
96 	Oid			supportCollation[INDEX_MAX_KEYS];
97 } GISTSTATE;
98 
99 
100 /*
101  * During a GiST index search, we must maintain a queue of unvisited items,
102  * which can be either individual heap tuples or whole index pages.  If it
103  * is an ordered search, the unvisited items should be visited in distance
104  * order.  Unvisited items at the same distance should be visited in
105  * depth-first order, that is heap items first, then lower index pages, then
106  * upper index pages; this rule avoids doing extra work during a search that
107  * ends early due to LIMIT.
108  *
109  * To perform an ordered search, we use a pairing heap to manage the
110  * distance-order queue.  In a non-ordered search (no order-by operators),
111  * we use it to return heap tuples before unvisited index pages, to
112  * ensure depth-first order, but all entries are otherwise considered
113  * equal.
114  */
115 
116 /* Individual heap tuple to be visited */
117 typedef struct GISTSearchHeapItem
118 {
119 	ItemPointerData heapPtr;
120 	bool		recheck;		/* T if quals must be rechecked */
121 	bool		recheckDistances;	/* T if distances must be rechecked */
122 	HeapTuple	recontup;		/* data reconstructed from the index, used in
123 								 * index-only scans */
124 	OffsetNumber offnum;		/* track offset in page to mark tuple as
125 								 * LP_DEAD */
126 } GISTSearchHeapItem;
127 
128 /* Unvisited item, either index page or heap tuple */
129 typedef struct GISTSearchItem
130 {
131 	pairingheap_node phNode;
132 	BlockNumber blkno;			/* index page number, or InvalidBlockNumber */
133 	union
134 	{
135 		GistNSN		parentlsn;	/* parent page's LSN, if index page */
136 		/* we must store parentlsn to detect whether a split occurred */
137 		GISTSearchHeapItem heap;	/* heap info, if heap tuple */
138 	}			data;
139 
140 	/* numberOfOrderBys entries */
141 	IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER];
142 } GISTSearchItem;
143 
144 #define GISTSearchItemIsHeap(item)	((item).blkno == InvalidBlockNumber)
145 
146 #define SizeOfGISTSearchItem(n_distances) \
147 	(offsetof(GISTSearchItem, distances) + \
148 	 sizeof(IndexOrderByDistance) * (n_distances))
149 
150 /*
151  * GISTScanOpaqueData: private state for a scan of a GiST index
152  */
153 typedef struct GISTScanOpaqueData
154 {
155 	GISTSTATE  *giststate;		/* index information, see above */
156 	Oid		   *orderByTypes;	/* datatypes of ORDER BY expressions */
157 
158 	pairingheap *queue;			/* queue of unvisited items */
159 	MemoryContext queueCxt;		/* context holding the queue */
160 	bool		qual_ok;		/* false if qual can never be satisfied */
161 	bool		firstCall;		/* true until first gistgettuple call */
162 
163 	/* pre-allocated workspace arrays */
164 	IndexOrderByDistance *distances;	/* output area for gistindex_keytest */
165 
166 	/* info about killed items if any (killedItems is NULL if never used) */
167 	OffsetNumber *killedItems;	/* offset numbers of killed items */
168 	int			numKilled;		/* number of currently stored items */
169 	BlockNumber curBlkno;		/* current number of block */
170 	GistNSN		curPageLSN;		/* pos in the WAL stream when page was read */
171 
172 	/* In a non-ordered search, returnable heap items are stored here: */
173 	GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
174 	OffsetNumber nPageData;		/* number of valid items in array */
175 	OffsetNumber curPageData;	/* next item to return */
176 	MemoryContext pageDataCxt;	/* context holding the fetched tuples, for
177 								 * index-only scans */
178 } GISTScanOpaqueData;
179 
180 typedef GISTScanOpaqueData *GISTScanOpaque;
181 
182 /* despite the name, gistxlogPage is not part of any xlog record */
183 typedef struct gistxlogPage
184 {
185 	BlockNumber blkno;
186 	int			num;			/* number of index tuples following */
187 } gistxlogPage;
188 
189 /* SplitedPageLayout - gistSplit function result */
190 typedef struct SplitedPageLayout
191 {
192 	gistxlogPage block;
193 	IndexTupleData *list;
194 	int			lenlist;
195 	IndexTuple	itup;			/* union key for page */
196 	Page		page;			/* to operate */
197 	Buffer		buffer;			/* to write after all proceed */
198 
199 	struct SplitedPageLayout *next;
200 } SplitedPageLayout;
201 
202 /*
203  * GISTInsertStack used for locking buffers and transfer arguments during
204  * insertion
205  */
206 typedef struct GISTInsertStack
207 {
208 	/* current page */
209 	BlockNumber blkno;
210 	Buffer		buffer;
211 	Page		page;
212 
213 	/*
214 	 * log sequence number from page->lsn to recognize page update and compare
215 	 * it with page's nsn to recognize page split
216 	 */
217 	GistNSN		lsn;
218 
219 	/* offset of the downlink in the parent page, that points to this page */
220 	OffsetNumber downlinkoffnum;
221 
222 	/* pointer to parent */
223 	struct GISTInsertStack *parent;
224 } GISTInsertStack;
225 
226 /* Working state and results for multi-column split logic in gistsplit.c */
227 typedef struct GistSplitVector
228 {
229 	GIST_SPLITVEC splitVector;	/* passed to/from user PickSplit method */
230 
231 	Datum		spl_lattr[INDEX_MAX_KEYS];	/* Union of subkeys in
232 											 * splitVector.spl_left */
233 	bool		spl_lisnull[INDEX_MAX_KEYS];
234 
235 	Datum		spl_rattr[INDEX_MAX_KEYS];	/* Union of subkeys in
236 											 * splitVector.spl_right */
237 	bool		spl_risnull[INDEX_MAX_KEYS];
238 
239 	bool	   *spl_dontcare;	/* flags tuples which could go to either side
240 								 * of the split for zero penalty */
241 } GistSplitVector;
242 
243 typedef struct
244 {
245 	Relation	r;
246 	Relation	heapRel;
247 	Size		freespace;		/* free space to be left */
248 
249 	GISTInsertStack *stack;
250 } GISTInsertState;
251 
252 /* root page of a gist index */
253 #define GIST_ROOT_BLKNO				0
254 
255 /*
256  * Before PostgreSQL 9.1, we used to rely on so-called "invalid tuples" on
257  * inner pages to finish crash recovery of incomplete page splits. If a crash
258  * happened in the middle of a page split, so that the downlink pointers were
259  * not yet inserted, crash recovery inserted a special downlink pointer. The
260  * semantics of an invalid tuple was that it if you encounter one in a scan,
261  * it must always be followed, because we don't know if the tuples on the
262  * child page match or not.
263  *
264  * We no longer create such invalid tuples, we now mark the left-half of such
265  * an incomplete split with the F_FOLLOW_RIGHT flag instead, and finish the
266  * split properly the next time we need to insert on that page. To retain
267  * on-disk compatibility for the sake of pg_upgrade, we still store 0xffff as
268  * the offset number of all inner tuples. If we encounter any invalid tuples
269  * with 0xfffe during insertion, we throw an error, though scans still handle
270  * them. You should only encounter invalid tuples if you pg_upgrade a pre-9.1
271  * gist index which already has invalid tuples in it because of a crash. That
272  * should be rare, and you are recommended to REINDEX anyway if you have any
273  * invalid tuples in an index, so throwing an error is as far as we go with
274  * supporting that.
275  */
276 #define TUPLE_IS_VALID		0xffff
277 #define TUPLE_IS_INVALID	0xfffe
278 
279 #define  GistTupleIsInvalid(itup)	( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
280 #define  GistTupleSetValid(itup)	ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
281 
282 
283 
284 
285 /*
286  * A buffer attached to an internal node, used when building an index in
287  * buffering mode.
288  */
289 typedef struct
290 {
291 	BlockNumber nodeBlocknum;	/* index block # this buffer is for */
292 	int32		blocksCount;	/* current # of blocks occupied by buffer */
293 
294 	BlockNumber pageBlocknum;	/* temporary file block # */
295 	GISTNodeBufferPage *pageBuffer; /* in-memory buffer page */
296 
297 	/* is this buffer queued for emptying? */
298 	bool		queuedForEmptying;
299 
300 	/* is this a temporary copy, not in the hash table? */
301 	bool		isTemp;
302 
303 	int			level;			/* 0 == leaf */
304 } GISTNodeBuffer;
305 
306 /*
307  * Does specified level have buffers? (Beware of multiple evaluation of
308  * arguments.)
309  */
310 #define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
311 	((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
312 	 (nlevel) != (gfbb)->rootlevel)
313 
314 /* Is specified buffer at least half-filled (should be queued for emptying)? */
315 #define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
316 	((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
317 
318 /*
319  * Is specified buffer full? Our buffers can actually grow indefinitely,
320  * beyond the "maximum" size, so this just means whether the buffer has grown
321  * beyond the nominal maximum size.
322  */
323 #define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
324 	((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
325 
326 /*
327  * Data structure with general information about build buffers.
328  */
329 typedef struct GISTBuildBuffers
330 {
331 	/* Persistent memory context for the buffers and metadata. */
332 	MemoryContext context;
333 
334 	BufFile    *pfile;			/* Temporary file to store buffers in */
335 	long		nFileBlocks;	/* Current size of the temporary file */
336 
337 	/*
338 	 * resizable array of free blocks.
339 	 */
340 	long	   *freeBlocks;
341 	int			nFreeBlocks;	/* # of currently free blocks in the array */
342 	int			freeBlocksLen;	/* current allocated length of the array */
343 
344 	/* Hash for buffers by block number */
345 	HTAB	   *nodeBuffersTab;
346 
347 	/* List of buffers scheduled for emptying */
348 	List	   *bufferEmptyingQueue;
349 
350 	/*
351 	 * Parameters to the buffering build algorithm. levelStep determines which
352 	 * levels in the tree have buffers, and pagesPerBuffer determines how
353 	 * large each buffer is.
354 	 */
355 	int			levelStep;
356 	int			pagesPerBuffer;
357 
358 	/* Array of lists of buffers on each level, for final emptying */
359 	List	  **buffersOnLevels;
360 	int			buffersOnLevelsLen;
361 
362 	/*
363 	 * Dynamically-sized array of buffers that currently have their last page
364 	 * loaded in main memory.
365 	 */
366 	GISTNodeBuffer **loadedBuffers;
367 	int			loadedBuffersCount; /* # of entries in loadedBuffers */
368 	int			loadedBuffersLen;	/* allocated size of loadedBuffers */
369 
370 	/* Level of the current root node (= height of the index tree - 1) */
371 	int			rootlevel;
372 } GISTBuildBuffers;
373 
374 /*
375  * Storage type for GiST's reloptions
376  */
377 typedef struct GiSTOptions
378 {
379 	int32		vl_len_;		/* varlena header (do not touch directly!) */
380 	int			fillfactor;		/* page fill factor in percent (0..100) */
381 	int			bufferingModeOffset;	/* use buffering build? */
382 } GiSTOptions;
383 
384 /* gist.c */
385 extern void gistbuildempty(Relation index);
386 extern bool gistinsert(Relation r, Datum *values, bool *isnull,
387 		   ItemPointer ht_ctid, Relation heapRel,
388 		   IndexUniqueCheck checkUnique,
389 		   struct IndexInfo *indexInfo);
390 extern MemoryContext createTempGistContext(void);
391 extern GISTSTATE *initGISTstate(Relation index);
392 extern void freeGISTstate(GISTSTATE *giststate);
393 extern void gistdoinsert(Relation r,
394 			 IndexTuple itup,
395 			 Size freespace,
396 			 GISTSTATE *GISTstate,
397 			 Relation heapRel);
398 
399 /* A List of these is returned from gistplacetopage() in *splitinfo */
400 typedef struct
401 {
402 	Buffer		buf;			/* the split page "half" */
403 	IndexTuple	downlink;		/* downlink for this half. */
404 } GISTPageSplitInfo;
405 
406 extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
407 				Buffer buffer,
408 				IndexTuple *itup, int ntup,
409 				OffsetNumber oldoffnum, BlockNumber *newblkno,
410 				Buffer leftchildbuf,
411 				List **splitinfo,
412 				bool markleftchild,
413 				Relation heapRel);
414 
415 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
416 		  int len, GISTSTATE *giststate);
417 
418 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
419 			   OffsetNumber *todelete, int ntodelete,
420 			   IndexTuple *itup, int ntup,
421 			   Buffer leftchild, RelFileNode *hnode);
422 
423 extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
424 			  SplitedPageLayout *dist,
425 			  BlockNumber origrlink, GistNSN oldnsn,
426 			  Buffer leftchild, bool markfollowright);
427 
428 /* gistget.c */
429 extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir);
430 extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
431 extern bool gistcanreturn(Relation index, int attno);
432 
433 /* gistvalidate.c */
434 extern bool gistvalidate(Oid opclassoid);
435 
436 /* gistutil.c */
437 
438 #define GiSTPageSize   \
439 	( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GISTPageOpaqueData)) )
440 
441 #define GIST_MIN_FILLFACTOR			10
442 #define GIST_DEFAULT_FILLFACTOR		90
443 
444 extern bytea *gistoptions(Datum reloptions, bool validate);
445 extern bool gistproperty(Oid index_oid, int attno,
446 			 IndexAMProperty prop, const char *propname,
447 			 bool *res, bool *isnull);
448 extern bool gistfitpage(IndexTuple *itvec, int len);
449 extern bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace);
450 extern void gistcheckpage(Relation rel, Buffer buf);
451 extern Buffer gistNewBuffer(Relation r);
452 extern void gistfillbuffer(Page page, IndexTuple *itup, int len,
453 			   OffsetNumber off);
454 extern IndexTuple *gistextractpage(Page page, int *len /* out */ );
455 extern IndexTuple *gistjoinvector(
456 			   IndexTuple *itvec, int *len,
457 			   IndexTuple *additvec, int addlen);
458 extern IndexTupleData *gistfillitupvec(IndexTuple *vec, int veclen, int *memlen);
459 
460 extern IndexTuple gistunion(Relation r, IndexTuple *itvec,
461 		  int len, GISTSTATE *giststate);
462 extern IndexTuple gistgetadjusted(Relation r,
463 				IndexTuple oldtup,
464 				IndexTuple addtup,
465 				GISTSTATE *giststate);
466 extern IndexTuple gistFormTuple(GISTSTATE *giststate,
467 			  Relation r, Datum *attdata, bool *isnull, bool isleaf);
468 
469 extern OffsetNumber gistchoose(Relation r, Page p,
470 		   IndexTuple it,
471 		   GISTSTATE *giststate);
472 
473 extern void GISTInitBuffer(Buffer b, uint32 f);
474 extern void gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
475 			   Datum k, Relation r, Page pg, OffsetNumber o,
476 			   bool l, bool isNull);
477 
478 extern float gistpenalty(GISTSTATE *giststate, int attno,
479 			GISTENTRY *key1, bool isNull1,
480 			GISTENTRY *key2, bool isNull2);
481 extern void gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
482 				   Datum *attr, bool *isnull);
483 extern bool gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b);
484 extern void gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
485 				  OffsetNumber o, GISTENTRY *attdata, bool *isnull);
486 extern HeapTuple gistFetchTuple(GISTSTATE *giststate, Relation r,
487 			   IndexTuple tuple);
488 extern void gistMakeUnionKey(GISTSTATE *giststate, int attno,
489 				 GISTENTRY *entry1, bool isnull1,
490 				 GISTENTRY *entry2, bool isnull2,
491 				 Datum *dst, bool *dstisnull);
492 
493 extern XLogRecPtr gistGetFakeLSN(Relation rel);
494 
495 /* gistvacuum.c */
496 extern IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info,
497 			   IndexBulkDeleteResult *stats,
498 			   IndexBulkDeleteCallback callback,
499 			   void *callback_state);
500 extern IndexBulkDeleteResult *gistvacuumcleanup(IndexVacuumInfo *info,
501 				  IndexBulkDeleteResult *stats);
502 
503 /* gistsplit.c */
504 extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
505 			   int len, GISTSTATE *giststate,
506 			   GistSplitVector *v,
507 			   int attno);
508 
509 /* gistbuild.c */
510 extern IndexBuildResult *gistbuild(Relation heap, Relation index,
511 		  struct IndexInfo *indexInfo);
512 extern void gistValidateBufferingOption(char *value);
513 
514 /* gistbuildbuffers.c */
515 extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
516 					 int maxLevel);
517 extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
518 				  GISTSTATE *giststate,
519 				  BlockNumber blkno, int level);
520 extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
521 						 GISTNodeBuffer *nodeBuffer, IndexTuple item);
522 extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
523 						  GISTNodeBuffer *nodeBuffer, IndexTuple *item);
524 extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
525 extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
526 								GISTSTATE *giststate, Relation r,
527 								int level, Buffer buffer,
528 								List *splitinfo);
529 extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
530 
531 #endif							/* GIST_PRIVATE_H */
532