/*-------------------------------------------------------------------------
 *
 * nbtsplitloc.c
 *	  Choose split point code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsplitloc.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "storage/lmgr.h"

/* limits on split interval (default strategy only) */
#define MAX_LEAF_INTERVAL			9
#define MAX_INTERNAL_INTERVAL		18

typedef enum
{
	/* strategy for searching through materialized list of split points */
	SPLIT_DEFAULT,				/* give some weight to truncation */
	SPLIT_MANY_DUPLICATES,		/* find minimally distinguishing point */
	SPLIT_SINGLE_VALUE			/* leave left page almost full */
} FindSplitStrat;

typedef struct
{
	/* details of free space left by split */
	int16		curdelta;		/* current leftfree/rightfree delta */
	int16		leftfree;		/* space left on left page post-split */
	int16		rightfree;		/* space left on right page post-split */

	/* split point identifying fields (returned by _bt_findsplitloc) */
	OffsetNumber firstoldonright;	/* first item on new right page */
	bool		newitemonleft;	/* new item goes on left, or right? */

} SplitPoint;

typedef struct
{
	/* context data for _bt_recsplitloc */
	Relation	rel;			/* index relation */
	Page		page;			/* page undergoing split */
	IndexTuple	newitem;		/* new item (cause of page split) */
	Size		newitemsz;		/* size of newitem (includes line pointer) */
	bool		is_leaf;		/* T if splitting a leaf page */
	bool		is_rightmost;	/* T if splitting rightmost page on level */
	OffsetNumber newitemoff;	/* where the new item is to be inserted */
	int			leftspace;		/* space available for items on left page */
	int			rightspace;		/* space available for items on right page */
	int			olddataitemstotal;	/* space taken by old items */
	Size		minfirstrightsz;	/* smallest firstoldonright tuple size */

	/* candidate split point data */
	int			maxsplits;		/* maximum number of splits */
	int			nsplits;		/* current number of splits */
	SplitPoint *splits;			/* all candidate split points for page */
	int			interval;		/* current range of acceptable split points */
} FindSplitData;

static void _bt_recsplitloc(FindSplitData *state,
							OffsetNumber firstoldonright, bool newitemonleft,
							int olddataitemstoleft, Size firstoldonrightsz);
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
								bool usemult);
static int	_bt_splitcmp(const void *arg1, const void *arg2);
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
								int leaffillfactor, bool *usemult);
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
									 bool *newitemonleft, FindSplitStrat strategy);
static int	_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
						 SplitPoint *rightpage, FindSplitStrat *strategy);
static void _bt_interval_edges(FindSplitData *state,
							   SplitPoint **leftinterval, SplitPoint **rightinterval);
static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
											SplitPoint *split);
static inline IndexTuple _bt_split_firstright(FindSplitData *state,
											  SplitPoint *split);


/*
 *	_bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The main goal here is to equalize the free space that will be on each
 * split page, *after accounting for the inserted tuple*.  (If we fail to
 * account for it, we might find ourselves with too little room on the page
 * that it needs to go into!)
 *
 * If the page is the rightmost page on its level, we instead try to arrange
 * to leave the left split page fillfactor% full.  In this way, when we are
 * inserting successively increasing keys (consider sequences, timestamps,
 * etc) we will end up with a tree whose pages are about fillfactor% full,
 * instead of the 50% full result that we'd get without this special case.
 * This is the same as nbtsort.c produces for a newly-created tree.  Note
 * that leaf and nonleaf pages use different fillfactors.  Note also that
 * there are a number of further special cases where fillfactor is not
 * applied in the standard way.
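 *
 * For example, with the default leaf fillfactor of 90, splitting the
 * rightmost leaf page aims to leave the left half about 90% full, so a
 * load of serially increasing keys converges on roughly 90% space
 * utilization rather than 50%.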
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of (this could be
 * maxoff+1 if the tuple is to go at the end).  The new tuple itself is also
 * passed, since it's needed to give some weight to how effective suffix
 * truncation will be.  The implementation picks the split point that
 * maximizes the effectiveness of suffix truncation from a small list of
 * alternative candidate split points that leave each side of the split with
 * about the same share of free space.  Suffix truncation is secondary to
 * equalizing free space, except in cases with large numbers of duplicates.
 * Note that it is always assumed that caller goes on to perform truncation,
 * even with pg_upgrade'd indexes where that isn't actually the case
 * (!heapkeyspace indexes).  See nbtree/README for more information about
 * suffix truncation.
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page, plus a boolean indicating whether the new tuple goes on
 * the left or right page.  The bool is necessary to disambiguate the case
 * where firstright == newitemoff.
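 *
 * Sketch of caller usage (the actual caller is _bt_split() in nbtinsert.c;
 * variable names here are illustrative):
 *
 *		firstright = _bt_findsplitloc(rel, page, newitemoff, newitemsz,
 *									  newitem, &newitemonleft);
 *
 * Tuples at offsets >= firstright then go to the new right page, and
 * newitemonleft says which half receives the new tuple.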
 */
OffsetNumber
_bt_findsplitloc(Relation rel,
				 Page page,
				 OffsetNumber newitemoff,
				 Size newitemsz,
				 IndexTuple newitem,
				 bool *newitemonleft)
{
	BTPageOpaque opaque;
	int			leftspace,
				rightspace,
				olddataitemstotal,
				olddataitemstoleft,
				perfectpenalty,
				leaffillfactor;
	FindSplitData state;
	FindSplitStrat strategy;
	ItemId		itemid;
	OffsetNumber offnum,
				maxoff,
				foundfirstright;
	double		fillfactormult;
	bool		usemult;
	SplitPoint	leftpage,
				rightpage;

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Total free space available on a btree page, after fixed overhead */
	leftspace = rightspace =
		PageGetPageSize(page) - SizeOfPageHeaderData -
		MAXALIGN(sizeof(BTPageOpaqueData));

	/* The right page will have the same high key as the old page */
	if (!P_RIGHTMOST(opaque))
	{
		itemid = PageGetItemId(page, P_HIKEY);
		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
							 sizeof(ItemIdData));
	}

	/* Count up total space in data items before actually scanning 'em */
	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
	leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);
	state.rel = rel;
	state.page = page;
	state.newitem = newitem;
	state.newitemsz = newitemsz;
	state.is_leaf = P_ISLEAF(opaque);
	state.is_rightmost = P_RIGHTMOST(opaque);
	state.leftspace = leftspace;
	state.rightspace = rightspace;
	state.olddataitemstotal = olddataitemstotal;
	state.minfirstrightsz = SIZE_MAX;
	state.newitemoff = newitemoff;

	/*
	 * maxsplits should never exceed maxoff because there will be at most as
	 * many candidate split points as there are points _between_ tuples, once
	 * you imagine that the new item is already on the original page (the
	 * final number of splits may be slightly lower because not all points
	 * between tuples will be legal).
	 */
	state.maxsplits = maxoff;
	state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
	state.nsplits = 0;

	/*
	 * Scan through the data items and calculate space usage for a split at
	 * each possible position
	 */
	olddataitemstoleft = 0;

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		Size		itemsz;

		itemid = PageGetItemId(page, offnum);
		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		/*
		 * When item offset number is not newitemoff, neither side of the
		 * split can be newitem.  Record a split after the previous data item
		 * from original page, but before the current data item from original
		 * page.  (_bt_recsplitloc() will reject the split when there are no
		 * previous items, which we rely on.)
		 */
		if (offnum < newitemoff)
			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
		else if (offnum > newitemoff)
			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
		else
		{
			/*
			 * Record a split after all "offnum < newitemoff" original page
			 * data items, but before newitem
			 */
			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);

			/*
			 * Record a split after newitem, but before data item from
			 * original page at offset newitemoff/current offset
			 */
			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
		}

		olddataitemstoleft += itemsz;
	}

	/*
	 * Record a split after all original page data items, but before newitem.
	 * (Though only when it's possible that newitem will end up alone on new
	 * right page.)
	 */
	Assert(olddataitemstoleft == olddataitemstotal);
	if (newitemoff > maxoff)
		_bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);

	/*
	 * I believe it is not possible to fail to find a feasible split, but just
	 * in case ...
	 */
	if (state.nsplits == 0)
		elog(ERROR, "could not find a feasible split point for index \"%s\"",
			 RelationGetRelationName(rel));

	/*
	 * Start search for a split point among list of legal split points.  Give
	 * primary consideration to equalizing available free space in each half
	 * of the split initially (start with default strategy), while applying
	 * rightmost and split-after-new-item optimizations where appropriate.
	 * Either of the two other fallback strategies may be required for cases
	 * with a large number of duplicates around the original/space-optimal
	 * split point.
	 *
	 * Default strategy gives some weight to suffix truncation in deciding a
	 * split point on leaf pages.  It attempts to select a split point where a
	 * distinguishing attribute appears earlier in the new high key for the
	 * left side of the split, in order to maximize the number of trailing
	 * attributes that can be truncated away.  Only candidate split points
	 * that imply an acceptable balance of free space on each side are
	 * considered.
	 */
	if (!state.is_leaf)
	{
		/* fillfactormult only used on rightmost page */
		usemult = state.is_rightmost;
		fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
	}
	else if (state.is_rightmost)
	{
		/* Rightmost leaf page -- fillfactormult always used */
		usemult = true;
		fillfactormult = leaffillfactor / 100.0;
	}
	else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
	{
		/*
		 * New item inserted at rightmost point among a localized grouping on
		 * a leaf page -- apply "split after new item" optimization, either by
		 * applying leaf fillfactor multiplier, or by choosing the exact split
		 * point that leaves the new item as last on the left.  (usemult is
		 * set for us.)
		 */
		if (usemult)
		{
			/* fillfactormult should be set based on leaf fillfactor */
			fillfactormult = leaffillfactor / 100.0;
		}
		else
		{
			/* find precise split point after newitemoff */
			for (int i = 0; i < state.nsplits; i++)
			{
				SplitPoint *split = state.splits + i;

				if (split->newitemonleft &&
					newitemoff == split->firstoldonright)
				{
					pfree(state.splits);
					*newitemonleft = true;
					return newitemoff;
				}
			}

			/*
			 * Cannot legally split after newitemoff; proceed with split
			 * without using fillfactor multiplier.  This is defensive, and
			 * should never be needed in practice.
			 */
			fillfactormult = 0.50;
		}
	}
	else
	{
		/* Other leaf page.  50:50 page split. */
		usemult = false;
		/* fillfactormult not used, but be tidy */
		fillfactormult = 0.50;
	}

	/*
	 * Set an initial limit on the split interval/number of candidate split
	 * points as appropriate.  The "Prefix B-Trees" paper refers to this as
	 * sigma l for leaf splits and sigma b for internal ("branch") splits.
	 * It's hard to provide a theoretical justification for the initial size
	 * of the split interval, though it's clear that a small split interval
	 * makes suffix truncation much more effective without noticeably
	 * affecting space utilization over time.
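	 *
	 * For example (illustrative numbers), a leaf page with 50 candidate
	 * splits gets an interval of 2 (Max(1, 50 * 0.05) is 2.5, which is
	 * truncated on assignment), while one with 300 candidate splits is
	 * clamped to MAX_LEAF_INTERVAL, i.e. 9.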
	 */
	state.interval = Min(Max(1, state.nsplits * 0.05),
						 state.is_leaf ? MAX_LEAF_INTERVAL :
						 MAX_INTERNAL_INTERVAL);

	/*
	 * Save leftmost and rightmost splits for page before original ordinal
	 * sort order is lost by delta/fillfactormult sort
	 */
	leftpage = state.splits[0];
	rightpage = state.splits[state.nsplits - 1];

	/* Give split points a fillfactormult-wise delta, and sort on deltas */
	_bt_deltasortsplits(&state, fillfactormult, usemult);

	/*
	 * Determine if default strategy/split interval will produce a
	 * sufficiently distinguishing split, or if we should change strategies.
	 * Alternative strategies change the range of split points that are
	 * considered acceptable (split interval), and possibly change
	 * fillfactormult, in order to deal with pages with a large number of
	 * duplicates gracefully.
	 *
	 * Pass low and high splits for the entire page (actually, they're for an
	 * imaginary version of the page that includes newitem).  These are used
	 * when the initial split interval encloses split points that are full of
	 * duplicates, and we need to consider if it's even possible to avoid
	 * appending a heap TID.
	 */
	perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);

	if (strategy == SPLIT_DEFAULT)
	{
		/*
		 * Default strategy worked out (always works out with internal page).
		 * Original split interval still stands.
		 */
	}

	/*
	 * Many duplicates strategy is used when a heap TID would otherwise be
	 * appended, but the page isn't completely full of logical duplicates.
	 *
	 * The split interval is widened to include all legal candidate split
	 * points.  There might be as few as two distinct values in the whole-page
	 * split interval, though it's also possible that most of the values on
	 * the page are unique.  The final split point will either be to the
	 * immediate left or to the immediate right of the group of duplicate
	 * tuples that enclose the first/delta-optimal split point (perfect
	 * penalty was set so that the lowest delta split point that avoids
	 * appending a heap TID will be chosen).  Maximizing the number of
	 * attributes that can be truncated away is not a goal of the many
	 * duplicates strategy.
	 *
	 * Single value strategy is used when it is impossible to avoid appending
	 * a heap TID.  It arranges to leave the left page very full.  This
	 * maximizes space utilization in cases where tuples with the same
	 * attribute values span many pages.  Newly inserted duplicates will tend
	 * to have higher heap TID values, so we'll end up splitting to the right
	 * consistently.  (Single value strategy is harmless though not
	 * particularly useful with !heapkeyspace indexes.)
	 */
	else if (strategy == SPLIT_MANY_DUPLICATES)
	{
		Assert(state.is_leaf);
		/* Shouldn't try to truncate away extra user attributes */
		Assert(perfectpenalty ==
			   IndexRelationGetNumberOfKeyAttributes(state.rel));
		/* No need to resort splits -- no change in fillfactormult/deltas */
		state.interval = state.nsplits;
	}
	else if (strategy == SPLIT_SINGLE_VALUE)
	{
		Assert(state.is_leaf);
		/* Split near the end of the page */
		usemult = true;
		fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
		/* Resort split points with new delta */
		_bt_deltasortsplits(&state, fillfactormult, usemult);
		/* Appending a heap TID is unavoidable, so interval of 1 is fine */
		state.interval = 1;
	}

	/*
	 * Search among acceptable split points (using final split interval) for
	 * the entry that has the lowest penalty, and is therefore expected to
	 * maximize fan-out.  Sets *newitemonleft for us.
	 */
	foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
									   strategy);
	pfree(state.splits);

	return foundfirstright;
}

/*
 * Subroutine to record a particular point between two tuples (possibly the
 * new item) on page (ie, combination of firstright and newitemonleft
 * settings) in *state for later analysis.  This is also a convenient point
 * to check if the split is legal (if it isn't, it won't be recorded).
 *
 * firstoldonright is the offset of the first item on the original page that
 * goes to the right page, and firstoldonrightsz is the size of that tuple.
 * firstoldonright can be > max offset, which means that all the old items go
 * to the left page and only the new item goes to the right page.  In that
 * case, firstoldonrightsz is not used.
 *
 * olddataitemstoleft is the total size of all old items to the left of the
 * split point that is recorded here when legal.  Should not include
 * newitemsz, since that is handled here.
 */
static void
_bt_recsplitloc(FindSplitData *state,
				OffsetNumber firstoldonright,
				bool newitemonleft,
				int olddataitemstoleft,
				Size firstoldonrightsz)
{
	int16		leftfree,
				rightfree;
	Size		firstrightitemsz;
	bool		newitemisfirstonright;

	/* Is the new item going to be the first item on the right page? */
	newitemisfirstonright = (firstoldonright == state->newitemoff
							 && !newitemonleft);

	if (newitemisfirstonright)
		firstrightitemsz = state->newitemsz;
	else
		firstrightitemsz = firstoldonrightsz;

	/* Account for all the old tuples */
	leftfree = state->leftspace - olddataitemstoleft;
	rightfree = state->rightspace -
		(state->olddataitemstotal - olddataitemstoleft);

	/*
	 * The first item on the right page becomes the high key of the left
	 * page; therefore it counts against left space as well as right space
	 * (we cannot assume that suffix truncation will make it any smaller).
	 * When the index has included attributes, those attributes of the left
	 * page high key will be truncated, leaving that page with slightly more
	 * free space.  However, that shouldn't affect our ability to find a
	 * valid split location, since we err in the direction of being
	 * pessimistic about free space on the left half.  Besides, even when
	 * suffix truncation of non-TID attributes occurs, the new high key often
	 * won't even be a single MAXALIGN() quantum smaller than the firstright
	 * tuple it's based on.
	 *
	 * On the leaf level, assume that suffix truncation cannot avoid adding a
	 * heap TID to the left half's new high key.  In practice the new high
	 * key will often be smaller and will rarely be larger, but conservatively
	 * assume the worst case.
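	 *
	 * (On typical 64-bit platforms that heap TID allowance works out to
	 * MAXALIGN(6) = 8 bytes charged against the left half, since
	 * ItemPointerData is 6 bytes wide.)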
	 */
	if (state->is_leaf)
		leftfree -= (int16) (firstrightitemsz +
							 MAXALIGN(sizeof(ItemPointerData)));
	else
		leftfree -= (int16) firstrightitemsz;

	/* account for the new item */
	if (newitemonleft)
		leftfree -= (int16) state->newitemsz;
	else
		rightfree -= (int16) state->newitemsz;

	/*
	 * If we are not on the leaf level, we will be able to discard the key
	 * data from the first item that winds up on the right page.
	 */
	if (!state->is_leaf)
		rightfree += (int16) firstrightitemsz -
			(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));

	/* Record split if legal */
	if (leftfree >= 0 && rightfree >= 0)
	{
		Assert(state->nsplits < state->maxsplits);

		/* Determine smallest firstright item size on page */
		state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);

		state->splits[state->nsplits].curdelta = 0;
		state->splits[state->nsplits].leftfree = leftfree;
		state->splits[state->nsplits].rightfree = rightfree;
		state->splits[state->nsplits].firstoldonright = firstoldonright;
		state->splits[state->nsplits].newitemonleft = newitemonleft;
		state->nsplits++;
	}
}

/*
 * Subroutine to assign space deltas to materialized array of candidate split
 * points based on current fillfactor, and to sort array using that fillfactor
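 *
 * For example, with fillfactormult = 0.90 and a candidate split leaving
 * leftfree = 1000 and rightfree = 5000, the weighted delta is
 * |0.90 * 1000 - 0.10 * 5000| = 400, whereas the unweighted (50:50) delta
 * for the same split would be |1000 - 5000| = 4000.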
 */
static void
_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
					bool usemult)
{
	for (int i = 0; i < state->nsplits; i++)
	{
		SplitPoint *split = state->splits + i;
		int16		delta;

		if (usemult)
			delta = fillfactormult * split->leftfree -
				(1.0 - fillfactormult) * split->rightfree;
		else
			delta = split->leftfree - split->rightfree;

		if (delta < 0)
			delta = -delta;

		/* Save delta */
		split->curdelta = delta;
	}

	qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
}

/*
 * qsort-style comparator used by _bt_deltasortsplits()
 */
static int
_bt_splitcmp(const void *arg1, const void *arg2)
{
	SplitPoint *split1 = (SplitPoint *) arg1;
	SplitPoint *split2 = (SplitPoint *) arg2;

	if (split1->curdelta > split2->curdelta)
		return 1;
	if (split1->curdelta < split2->curdelta)
		return -1;

	return 0;
}

/*
 * Subroutine to determine whether or not a non-rightmost leaf page should be
 * split immediately after the would-be original page offset for the
 * new/incoming tuple (or should have leaf fillfactor applied when new item is
 * to the right on original page).  This is appropriate when there is a
 * pattern of localized monotonically increasing insertions into a composite
 * index, where leading attribute values form local groupings, and we
 * anticipate further insertions of the same/current grouping (new item's
 * grouping) in the near future.  This can be thought of as a variation on
 * applying leaf fillfactor during rightmost leaf page splits, since cases
 * that benefit will converge on packing leaf pages leaffillfactor% full over
 * time.
 *
 * We may leave extra free space remaining on the rightmost page of a "most
 * significant column" grouping of tuples if that grouping never ends up
 * having future insertions that use the free space.  That effect is
 * self-limiting; a future grouping that becomes the "nearest on the right"
 * grouping of the affected grouping usually puts the extra free space to good
 * use.
 *
 * Caller uses optimization when routine returns true, though the exact action
 * taken by caller varies.  Caller uses original leaf page fillfactor in
 * standard way rather than using the new item offset directly when *usemult
 * was also set to true here.  Otherwise, caller applies optimization by
 * locating the legal split point that makes the new tuple the very last tuple
 * on the left side of the split.
 */
static bool
_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
					int leaffillfactor, bool *usemult)
{
	int16		nkeyatts;
	ItemId		itemid;
	IndexTuple	tup;
	int			keepnatts;

	Assert(state->is_leaf && !state->is_rightmost);

	nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

	/* Single key indexes not considered here */
	if (nkeyatts == 1)
		return false;

	/* Ascending insertion pattern never inferred when new item is first */
	if (state->newitemoff == P_FIRSTKEY)
		return false;

	/*
	 * Only apply optimization on pages with equisized tuples, since ordinal
	 * keys are likely to be fixed-width.  Testing if the new tuple is
	 * variable width directly might also work, but that fails to apply the
	 * optimization to indexes with a numeric_ops attribute.
	 *
	 * Conclude that page has equisized tuples when the new item is the same
	 * width as the smallest item observed during pass over page, and other
	 * non-pivot tuples must be the same width as well.  (Note that the
	 * possibly-truncated existing high key isn't counted in
	 * olddataitemstotal, and must be subtracted from maxoff.)
	 */
	if (state->newitemsz != state->minfirstrightsz)
		return false;
	if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
		return false;

	/*
	 * Avoid applying optimization when tuples are wider than a tuple
	 * consisting of two non-NULL int8/int64 attributes (or four non-NULL
	 * int4/int32 attributes)
	 */
	if (state->newitemsz >
		MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
		sizeof(ItemIdData))
		return false;

	/*
	 * At least the first attribute's value must be equal to the corresponding
	 * value in previous tuple to apply optimization.  New item cannot be a
	 * duplicate, either.
	 *
	 * Handle case where new item is to the right of all items on the existing
	 * page.  This is suggestive of monotonically increasing insertions in
	 * itself, so the "heap TID adjacency" test is not applied here.
	 */
	if (state->newitemoff > maxoff)
	{
		itemid = PageGetItemId(state->page, maxoff);
		tup = (IndexTuple) PageGetItem(state->page, itemid);
		keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

		if (keepnatts > 1 && keepnatts <= nkeyatts)
		{
			*usemult = true;
			return true;
		}

		return false;
	}

	/*
	 * "Low cardinality leading column, high cardinality suffix column"
	 * indexes with a random insertion pattern (e.g., an index with a boolean
	 * column, such as an index on '(book_is_in_print, book_isbn)') present us
	 * with a risk of consistently misapplying the optimization.  We're
	 * willing to accept very occasional misapplication of the optimization,
	 * provided the cases where we get it wrong are rare and self-limiting.
	 *
	 * Heap TID adjacency strongly suggests that the item just to the left was
	 * inserted very recently, which limits overapplication of the
	 * optimization.  Besides, all inappropriate cases triggered here will
	 * still split in the middle of the page on average.
	 */
	itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
	tup = (IndexTuple) PageGetItem(state->page, itemid);
	/* Do cheaper test first */
	if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
		return false;
	/* Check same conditions as rightmost item case, too */
	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

	if (keepnatts > 1 && keepnatts <= nkeyatts)
	{
		double		interp = (double) state->newitemoff / ((double) maxoff + 1);
		double		leaffillfactormult = (double) leaffillfactor / 100.0;

		/*
		 * Don't allow caller to split after a new item when it will result in
		 * a split point to the right of the point that a leaf fillfactor
		 * split would use -- have caller apply leaf fillfactor instead
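		 * (e.g., with newitemoff = 80 and maxoff = 99, interp is
		 * 80/100 = 0.8; given the default leaf fillfactor of 90 that is
		 * below 0.9, so *usemult stays false, and caller splits directly
		 * after the new item)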
		 */
		*usemult = interp > leaffillfactormult;

		return true;
	}

	return false;
}

/*
 * Subroutine for determining if two heap TIDs are "adjacent".
 *
 * Adjacent means that the high TID is very likely to have been inserted into
 * heap relation immediately after the low TID, probably during the current
 * transaction.
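 *
 * For example, heap TIDs (42,7) and (42,9) are treated as adjacent (same
 * heap block), as are (42,7) and (43,1) (next block, first offset), but
 * (42,7) and (44,1) are not.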
 */
static bool
_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
{
	BlockNumber lowblk,
				highblk;

	lowblk = ItemPointerGetBlockNumber(lowhtid);
	highblk = ItemPointerGetBlockNumber(highhtid);

	/* Make optimistic assumption of adjacency when heap blocks match */
	if (lowblk == highblk)
		return true;

	/* When heap block one up, second offset should be FirstOffsetNumber */
	if (lowblk + 1 == highblk &&
		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
		return true;

	return false;
}

/*
 * Subroutine to find the "best" split point among candidate split points.
 * The best split point is the split point with the lowest penalty among split
 * points that fall within current/final split interval.  Penalty is an
 * abstract score, with a definition that varies depending on whether we're
 * splitting a leaf page or an internal page.  See _bt_split_penalty() for
 * details.
 *
 * "perfectpenalty" is assumed to be the lowest possible penalty among
 * candidate split points.  This allows us to return early without wasting
 * cycles on calculating the first differing attribute for all candidate
 * splits when that clearly cannot improve our choice (or when we only want a
 * minimally distinguishing split point, and don't want to make the split any
 * more unbalanced than is necessary).
 *
 * We return the index of the first existing tuple that should go on the right
 * page, plus a boolean indicating if new item is on left of split point.
 */
static OffsetNumber
_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
				 bool *newitemonleft, FindSplitStrat strategy)
{
	int			bestpenalty,
				lowsplit;
	int			highsplit = Min(state->interval, state->nsplits);
	SplitPoint *final;

	bestpenalty = INT_MAX;
	lowsplit = 0;
	for (int i = lowsplit; i < highsplit; i++)
	{
		int			penalty;

		penalty = _bt_split_penalty(state, state->splits + i);

		if (penalty <= perfectpenalty)
		{
			bestpenalty = penalty;
			lowsplit = i;
			break;
		}

		if (penalty < bestpenalty)
		{
			bestpenalty = penalty;
			lowsplit = i;
		}
	}

	final = &state->splits[lowsplit];

	/*
	 * There is a risk that the "many duplicates" strategy will repeatedly do
	 * the wrong thing when there are monotonically decreasing insertions to
	 * the right of a large group of duplicates.  Repeated splits could leave
	 * a succession of right half pages with free space that can never be
	 * used.  This must be avoided.
	 *
	 * Consider the example of the leftmost page in a single integer attribute
	 * NULLS FIRST index which is almost filled with NULLs.  Monotonically
	 * decreasing integer insertions might cause the same leftmost page to
	 * split repeatedly at the same point.  Each split derives its new high
	 * key from the lowest current value to the immediate right of the large
	 * group of NULLs, which will always be higher than all future integer
	 * insertions, directing all future integer insertions to the same
	 * leftmost page.
	 */
	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
		!final->newitemonleft && final->firstoldonright >= state->newitemoff &&
		final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL)
	{
		/*
		 * Avoid the problem by performing a 50:50 split when the new item is
		 * just to the right of the would-be "many duplicates" split point.
		 */
		final = &state->splits[0];
	}

	*newitemonleft = final->newitemonleft;
	return final->firstoldonright;
}

/*
 * Subroutine to decide whether split should use default strategy/initial
 * split interval, or whether it should finish splitting the page using
 * alternative strategies (this is only possible with leaf pages).
 *
 * Caller uses alternative strategy (or sticks with default strategy) based
 * on how *strategy is set here.  Return value is "perfect penalty", which is
 * passed to _bt_bestsplitloc() as a final constraint on how far caller is
 * willing to go to avoid appending a heap TID when using the many duplicates
 * strategy (it also saves _bt_bestsplitloc() useless cycles).
 */
static int
_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
			 SplitPoint *rightpage, FindSplitStrat *strategy)
{
	IndexTuple	leftmost,
				rightmost;
	SplitPoint *leftinterval,
			   *rightinterval;
	int			perfectpenalty;
	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

	/* Assume that alternative strategy won't be used for now */
	*strategy = SPLIT_DEFAULT;

	/*
	 * Use smallest observed first right item size for entire page as perfect
	 * penalty on internal pages.  This can save cycles in the common case
	 * where most or all splits (not just splits within interval) have first
	 * right tuples that are the same size.
	 */
	if (!state->is_leaf)
		return state->minfirstrightsz;

	/*
	 * Use leftmost and rightmost tuples from leftmost and rightmost splits in
	 * current split interval
	 */
	_bt_interval_edges(state, &leftinterval, &rightinterval);
	leftmost = _bt_split_lastleft(state, leftinterval);
	rightmost = _bt_split_firstright(state, rightinterval);

	/*
	 * If initial split interval can produce a split point that will at least
	 * avoid appending a heap TID in new high key, we're done.  Finish split
	 * with default strategy and initial split interval.
	 */
	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
	if (perfectpenalty <= indnkeyatts)
		return perfectpenalty;

	/*
	 * Work out how caller should finish split when even their "perfect"
	 * penalty for initial/default split interval indicates that the interval
	 * does not contain even a single split that avoids appending a heap TID.
	 *
	 * Use the leftmost split's lastleft tuple and the rightmost split's
	 * firstright tuple to assess every possible split.
	 */
	leftmost = _bt_split_lastleft(state, leftpage);
	rightmost = _bt_split_firstright(state, rightpage);

	/*
	 * If page (including new item) has many duplicates but is not entirely
	 * full of duplicates, a many duplicates strategy split will be performed.
	 * If page is entirely full of duplicates, a single value strategy split
	 * will be performed.
	 */
	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
	if (perfectpenalty <= indnkeyatts)
	{
		*strategy = SPLIT_MANY_DUPLICATES;

		/*
		 * Many duplicates strategy should split at either side of the group
		 * of duplicates that enclose the delta-optimal split point.  Return
		 * indnkeyatts rather than the true perfect penalty to make that
		 * happen.  (If perfectpenalty was returned here then low cardinality
		 * composite indexes could have continual unbalanced splits.)
		 *
		 * Note that caller won't go through with a many duplicates split in
		 * rare cases where it looks like there are ever-decreasing insertions
		 * to the immediate right of the split point.  This must happen just
		 * before a final decision is made, within _bt_bestsplitloc().
		 */
		return indnkeyatts;
	}

	/*
	 * Single value strategy is only appropriate with ever-increasing heap
	 * TIDs; otherwise, original default strategy split should proceed to
	 * avoid pathological performance.  Use page high key to infer if this is
	 * the rightmost page among pages that store the same duplicate value.
	 * This should not prevent insertions of heap TIDs that are slightly out
	 * of order from using single value strategy, since that's expected with
	 * concurrent inserters of the same duplicate value.
	 */
	else if (state->is_rightmost)
		*strategy = SPLIT_SINGLE_VALUE;
	else
	{
		ItemId		itemid;
		IndexTuple	hikey;

		itemid = PageGetItemId(state->page, P_HIKEY);
		hikey = (IndexTuple) PageGetItem(state->page, itemid);
		perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
											 state->newitem);
		if (perfectpenalty <= indnkeyatts)
			*strategy = SPLIT_SINGLE_VALUE;
		else
		{
			/*
			 * Have caller finish split using default strategy, since page
			 * does not appear to be the rightmost page for duplicates of the
			 * value the page is filled with
			 */
		}
	}

	return perfectpenalty;
}

/*
 * Subroutine to locate leftmost and rightmost splits for current/default
 * split interval.  Note that it will be the same split iff there is only one
 * split in interval.
 */
static void
_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
				   SplitPoint **rightinterval)
{
	int			highsplit = Min(state->interval, state->nsplits);
	SplitPoint *deltaoptimal;

	deltaoptimal = state->splits;
	*leftinterval = NULL;
	*rightinterval = NULL;

	/*
	 * Delta is an absolute distance to optimal split point, so both the
	 * leftmost and rightmost split point will usually be at the end of the
	 * array
	 */
	for (int i = highsplit - 1; i >= 0; i--)
	{
		SplitPoint *distant = state->splits + i;

		if (distant->firstoldonright < deltaoptimal->firstoldonright)
		{
			if (*leftinterval == NULL)
				*leftinterval = distant;
		}
		else if (distant->firstoldonright > deltaoptimal->firstoldonright)
		{
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}
		else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
		{
			/*
			 * "incoming tuple will become first on right page" (distant) is
			 * to the left of "incoming tuple will become last on left page"
			 * (delta-optimal)
			 */
			Assert(distant->firstoldonright == state->newitemoff);
			if (*leftinterval == NULL)
				*leftinterval = distant;
		}
		else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
		{
			/*
			 * "incoming tuple will become last on left page" (distant) is to
			 * the right of "incoming tuple will become first on right page"
			 * (delta-optimal)
			 */
			Assert(distant->firstoldonright == state->newitemoff);
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}
		else
		{
			/* There were only one or two splits in initial split interval */
			Assert(distant == deltaoptimal);
			if (*leftinterval == NULL)
				*leftinterval = distant;
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}

		if (*leftinterval && *rightinterval)
			return;
	}

	Assert(false);
}

/*
 * Subroutine to find penalty for caller's candidate split point.
 *
 * On leaf pages, penalty is the attribute number that distinguishes each side
 * of a split.  It's the last attribute that needs to be included in new high
 * key for left page.  It can be greater than the number of key attributes in
 * cases where a heap TID will need to be appended during truncation.
 *
 * On internal pages, penalty is simply the size of the first item on the
 * right half of the split (including line pointer overhead).  This tuple will
 * become the new high key for the left page.
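 *
 * For example, on a leaf page in an index with two key attributes, a split
 * whose last left tuple is (5, 'aaa') and whose first right tuple is
 * (6, 'zzz') has penalty 1, since only the first attribute is needed in the
 * new high key; a split between two entirely equal tuples has penalty 3
 * (nkeyatts + 1), implying that a heap TID must be appended.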
 */
static inline int
_bt_split_penalty(FindSplitData *state, SplitPoint *split)
{
	IndexTuple	lastleftuple;
	IndexTuple	firstrighttuple;

	if (!state->is_leaf)
	{
		ItemId		itemid;

		if (!split->newitemonleft &&
			split->firstoldonright == state->newitemoff)
			return state->newitemsz;

		itemid = PageGetItemId(state->page, split->firstoldonright);

		return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
	}

	lastleftuple = _bt_split_lastleft(state, split);
	firstrighttuple = _bt_split_firstright(state, split);

	Assert(lastleftuple != firstrighttuple);
	return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
}

/*
 * Subroutine to get a lastleft IndexTuple for a split point from page
 */
static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
	ItemId		itemid;

	if (split->newitemonleft && split->firstoldonright == state->newitemoff)
		return state->newitem;

	itemid = PageGetItemId(state->page,
						   OffsetNumberPrev(split->firstoldonright));
	return (IndexTuple) PageGetItem(state->page, itemid);
}

/*
 * Subroutine to get a firstright IndexTuple for a split point from page
 */
static inline IndexTuple
_bt_split_firstright(FindSplitData *state, SplitPoint *split)
{
	ItemId		itemid;

	if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
		return state->newitem;

	itemid = PageGetItemId(state->page, split->firstoldonright);
	return (IndexTuple) PageGetItem(state->page, itemid);
}