/*-------------------------------------------------------------------------
 *
 * nbtsplitloc.c
 *	  Choose split point code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsplitloc.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "storage/lmgr.h"

typedef enum
{
    /* strategy for searching through materialized list of split points */
    SPLIT_DEFAULT,              /* give some weight to truncation */
    SPLIT_MANY_DUPLICATES,      /* find minimally distinguishing point */
    SPLIT_SINGLE_VALUE          /* leave left page almost full */
} FindSplitStrat;

typedef struct
{
    /* details of free space left by split */
    int16       curdelta;       /* current leftfree/rightfree delta */
    int16       leftfree;       /* space left on left page post-split */
    int16       rightfree;      /* space left on right page post-split */

    /* split point identifying fields (returned by _bt_findsplitloc) */
    OffsetNumber firstrightoff; /* first origpage item on rightpage */
    bool        newitemonleft;  /* new item goes on left, or right? */
} SplitPoint;

typedef struct
{
    /* context data for _bt_recsplitloc */
    Relation    rel;            /* index relation */
    Page        origpage;       /* page undergoing split */
    IndexTuple  newitem;        /* new item (cause of page split) */
    Size        newitemsz;      /* size of newitem (includes line pointer) */
    bool        is_leaf;        /* T if splitting a leaf page */
    bool        is_rightmost;   /* T if splitting rightmost page on level */
    OffsetNumber newitemoff;    /* where the new item is to be inserted */
    int         leftspace;      /* space available for items on left page */
    int         rightspace;     /* space available for items on right page */
    int         olddataitemstotal;  /* space taken by old items */
    Size        minfirstrightsz;    /* smallest firstright size */

    /* candidate split point data */
    int         maxsplits;      /* maximum number of splits */
    int         nsplits;        /* current number of splits */
    SplitPoint *splits;         /* all candidate split points for page */
    int         interval;       /* current range of acceptable split points */
} FindSplitData;

static void _bt_recsplitloc(FindSplitData *state,
                            OffsetNumber firstrightoff, bool newitemonleft,
                            int olddataitemstoleft,
                            Size firstrightofforigpagetuplesz);
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
                                bool usemult);
static int  _bt_splitcmp(const void *arg1, const void *arg2);
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
                                int leaffillfactor, bool *usemult);
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
                                     bool *newitemonleft,
                                     FindSplitStrat strategy);
static int  _bt_defaultinterval(FindSplitData *state);
static int  _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
                         SplitPoint *rightpage, FindSplitStrat *strategy);
static void _bt_interval_edges(FindSplitData *state,
                               SplitPoint **leftinterval,
                               SplitPoint **rightinterval);
static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
                                            SplitPoint *split);
static inline IndexTuple _bt_split_firstright(FindSplitData *state,
                                              SplitPoint *split);


/*
 * _bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The main goal here is to equalize the free space that will be on each
 * split page, *after accounting for the inserted tuple*.  (If we fail to
 * account for it, we might find ourselves with too little room on the page
 * that it needs to go into!)
 *
 * If the page is the rightmost page on its level, we instead try to arrange
 * to leave the left split page fillfactor% full.  In this way, when we are
 * inserting successively increasing keys (consider sequences, timestamps,
 * etc) we will end up with a tree whose pages are about fillfactor% full,
 * instead of the 50% full result that we'd get without this special case.
 * This is the same as nbtsort.c produces for a newly-created tree.  Note
 * that leaf and nonleaf pages use different fillfactors.  Note also that
 * there are a number of further special cases where fillfactor is not
 * applied in the standard way.
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of (this could be
 * maxoff+1 if the tuple is to go at the end).  The new tuple itself is also
 * passed, since it's needed to give some weight to how effective suffix
 * truncation will be.  The implementation picks the split point that
 * maximizes the effectiveness of suffix truncation from a small list of
 * alternative candidate split points that leave each side of the split with
 * about the same share of free space.  Suffix truncation is secondary to
 * equalizing free space, except in cases with large numbers of duplicates.
 * Note that it is always assumed that caller goes on to perform truncation,
 * even with pg_upgrade'd indexes where that isn't actually the case
 * (!heapkeyspace indexes).  See nbtree/README for more information about
 * suffix truncation.
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page (which is called firstrightoff), plus a boolean
 * indicating whether the new tuple goes on the left or right page.  You
 * can think of the returned state as a point _between_ two adjacent data
 * items (lastleft and firstright data items) on an imaginary version of
 * origpage that already includes newitem.  The bool is necessary to
 * disambiguate the case where firstrightoff == newitemoff (i.e. it is
 * sometimes needed to determine if the firstright tuple for the split is
 * newitem rather than the tuple from origpage at offset firstrightoff).
 */
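/*
 * For example (an illustrative sketch of the return convention, not a case
 * taken from the code below): if origpage has data items at offsets 1..10
 * and newitemoff is 5, then firstrightoff = 5 with *newitemonleft = true
 * means that items 1..4 plus newitem go to the left page, with newitem as
 * lastleft; firstrightoff = 5 with *newitemonleft = false instead makes
 * newitem the firstright tuple, followed on the right page by items 5..10.
 * A caller would use the routine roughly like this:
 *
 *      firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff,
 *                                       newitemsz, newitem, &newitemonleft);
 */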
OffsetNumber
_bt_findsplitloc(Relation rel,
                 Page origpage,
                 OffsetNumber newitemoff,
                 Size newitemsz,
                 IndexTuple newitem,
                 bool *newitemonleft)
{
    BTPageOpaque opaque;
    int         leftspace,
                rightspace,
                olddataitemstotal,
                olddataitemstoleft,
                perfectpenalty,
                leaffillfactor;
    FindSplitData state;
    FindSplitStrat strategy;
    ItemId      itemid;
    OffsetNumber offnum,
                maxoff,
                firstrightoff;
    double      fillfactormult;
    bool        usemult;
    SplitPoint  leftpage,
                rightpage;

    opaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
    maxoff = PageGetMaxOffsetNumber(origpage);

    /* Total free space available on a btree page, after fixed overhead */
    leftspace = rightspace =
        PageGetPageSize(origpage) - SizeOfPageHeaderData -
        MAXALIGN(sizeof(BTPageOpaqueData));

    /* The right page will have the same high key as the old page */
    if (!P_RIGHTMOST(opaque))
    {
        itemid = PageGetItemId(origpage, P_HIKEY);
        rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
                             sizeof(ItemIdData));
    }

    /* Count up total space in data items before actually scanning 'em */
    olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage);
    leaffillfactor = BTGetFillFactor(rel);

    /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
    newitemsz += sizeof(ItemIdData);
    state.rel = rel;
    state.origpage = origpage;
    state.newitem = newitem;
    state.newitemsz = newitemsz;
    state.is_leaf = P_ISLEAF(opaque);
    state.is_rightmost = P_RIGHTMOST(opaque);
    state.leftspace = leftspace;
    state.rightspace = rightspace;
    state.olddataitemstotal = olddataitemstotal;
    state.minfirstrightsz = SIZE_MAX;
    state.newitemoff = newitemoff;

    /* newitem cannot be a posting list item */
    Assert(!BTreeTupleIsPosting(newitem));

    /*
     * maxsplits should never exceed maxoff because there will be at most as
     * many candidate split points as there are points _between_ tuples, once
     * you imagine that the new item is already on the original page (the
     * final number of splits may be slightly lower because not all points
     * between tuples will be legal).
     */
    state.maxsplits = maxoff;
    state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
    state.nsplits = 0;

    /*
     * Scan through the data items and calculate space usage for a split at
     * each possible position
     */
    olddataitemstoleft = 0;

    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        Size        itemsz;

        itemid = PageGetItemId(origpage, offnum);
        itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

        /*
         * When item offset number is not newitemoff, neither side of the
         * split can be newitem.  Record a split after the previous data item
         * from original page, but before the current data item from original
         * page.  (_bt_recsplitloc() will reject the split when there are no
         * previous items, which we rely on.)
         */
        if (offnum < newitemoff)
            _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
        else if (offnum > newitemoff)
            _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
        else
        {
            /*
             * Record a split after all "offnum < newitemoff" original page
             * data items, but before newitem
             */
            _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);

            /*
             * Record a split after newitem, but before data item from
             * original page at offset newitemoff/current offset
             */
            _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
        }

        olddataitemstoleft += itemsz;
    }

    /*
     * Record a split after all original page data items, but before newitem.
     * (Though only when it's possible that newitem will end up alone on new
     * right page.)
     */
    Assert(olddataitemstoleft == olddataitemstotal);
    if (newitemoff > maxoff)
        _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);

    /*
     * I believe it is not possible to fail to find a feasible split, but
     * just in case ...
     */
    if (state.nsplits == 0)
        elog(ERROR, "could not find a feasible split point for index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Start search for a split point among list of legal split points.  Give
     * primary consideration to equalizing available free space in each half
     * of the split initially (start with default strategy), while applying
     * rightmost and split-after-new-item optimizations where appropriate.
     * Either of the two other fallback strategies may be required for cases
     * with a large number of duplicates around the original/space-optimal
     * split point.
     *
     * Default strategy gives some weight to suffix truncation in deciding a
     * split point on leaf pages.  It attempts to select a split point where
     * a distinguishing attribute appears earlier in the new high key for the
     * left side of the split, in order to maximize the number of trailing
     * attributes that can be truncated away.  Only candidate split points
     * that imply an acceptable balance of free space on each side are
     * considered.  See _bt_defaultinterval().
     */
    if (!state.is_leaf)
    {
        /* fillfactormult only used on rightmost page */
        usemult = state.is_rightmost;
        fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
    }
    else if (state.is_rightmost)
    {
        /* Rightmost leaf page -- fillfactormult always used */
        usemult = true;
        fillfactormult = leaffillfactor / 100.0;
    }
    else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
    {
        /*
         * New item inserted at rightmost point among a localized grouping on
         * a leaf page -- apply "split after new item" optimization, either
         * by applying leaf fillfactor multiplier, or by choosing the exact
         * split point that leaves newitem as lastleft.  (usemult is set for
         * us.)
         */
        if (usemult)
        {
            /* fillfactormult should be set based on leaf fillfactor */
            fillfactormult = leaffillfactor / 100.0;
        }
        else
        {
            /* find precise split point after newitemoff */
            for (int i = 0; i < state.nsplits; i++)
            {
                SplitPoint *split = state.splits + i;

                if (split->newitemonleft &&
                    newitemoff == split->firstrightoff)
                {
                    pfree(state.splits);
                    *newitemonleft = true;
                    return newitemoff;
                }
            }

            /*
             * Cannot legally split after newitemoff; proceed with split
             * without using fillfactor multiplier.  This is defensive, and
             * should never be needed in practice.
             */
            fillfactormult = 0.50;
        }
    }
    else
    {
        /* Other leaf page.  50:50 page split. */
        usemult = false;
        /* fillfactormult not used, but be tidy */
        fillfactormult = 0.50;
    }

    /*
     * Save leftmost and rightmost splits for page before original ordinal
     * sort order is lost by delta/fillfactormult sort
     */
    leftpage = state.splits[0];
    rightpage = state.splits[state.nsplits - 1];

    /* Give split points a fillfactormult-wise delta, and sort on deltas */
    _bt_deltasortsplits(&state, fillfactormult, usemult);

    /* Determine split interval for default strategy */
    state.interval = _bt_defaultinterval(&state);

    /*
     * Determine if default strategy/split interval will produce a
     * sufficiently distinguishing split, or if we should change strategies.
     * Alternative strategies change the range of split points that are
     * considered acceptable (split interval), and possibly change
     * fillfactormult, in order to deal with pages with a large number of
     * duplicates gracefully.
     *
     * Pass low and high splits for the entire page (actually, they're for
     * an imaginary version of the page that includes newitem).  These are
     * used when the initial split interval encloses split points that are
     * full of duplicates, and we need to consider if it's even possible to
     * avoid appending a heap TID.
     */
    perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);

    if (strategy == SPLIT_DEFAULT)
    {
        /*
         * Default strategy worked out (always works out with internal page).
         * Original split interval still stands.
         */
    }

    /*
     * Many duplicates strategy is used when a heap TID would otherwise be
     * appended, but the page isn't completely full of logical duplicates.
     *
     * The split interval is widened to include all legal candidate split
     * points.  There might be as few as two distinct values in the
     * whole-page split interval, though it's also possible that most of the
     * values on the page are unique.  The final split point will either be
     * to the immediate left or to the immediate right of the group of
     * duplicate tuples that enclose the first/delta-optimal split point
     * (perfect penalty was set so that the lowest delta split point that
     * avoids appending a heap TID will be chosen).  Maximizing the number
     * of attributes that can be truncated away is not a goal of the many
     * duplicates strategy.
     *
     * Single value strategy is used when it is impossible to avoid
     * appending a heap TID.  It arranges to leave the left page very full.
     * This maximizes space utilization in cases where tuples with the same
     * attribute values span many pages.  Newly inserted duplicates will
     * tend to have higher heap TID values, so we'll end up splitting to the
     * right consistently.  (Single value strategy is harmless though not
     * particularly useful with !heapkeyspace indexes.)
     */
    else if (strategy == SPLIT_MANY_DUPLICATES)
    {
        Assert(state.is_leaf);
        /* Shouldn't try to truncate away extra user attributes */
        Assert(perfectpenalty ==
               IndexRelationGetNumberOfKeyAttributes(state.rel));
        /* No need to resort splits -- no change in fillfactormult/deltas */
        state.interval = state.nsplits;
    }
    else if (strategy == SPLIT_SINGLE_VALUE)
    {
        Assert(state.is_leaf);
        /* Split near the end of the page */
        usemult = true;
        fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
        /* Resort split points with new delta */
        _bt_deltasortsplits(&state, fillfactormult, usemult);
        /* Appending a heap TID is unavoidable, so interval of 1 is fine */
        state.interval = 1;
    }

    /*
     * Search among acceptable split points (using final split interval) for
     * the entry that has the lowest penalty, and is therefore expected to
     * maximize fan-out.  Sets *newitemonleft for us.
     */
    firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
                                     strategy);
    pfree(state.splits);

    return firstrightoff;
}

/*
 * Subroutine to record a particular point between two tuples (possibly the
 * new item) on page (ie, combination of firstrightoff and newitemonleft
 * settings) in *state for later analysis.  This is also a convenient point
 * to check if the split is legal (if it isn't, it won't be recorded).
 *
 * firstrightoff is the offset of the first item on the original page that
 * goes to the right page, and firstrightofforigpagetuplesz is the size of
 * that tuple.  firstrightoff can be > max offset, which means that all the
 * old items go to the left page and only the new item goes to the right
 * page.  We don't actually use firstrightofforigpagetuplesz in that case
 * (actually, we don't use it for _any_ split where the firstright tuple
 * happens to be newitem).
 *
 * olddataitemstoleft is the total size of all old items to the left of the
 * split point that is recorded here when legal.  Should not include
 * newitemsz, since that is handled here.
 */
static void
_bt_recsplitloc(FindSplitData *state,
                OffsetNumber firstrightoff,
                bool newitemonleft,
                int olddataitemstoleft,
                Size firstrightofforigpagetuplesz)
{
    int16       leftfree,
                rightfree;
    Size        firstrightsz;
    Size        postingsz = 0;
    bool        newitemisfirstright;

    /* Is the new item going to be split point's firstright tuple? */
    newitemisfirstright = (firstrightoff == state->newitemoff &&
                           !newitemonleft);

    if (newitemisfirstright)
        firstrightsz = state->newitemsz;
    else
    {
        firstrightsz = firstrightofforigpagetuplesz;

        /*
         * Calculate suffix truncation space saving when firstright tuple is
         * a posting list tuple, though only when the tuple is over 64 bytes
         * including line pointer overhead (arbitrary).  This avoids
         * accessing the tuple in cases where its posting list must be very
         * small (if tuple has one at all).
         *
         * Note: We don't do this in the case where firstright tuple is
         * newitem, since newitem cannot have a posting list.
         */
        if (state->is_leaf && firstrightsz > 64)
        {
            ItemId      itemid;
            IndexTuple  newhighkey;

            itemid = PageGetItemId(state->origpage, firstrightoff);
            newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid);

            if (BTreeTupleIsPosting(newhighkey))
                postingsz = IndexTupleSize(newhighkey) -
                    BTreeTupleGetPostingOffset(newhighkey);
        }
    }

    /* Account for all the old tuples */
    leftfree = state->leftspace - olddataitemstoleft;
    rightfree = state->rightspace -
        (state->olddataitemstotal - olddataitemstoleft);

    /*
     * The first item on the right page becomes the high key of the left
     * page; therefore it counts against left space as well as right space
     * (we cannot assume that suffix truncation will make it any smaller).
     * When the index has included attributes, those attributes of the left
     * page high key will be truncated, leaving that page with slightly more
     * free space.  However, that shouldn't affect our ability to find a
     * valid split location, since we err in the direction of being
     * pessimistic about free space on the left half.  Besides, even when
     * suffix truncation of non-TID attributes occurs, the new high key
     * often won't even be a single MAXALIGN() quantum smaller than the
     * firstright tuple it's based on.
     *
     * If we are on the leaf level, assume that suffix truncation cannot
     * avoid adding a heap TID to the left half's new high key.  In practice
     * the new high key will often be smaller and will rarely be larger, but
     * conservatively assume the worst case.  We do go to the trouble of
     * subtracting away posting list overhead, though only when it looks
     * like it will make an appreciable difference.  (Posting lists are the
     * only case where truncation will typically make the final high key far
     * smaller than firstright, so being a bit more precise there noticeably
     * improves the balance of free space.)
     */
    if (state->is_leaf)
        leftfree -= (int16) (firstrightsz +
                             MAXALIGN(sizeof(ItemPointerData)) -
                             postingsz);
    else
        leftfree -= (int16) firstrightsz;

    /* account for the new item */
    if (newitemonleft)
        leftfree -= (int16) state->newitemsz;
    else
        rightfree -= (int16) state->newitemsz;

    /*
     * If we are not on the leaf level, we will be able to discard the key
     * data from the first item that winds up on the right page.
     */
    if (!state->is_leaf)
        rightfree += (int16) firstrightsz -
            (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));
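
    /*
     * (The key is discardable because the first data item on an internal
     * page is treated as "minus infinity": its key is never examined by
     * searches, so all that must remain is a bare IndexTupleData header
     * plus its line pointer, which is what the adjustment above assumes.)
     */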

    /* Record split if legal */
    if (leftfree >= 0 && rightfree >= 0)
    {
        Assert(state->nsplits < state->maxsplits);

        /* Determine smallest firstright tuple size among legal splits */
        state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz);

        state->splits[state->nsplits].curdelta = 0;
        state->splits[state->nsplits].leftfree = leftfree;
        state->splits[state->nsplits].rightfree = rightfree;
        state->splits[state->nsplits].firstrightoff = firstrightoff;
        state->splits[state->nsplits].newitemonleft = newitemonleft;
        state->nsplits++;
    }
}

/*
 * Subroutine to assign space deltas to materialized array of candidate split
 * points based on current fillfactor, and to sort array using that
 * fillfactor
 */
static void
_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
                    bool usemult)
{
    for (int i = 0; i < state->nsplits; i++)
    {
        SplitPoint *split = state->splits + i;
        int16       delta;

        if (usemult)
            delta = fillfactormult * split->leftfree -
                (1.0 - fillfactormult) * split->rightfree;
        else
            delta = split->leftfree - split->rightfree;

        if (delta < 0)
            delta = -delta;

        /* Save delta */
        split->curdelta = delta;
    }

    qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
}
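
/*
 * Worked example (illustrative figures, not derived from any particular
 * page): with usemult and fillfactormult = 0.90, a candidate split with
 * leftfree = 100 and rightfree = 900 gets delta |0.90 * 100 - 0.10 * 900|
 * = 0, and so sorts ahead of an even split with leftfree = rightfree = 500,
 * whose delta is |0.90 * 500 - 0.10 * 500| = 400.  Without usemult the
 * ordering reverses, since deltas are then simply |leftfree - rightfree|
 * (800 versus 0 here).
 */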

/*
 * qsort-style comparator used by _bt_deltasortsplits()
 */
static int
_bt_splitcmp(const void *arg1, const void *arg2)
{
    SplitPoint *split1 = (SplitPoint *) arg1;
    SplitPoint *split2 = (SplitPoint *) arg2;

    if (split1->curdelta > split2->curdelta)
        return 1;
    if (split1->curdelta < split2->curdelta)
        return -1;

    return 0;
}

/*
 * Subroutine to determine whether or not a non-rightmost leaf page should
 * be split immediately after the would-be original page offset for the
 * new/incoming tuple (or should have leaf fillfactor applied when new item
 * is to the right on original page).  This is appropriate when there is a
 * pattern of localized monotonically increasing insertions into a composite
 * index, where leading attribute values form local groupings, and we
 * anticipate further insertions of the same/current grouping (new item's
 * grouping) in the near future.  This can be thought of as a variation on
 * applying leaf fillfactor during rightmost leaf page splits, since cases
 * that benefit will converge on packing leaf pages leaffillfactor% full
 * over time.
 *
 * We may leave extra free space remaining on the rightmost page of a "most
 * significant column" grouping of tuples if that grouping never ends up
 * having future insertions that use the free space.  That effect is
 * self-limiting; a future grouping that becomes the "nearest on the right"
 * grouping of the affected grouping usually puts the extra free space to
 * good use.
 *
 * Caller uses optimization when routine returns true, though the exact
 * action taken by caller varies.  Caller uses original leaf page fillfactor
 * in standard way rather than using the new item offset directly when
 * *usemult was also set to true here.  Otherwise, caller applies
 * optimization by locating the legal split point that makes the new tuple
 * the lastleft tuple for the split.
 */
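/*
 * For example (an illustrative scenario, not taken from the code itself):
 * in an index on (order_id, line_number) whose rows are inserted in
 * line_number order within each order, a page may hold all of order 5000's
 * items so far plus items from later orders.  Splitting immediately after
 * the incoming order-5000 tuple moves the later orders to the right page,
 * leaving the left page's free space available for order 5000's further
 * line items.
 */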
static bool
_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
                    int leaffillfactor, bool *usemult)
{
    int16       nkeyatts;
    ItemId      itemid;
    IndexTuple  tup;
    int         keepnatts;

    Assert(state->is_leaf && !state->is_rightmost);

    nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

    /* Single key indexes not considered here */
    if (nkeyatts == 1)
        return false;

    /* Ascending insertion pattern never inferred when new item is first */
    if (state->newitemoff == P_FIRSTKEY)
        return false;

    /*
     * Only apply optimization on pages with equisized tuples, since ordinal
     * keys are likely to be fixed-width.  Testing if the new tuple is
     * variable width directly might also work, but that fails to apply the
     * optimization to indexes with a numeric_ops attribute.
     *
     * Conclude that page has equisized tuples when the new item is the same
     * width as the smallest item observed during pass over page, and other
     * non-pivot tuples must be the same width as well.  (Note that the
     * possibly-truncated existing high key isn't counted in
     * olddataitemstotal, and must be subtracted from maxoff.)
     */
    if (state->newitemsz != state->minfirstrightsz)
        return false;
    if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
        return false;

    /*
     * Avoid applying optimization when tuples are wider than a tuple
     * consisting of two non-NULL int8/int64 attributes (or four non-NULL
     * int4/int32 attributes)
     */
    if (state->newitemsz >
        MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
        sizeof(ItemIdData))
        return false;
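
    /*
     * (On a typical 64-bit platform this threshold works out to
     * MAXALIGN(8 + 16) + 4 = 28 bytes, assuming 8-byte maximum alignment,
     * an 8-byte IndexTupleData header, and 4-byte line pointers.)
     */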

    /*
     * At least the first attribute's value must be equal to the
     * corresponding value in previous tuple to apply optimization.  New
     * item cannot be a duplicate, either.
     *
     * Handle case where new item is to the right of all items on the
     * existing page.  This is suggestive of monotonically increasing
     * insertions in itself, so the "heap TID adjacency" test is not applied
     * here.
     */
    if (state->newitemoff > maxoff)
    {
        itemid = PageGetItemId(state->origpage, maxoff);
        tup = (IndexTuple) PageGetItem(state->origpage, itemid);
        keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

        if (keepnatts > 1 && keepnatts <= nkeyatts)
        {
            *usemult = true;
            return true;
        }

        return false;
    }

    /*
     * "Low cardinality leading column, high cardinality suffix column"
     * indexes with a random insertion pattern (e.g., an index with a
     * boolean column, such as an index on '(book_is_in_print, book_isbn)')
     * present us with a risk of consistently misapplying the optimization.
     * We're willing to accept very occasional misapplication of the
     * optimization, provided the cases where we get it wrong are rare and
     * self-limiting.
     *
     * Heap TID adjacency strongly suggests that the item just to the left
     * was inserted very recently, which limits overapplication of the
     * optimization.  Besides, all inappropriate cases triggered here will
     * still split in the middle of the page on average.
     */
    itemid = PageGetItemId(state->origpage,
                           OffsetNumberPrev(state->newitemoff));
    tup = (IndexTuple) PageGetItem(state->origpage, itemid);
    /* Do cheaper test first */
    if (BTreeTupleIsPosting(tup) ||
        !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
        return false;
    /* Check same conditions as rightmost item case, too */
    keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

    if (keepnatts > 1 && keepnatts <= nkeyatts)
    {
        double      interp = (double) state->newitemoff / ((double) maxoff + 1);
        double      leaffillfactormult = (double) leaffillfactor / 100.0;

        /*
         * Don't allow caller to split after a new item when it will result
         * in a split point to the right of the point that a leaf fillfactor
         * split would use -- have caller apply leaf fillfactor instead
         */
        *usemult = interp > leaffillfactormult;

        return true;
    }

    return false;
}

/*
 * Subroutine for determining if two heap TIDs are "adjacent".
 *
 * Adjacent means that the high TID is very likely to have been inserted
 * into heap relation immediately after the low TID, probably during the
 * current transaction.
 */
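/*
 * For example, (17,5) and (17,6) are treated as adjacent (same heap block,
 * as are any two TIDs in block 17), and so are (17,anything) and (18,1)
 * (next heap block, first offset), while (17,5) and (18,2) are not.
 */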
static bool
_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
{
    BlockNumber lowblk,
                highblk;

    lowblk = ItemPointerGetBlockNumber(lowhtid);
    highblk = ItemPointerGetBlockNumber(highhtid);

    /* Make optimistic assumption of adjacency when heap blocks match */
    if (lowblk == highblk)
        return true;

    /* When heap block one up, second offset should be FirstOffsetNumber */
    if (lowblk + 1 == highblk &&
        ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
        return true;

    return false;
}

/*
 * Subroutine to find the "best" split point among candidate split points.
 * The best split point is the split point with the lowest penalty among
 * split points that fall within current/final split interval.  Penalty is
 * an abstract score, with a definition that varies depending on whether
 * we're splitting a leaf page or an internal page.  See _bt_split_penalty()
 * for details.
 *
 * "perfectpenalty" is assumed to be the lowest possible penalty among
 * candidate split points.  This allows us to return early without wasting
 * cycles on calculating the first differing attribute for all candidate
 * splits when that clearly cannot improve our choice (or when we only want
 * a minimally distinguishing split point, and don't want to make the split
 * any more unbalanced than is necessary).
 *
 * We return the index of the first existing tuple that should go on the
 * right page, plus a boolean indicating if new item is on left of split
 * point.
 */
static OffsetNumber
_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
                 bool *newitemonleft, FindSplitStrat strategy)
{
    int         bestpenalty,
                lowsplit;
    int         highsplit = Min(state->interval, state->nsplits);
    SplitPoint *final;

    bestpenalty = INT_MAX;
    lowsplit = 0;
    for (int i = lowsplit; i < highsplit; i++)
    {
        int         penalty;

        penalty = _bt_split_penalty(state, state->splits + i);

        if (penalty < bestpenalty)
        {
            bestpenalty = penalty;
            lowsplit = i;
        }

        if (penalty <= perfectpenalty)
            break;
    }

    final = &state->splits[lowsplit];

    /*
     * There is a risk that the "many duplicates" strategy will repeatedly
     * do the wrong thing when there are monotonically decreasing insertions
     * to the right of a large group of duplicates.  Repeated splits could
     * leave a succession of right half pages with free space that can never
     * be used.  This must be avoided.
     *
     * Consider the example of the leftmost page in a single integer
     * attribute NULLS FIRST index which is almost filled with NULLs.
     * Monotonically decreasing integer insertions might cause the same
     * leftmost page to split repeatedly at the same point.  Each split
     * derives its new high key from the lowest current value to the
     * immediate right of the large group of NULLs, which will always be
     * higher than all future integer insertions, directing all future
     * integer insertions to the same leftmost page.
     */
    if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
        !final->newitemonleft && final->firstrightoff >= state->newitemoff &&
        final->firstrightoff < state->newitemoff + 9)
    {
        /*
         * Avoid the problem by performing a 50:50 split when the new item
         * is just to the right of the would-be "many duplicates" split
         * point.  (Note that the test used for an insert that is "just to
         * the right" of the split point is conservative.)
         */
        final = &state->splits[0];
    }

    *newitemonleft = final->newitemonleft;
    return final->firstrightoff;
}

#define LEAF_SPLIT_DISTANCE         0.050
#define INTERNAL_SPLIT_DISTANCE     0.075

/*
 * Return a split interval to use for the default strategy.  This is a limit
 * on the number of candidate split points to give further consideration to.
 * Only a fraction of all candidate split points (those located at the start
 * of the now-sorted splits array) fall within the split interval.  Split
 * interval is applied within _bt_bestsplitloc().
 *
 * Split interval represents an acceptable range of split points -- those
 * that have leftfree and rightfree values that are acceptably balanced.
 * The final split point chosen is the split point with the lowest "penalty"
 * among split points in this split interval (unless we change our entire
 * strategy, in which case the interval also changes -- see _bt_strategy()).
 *
 * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits,
 * and sigma b for internal ("branch") splits.  It's hard to provide a
 * theoretical justification for the size of the split interval, though it's
 * clear that a small split interval can make tuples on level L+1 much
 * smaller on average, without noticeably affecting space utilization on
 * level L.  (Note that the way that we calculate split interval might need
 * to change if suffix truncation is taught to truncate tuples "within" the
 * last attribute/datum for data types like text, which is more or less how
 * it is assumed to work in the paper.)
 */
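/*
 * Worked example (illustrative figures): on a leaf page whose data items
 * total 8,000 bytes, LEAF_SPLIT_DISTANCE gives a tolerance of 400 bytes.
 * Any candidate split whose leftfree and rightfree values both lie within
 * 400 bytes of the space-optimal split's values falls inside the interval;
 * the first candidate (in sorted delta order) that falls outside bounds
 * the interval.
 */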
static int
_bt_defaultinterval(FindSplitData *state)
{
    SplitPoint *spaceoptimal;
    int16       tolerance,
                lowleftfree,
                lowrightfree,
                highleftfree,
                highrightfree;

    /*
     * Determine leftfree and rightfree values that are higher and lower
     * than we're willing to tolerate.  Note that the final split interval
     * will be about 10% of nsplits in the common case where all non-pivot
     * tuples (data items) from a leaf page are uniformly sized.  We're a
     * bit more aggressive when splitting internal pages.
     */
    if (state->is_leaf)
        tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE;
    else
        tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE;

    /* First candidate split point is the most evenly balanced */
    spaceoptimal = state->splits;
    lowleftfree = spaceoptimal->leftfree - tolerance;
    lowrightfree = spaceoptimal->rightfree - tolerance;
    highleftfree = spaceoptimal->leftfree + tolerance;
    highrightfree = spaceoptimal->rightfree + tolerance;

    /*
     * Iterate through split points, starting from the split immediately
     * after 'spaceoptimal'.  Find the first split point that divides free
     * space so unevenly that including it in the split interval would be
     * unacceptable.
     */
    for (int i = 1; i < state->nsplits; i++)
    {
        SplitPoint *split = state->splits + i;

        /* Cannot use curdelta here, since its value is often weighted */
        if (split->leftfree < lowleftfree || split->rightfree < lowrightfree ||
            split->leftfree > highleftfree || split->rightfree > highrightfree)
            return i;
    }

    return state->nsplits;
}

/*
 * Subroutine to decide whether split should use default strategy/initial
 * split interval, or whether it should finish splitting the page using
 * alternative strategies (this is only possible with leaf pages).
 *
 * Caller uses alternative strategy (or sticks with default strategy) based
 * on how *strategy is set here.  Return value is "perfect penalty", which
 * is passed to _bt_bestsplitloc() as a final constraint on how far caller
 * is willing to go to avoid appending a heap TID when using the many
 * duplicates strategy (it also saves _bt_bestsplitloc() useless cycles).
 */
static int
_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
             SplitPoint *rightpage, FindSplitStrat *strategy)
{
    IndexTuple  leftmost,
                rightmost;
    SplitPoint *leftinterval,
               *rightinterval;
    int         perfectpenalty;
    int         indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

    /* Assume that alternative strategy won't be used for now */
    *strategy = SPLIT_DEFAULT;

    /*
     * Use smallest observed firstright item size for entire page (actually,
     * entire imaginary version of page that includes newitem) as perfect
     * penalty on internal pages.  This can save cycles in the common case
     * where most or all splits (not just splits within interval) have
     * firstright tuples that are the same size.
     */
    if (!state->is_leaf)
        return state->minfirstrightsz;

    /*
     * Use leftmost and rightmost tuples from leftmost and rightmost splits
     * in current split interval
     */
    _bt_interval_edges(state, &leftinterval, &rightinterval);
    leftmost = _bt_split_lastleft(state, leftinterval);
    rightmost = _bt_split_firstright(state, rightinterval);

    /*
     * If initial split interval can produce a split point that will at
     * least avoid appending a heap TID in new high key, we're done.  Finish
     * split with default strategy and initial split interval.
     */
    perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
    if (perfectpenalty <= indnkeyatts)
        return perfectpenalty;

    /*
     * Work out how caller should finish split when even their "perfect"
     * penalty for initial/default split interval indicates that the
     * interval does not contain even a single split that avoids appending a
     * heap TID.
     *
     * Use the leftmost split's lastleft tuple and the rightmost split's
     * firstright tuple to assess every possible split.
     */
    leftmost = _bt_split_lastleft(state, leftpage);
    rightmost = _bt_split_firstright(state, rightpage);

    /*
     * If page (including new item) has many duplicates but is not entirely
     * full of duplicates, a many duplicates strategy split will be
     * performed.  If page is entirely full of duplicates, a single value
     * strategy split will be performed.
     */
    perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
    if (perfectpenalty <= indnkeyatts)
    {
        *strategy = SPLIT_MANY_DUPLICATES;

        /*
         * Many duplicates strategy should split at either side of the group
         * of duplicates that enclose the delta-optimal split point.  Return
         * indnkeyatts rather than the true perfect penalty to make that
         * happen.  (If perfectpenalty was returned here then low cardinality
         * composite indexes could have continual unbalanced splits.)
         *
         * Note that caller won't go through with a many duplicates split in
         * rare cases where it looks like there are ever-decreasing
         * insertions to the immediate right of the split point.  This must
         * happen just before a final decision is made, within
         * _bt_bestsplitloc().
         */
        return indnkeyatts;
    }

    /*
     * Single value strategy is only appropriate with ever-increasing heap
     * TIDs; otherwise, original default strategy split should proceed to
     * avoid pathological performance.  Use page high key to infer if this
     * is the rightmost page among pages that store the same duplicate
     * value.  This should not prevent insertions of heap TIDs that are
     * slightly out of order from using single value strategy, since that's
     * expected with concurrent inserters of the same duplicate value.
     */
    else if (state->is_rightmost)
        *strategy = SPLIT_SINGLE_VALUE;
    else
    {
        ItemId      itemid;
        IndexTuple  hikey;

        itemid = PageGetItemId(state->origpage, P_HIKEY);
        hikey = (IndexTuple) PageGetItem(state->origpage, itemid);
        perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
                                             state->newitem);
        if (perfectpenalty <= indnkeyatts)
            *strategy = SPLIT_SINGLE_VALUE;
        else
        {
            /*
             * Have caller finish split using default strategy, since page
             * does not appear to be the rightmost page for duplicates of
             * the value the page is filled with
             */
        }
    }

    return perfectpenalty;
}

/*
 * Subroutine to locate leftmost and rightmost splits for current/default
 * split interval.  Note that it will be the same split iff there is only
 * one split in interval.
 */
static void
_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
                   SplitPoint **rightinterval)
{
    int         highsplit = Min(state->interval, state->nsplits);
    SplitPoint *deltaoptimal;

    deltaoptimal = state->splits;
    *leftinterval = NULL;
    *rightinterval = NULL;

    /*
     * Delta is an absolute distance to optimal split point, so both the
     * leftmost and rightmost split point will usually be at the end of the
     * array
     */
    for (int i = highsplit - 1; i >= 0; i--)
    {
        SplitPoint *distant = state->splits + i;

        if (distant->firstrightoff < deltaoptimal->firstrightoff)
        {
            if (*leftinterval == NULL)
                *leftinterval = distant;
        }
        else if (distant->firstrightoff > deltaoptimal->firstrightoff)
        {
            if (*rightinterval == NULL)
                *rightinterval = distant;
        }
        else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
        {
            /*
             * "incoming tuple will become firstright" (distant) is to the
             * left of "incoming tuple will become lastleft" (delta-optimal)
             */
            Assert(distant->firstrightoff == state->newitemoff);
            if (*leftinterval == NULL)
                *leftinterval = distant;
        }
        else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
        {
            /*
             * "incoming tuple will become lastleft" (distant) is to the
             * right of "incoming tuple will become firstright"
             * (delta-optimal)
             */
            Assert(distant->firstrightoff == state->newitemoff);
            if (*rightinterval == NULL)
                *rightinterval = distant;
        }
        else
        {
            /* There were only one or two splits in initial split interval */
            Assert(distant == deltaoptimal);
            if (*leftinterval == NULL)
                *leftinterval = distant;
            if (*rightinterval == NULL)
                *rightinterval = distant;
        }

        if (*leftinterval && *rightinterval)
            return;
    }

    Assert(false);
}

/*
 * Subroutine to find penalty for caller's candidate split point.
 *
 * On leaf pages, penalty is the attribute number that distinguishes each
 * side of a split.  It's the last attribute that needs to be included in
 * new high key for left page.  It can be greater than the number of key
 * attributes in cases where a heap TID will need to be appended during
 * truncation.
 *
 * On internal pages, penalty is simply the size of the firstright tuple
 * for the split (including line pointer overhead).  This tuple will become
 * the new high key for the left page.
 */
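/*
 * For example (illustrative, leaf case): in an index on (country, state),
 * a split whose lastleft is ('usa', 'new york') and whose firstright is
 * ('usa', 'ohio') has penalty 2, since the second attribute is what
 * distinguishes the two sides; an alternative split where the tuples
 * already differ on country would have penalty 1, and so be preferred.
 */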
static inline int
_bt_split_penalty(FindSplitData *state, SplitPoint *split)
{
    IndexTuple  lastleft;
    IndexTuple  firstright;

    if (!state->is_leaf)
    {
        ItemId      itemid;

        if (!split->newitemonleft &&
            split->firstrightoff == state->newitemoff)
            return state->newitemsz;

        itemid = PageGetItemId(state->origpage, split->firstrightoff);

        return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
    }

    lastleft = _bt_split_lastleft(state, split);
    firstright = _bt_split_firstright(state, split);

    return _bt_keep_natts_fast(state->rel, lastleft, firstright);
}

/*
 * Subroutine to get a lastleft IndexTuple for a split point
 */
static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
    ItemId      itemid;

    if (split->newitemonleft && split->firstrightoff == state->newitemoff)
        return state->newitem;

    itemid = PageGetItemId(state->origpage,
                           OffsetNumberPrev(split->firstrightoff));
    return (IndexTuple) PageGetItem(state->origpage, itemid);
}

/*
 * Subroutine to get a firstright IndexTuple for a split point
 */
static inline IndexTuple
_bt_split_firstright(FindSplitData *state, SplitPoint *split)
{
    ItemId      itemid;

    if (!split->newitemonleft && split->firstrightoff == state->newitemoff)
        return state->newitem;

    itemid = PageGetItemId(state->origpage, split->firstrightoff);
    return (IndexTuple) PageGetItem(state->origpage, itemid);
}