/*-------------------------------------------------------------------------
 *
 * nbtsplitloc.c
 *	  Choose split point code for Postgres btree implementation.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtsplitloc.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "storage/lmgr.h"

/* limits on split interval (default strategy only) */
#define MAX_LEAF_INTERVAL		9
#define MAX_INTERNAL_INTERVAL	18

typedef enum
{
	/* strategy for searching through materialized list of split points */
	SPLIT_DEFAULT,				/* give some weight to truncation */
	SPLIT_MANY_DUPLICATES,		/* find minimally distinguishing point */
	SPLIT_SINGLE_VALUE			/* leave left page almost full */
} FindSplitStrat;

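/*
 * A SplitPoint describes one candidate split.  For example (an assumed
 * illustration): firstoldonright = 5 with newitemonleft = false says that
 * original page items before offset 5 stay on the left page, while the item
 * at offset 5, everything after it, and the incoming new tuple all go to
 * the right page.
 */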
typedef struct
{
	/* details of free space left by split */
	int16		curdelta;		/* current leftfree/rightfree delta */
	int16		leftfree;		/* space left on left page post-split */
	int16		rightfree;		/* space left on right page post-split */

	/* split point identifying fields (returned by _bt_findsplitloc) */
	OffsetNumber firstoldonright;	/* first item on new right page */
	bool		newitemonleft;	/* new item goes on left, or right? */

} SplitPoint;

typedef struct
{
	/* context data for _bt_recsplitloc */
	Relation	rel;			/* index relation */
	Page		page;			/* page undergoing split */
	IndexTuple	newitem;		/* new item (cause of page split) */
	Size		newitemsz;		/* size of newitem (includes line pointer) */
	bool		is_leaf;		/* T if splitting a leaf page */
	bool		is_rightmost;	/* T if splitting rightmost page on level */
	OffsetNumber newitemoff;	/* where the new item is to be inserted */
	int			leftspace;		/* space available for items on left page */
	int			rightspace;		/* space available for items on right page */
	int			olddataitemstotal;	/* space taken by old items */
	Size		minfirstrightsz;	/* smallest firstoldonright tuple size */

	/* candidate split point data */
	int			maxsplits;		/* maximum number of splits */
	int			nsplits;		/* current number of splits */
	SplitPoint *splits;			/* all candidate split points for page */
	int			interval;		/* current range of acceptable split points */
} FindSplitData;

static void _bt_recsplitloc(FindSplitData *state,
							OffsetNumber firstoldonright, bool newitemonleft,
							int olddataitemstoleft, Size firstoldonrightsz);
static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult,
								bool usemult);
static int	_bt_splitcmp(const void *arg1, const void *arg2);
static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
								int leaffillfactor, bool *usemult);
static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
									 bool *newitemonleft,
									 FindSplitStrat strategy);
static int	_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
						 SplitPoint *rightpage, FindSplitStrat *strategy);
static void _bt_interval_edges(FindSplitData *state,
							   SplitPoint **leftinterval,
							   SplitPoint **rightinterval);
static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split);
static inline IndexTuple _bt_split_lastleft(FindSplitData *state,
											SplitPoint *split);
static inline IndexTuple _bt_split_firstright(FindSplitData *state,
											  SplitPoint *split);

/*
 * _bt_findsplitloc() -- find an appropriate place to split a page.
 *
 * The main goal here is to equalize the free space that will be on each
 * split page, *after accounting for the inserted tuple*.  (If we fail to
 * account for it, we might find ourselves with too little room on the page
 * that it needs to go into!)
 *
 * If the page is the rightmost page on its level, we instead try to arrange
 * to leave the left split page fillfactor% full.  In this way, when we are
 * inserting successively increasing keys (consider sequences, timestamps,
 * etc) we will end up with a tree whose pages are about fillfactor% full,
 * instead of the 50% full result that we'd get without this special case.
 * This is the same as nbtsort.c produces for a newly-created tree.  Note
 * that leaf and nonleaf pages use different fillfactors.  Note also that
 * there are a number of further special cases where fillfactor is not
 * applied in the standard way.
 *
 * We are passed the intended insert position of the new tuple, expressed as
 * the offsetnumber of the tuple it must go in front of (this could be
 * maxoff+1 if the tuple is to go at the end).  The new tuple itself is also
 * passed, since it's needed to give some weight to how effective suffix
 * truncation will be.  The implementation picks the split point that
 * maximizes the effectiveness of suffix truncation from a small list of
 * alternative candidate split points that leave each side of the split with
 * about the same share of free space.  Suffix truncation is secondary to
 * equalizing free space, except in cases with large numbers of duplicates.
 * Note that it is always assumed that caller goes on to perform truncation,
 * even with pg_upgrade'd indexes where that isn't actually the case
 * (!heapkeyspace indexes).  See nbtree/README for more information about
 * suffix truncation.
 *
 * We return the index of the first existing tuple that should go on the
 * righthand page, plus a boolean indicating whether the new tuple goes on
 * the left or right page.  The bool is necessary to disambiguate the case
 * where firstright == newitemoff.
 */
OffsetNumber
_bt_findsplitloc(Relation rel,
				 Page page,
				 OffsetNumber newitemoff,
				 Size newitemsz,
				 IndexTuple newitem,
				 bool *newitemonleft)
{
	BTPageOpaque opaque;
	int			leftspace,
				rightspace,
				olddataitemstotal,
				olddataitemstoleft,
				perfectpenalty,
				leaffillfactor;
	FindSplitData state;
	FindSplitStrat strategy;
	ItemId		itemid;
	OffsetNumber offnum,
				maxoff,
				foundfirstright;
	double		fillfactormult;
	bool		usemult;
	SplitPoint	leftpage,
				rightpage;

	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	/* Total free space available on a btree page, after fixed overhead */
	leftspace = rightspace =
		PageGetPageSize(page) - SizeOfPageHeaderData -
		MAXALIGN(sizeof(BTPageOpaqueData));

	/* The right page will have the same high key as the old page */
	if (!P_RIGHTMOST(opaque))
	{
		itemid = PageGetItemId(page, P_HIKEY);
		rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) +
							 sizeof(ItemIdData));
	}

	/* Count up total space in data items before actually scanning 'em */
	olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(page);
	leaffillfactor = RelationGetFillFactor(rel, BTREE_DEFAULT_FILLFACTOR);

	/* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
	newitemsz += sizeof(ItemIdData);
	state.rel = rel;
	state.page = page;
	state.newitem = newitem;
	state.newitemsz = newitemsz;
	state.is_leaf = P_ISLEAF(opaque);
	state.is_rightmost = P_RIGHTMOST(opaque);
	state.leftspace = leftspace;
	state.rightspace = rightspace;
	state.olddataitemstotal = olddataitemstotal;
	state.minfirstrightsz = SIZE_MAX;
	state.newitemoff = newitemoff;

	/*
	 * maxsplits should never exceed maxoff because there will be at most as
	 * many candidate split points as there are points _between_ tuples, once
	 * you imagine that the new item is already on the original page (the
	 * final number of splits may be slightly lower because not all points
	 * between tuples will be legal).
	 */
	state.maxsplits = maxoff;
	state.splits = palloc(sizeof(SplitPoint) * state.maxsplits);
	state.nsplits = 0;
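
	/*
	 * For instance (an assumed illustration): with maxoff = 3 old items, the
	 * imaginary page that also contains the new item holds 4 tuples, leaving
	 * 3 points _between_ tuples -- so no more than maxoff = 3 candidate
	 * splits can ever be recorded below.
	 */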

	/*
	 * Scan through the data items and calculate space usage for a split at
	 * each possible position
	 */
	olddataitemstoleft = 0;

	for (offnum = P_FIRSTDATAKEY(opaque);
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		Size		itemsz;

		itemid = PageGetItemId(page, offnum);
		itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);

		/*
		 * When item offset number is not newitemoff, neither side of the
		 * split can be newitem.  Record a split after the previous data item
		 * from original page, but before the current data item from original
		 * page.  (_bt_recsplitloc() will reject the split when there are no
		 * previous items, which we rely on.)
		 */
		if (offnum < newitemoff)
			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);
		else if (offnum > newitemoff)
			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
		else
		{
			/*
			 * Record a split after all "offnum < newitemoff" original page
			 * data items, but before newitem
			 */
			_bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz);

			/*
			 * Record a split after newitem, but before data item from
			 * original page at offset newitemoff/current offset
			 */
			_bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz);
		}

		olddataitemstoleft += itemsz;
	}

	/*
	 * Record a split after all original page data items, but before newitem.
	 * (Though only when it's possible that newitem will end up alone on new
	 * right page.)
	 */
	Assert(olddataitemstoleft == olddataitemstotal);
	if (newitemoff > maxoff)
		_bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0);

	/*
	 * I believe it is not possible to fail to find a feasible split, but
	 * just in case ...
	 */
	if (state.nsplits == 0)
		elog(ERROR, "could not find a feasible split point for index \"%s\"",
			 RelationGetRelationName(rel));

	/*
	 * Start search for a split point among list of legal split points.  Give
	 * primary consideration to equalizing available free space in each half
	 * of the split initially (start with default strategy), while applying
	 * rightmost and split-after-new-item optimizations where appropriate.
	 * Either of the two other fallback strategies may be required for cases
	 * with a large number of duplicates around the original/space-optimal
	 * split point.
	 *
	 * Default strategy gives some weight to suffix truncation in deciding a
	 * split point on leaf pages.  It attempts to select a split point where
	 * a distinguishing attribute appears earlier in the new high key for the
	 * left side of the split, in order to maximize the number of trailing
	 * attributes that can be truncated away.  Only candidate split points
	 * that imply an acceptable balance of free space on each side are
	 * considered.
	 */
	if (!state.is_leaf)
	{
		/* fillfactormult only used on rightmost page */
		usemult = state.is_rightmost;
		fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0;
	}
	else if (state.is_rightmost)
	{
		/* Rightmost leaf page -- fillfactormult always used */
		usemult = true;
		fillfactormult = leaffillfactor / 100.0;
	}
	else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult))
	{
		/*
		 * New item inserted at rightmost point among a localized grouping on
		 * a leaf page -- apply "split after new item" optimization, either
		 * by applying leaf fillfactor multiplier, or by choosing the exact
		 * split point that leaves the new item as last on the left.
		 * (usemult is set for us.)
		 */
		if (usemult)
		{
			/* fillfactormult should be set based on leaf fillfactor */
			fillfactormult = leaffillfactor / 100.0;
		}
		else
		{
			/* find precise split point after newitemoff */
			for (int i = 0; i < state.nsplits; i++)
			{
				SplitPoint *split = state.splits + i;

				if (split->newitemonleft &&
					newitemoff == split->firstoldonright)
				{
					pfree(state.splits);
					*newitemonleft = true;
					return newitemoff;
				}
			}

			/*
			 * Cannot legally split after newitemoff; proceed with split
			 * without using fillfactor multiplier.  This is defensive, and
			 * should never be needed in practice.
			 */
			fillfactormult = 0.50;
		}
	}
	else
	{
		/* Other leaf page.  50:50 page split. */
		usemult = false;
		/* fillfactormult not used, but be tidy */
		fillfactormult = 0.50;
	}
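
	/*
	 * At this point (assumed figures for illustration): with the stock leaf
	 * fillfactor of 90, a rightmost leaf split uses fillfactormult = 0.90,
	 * leaving the left half about 90% full, while non-rightmost leaf splits
	 * that take neither optimization fall back to an even 0.50 split.
	 */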

	/*
	 * Set an initial limit on the split interval/number of candidate split
	 * points as appropriate.  The "Prefix B-Trees" paper refers to this as
	 * sigma l for leaf splits and sigma b for internal ("branch") splits.
	 * It's hard to provide a theoretical justification for the initial size
	 * of the split interval, though it's clear that a small split interval
	 * makes suffix truncation much more effective without noticeably
	 * affecting space utilization over time.
	 */
	state.interval = Min(Max(1, state.nsplits * 0.05),
						 state.is_leaf ? MAX_LEAF_INTERVAL :
						 MAX_INTERNAL_INTERVAL);
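
	/*
	 * Worked example (assumed numbers): with 200 recorded leaf splits the
	 * interval is Min(Max(1, 10), 9) = 9; with only 10 recorded splits it
	 * is Min(Max(1, 0.5), 9) = 1.
	 */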

	/*
	 * Save leftmost and rightmost splits for page before original ordinal
	 * sort order is lost by delta/fillfactormult sort
	 */
	leftpage = state.splits[0];
	rightpage = state.splits[state.nsplits - 1];

	/* Give split points a fillfactormult-wise delta, and sort on deltas */
	_bt_deltasortsplits(&state, fillfactormult, usemult);

	/*
	 * Determine if default strategy/split interval will produce a
	 * sufficiently distinguishing split, or if we should change strategies.
	 * Alternative strategies change the range of split points that are
	 * considered acceptable (split interval), and possibly change
	 * fillfactormult, in order to deal with pages with a large number of
	 * duplicates gracefully.
	 *
	 * Pass low and high splits for the entire page (actually, they're for an
	 * imaginary version of the page that includes newitem).  These are used
	 * when the initial split interval encloses split points that are full of
	 * duplicates, and we need to consider if it's even possible to avoid
	 * appending a heap TID.
	 */
	perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);

	if (strategy == SPLIT_DEFAULT)
	{
		/*
		 * Default strategy worked out (always works out with internal page).
		 * Original split interval still stands.
		 */
	}

	/*
	 * Many duplicates strategy is used when a heap TID would otherwise be
	 * appended, but the page isn't completely full of logical duplicates.
	 *
	 * The split interval is widened to include all legal candidate split
	 * points.  There might be as few as two distinct values in the
	 * whole-page split interval, though it's also possible that most of the
	 * values on the page are unique.  The final split point will either be
	 * to the immediate left or to the immediate right of the group of
	 * duplicate tuples that enclose the first/delta-optimal split point
	 * (perfect penalty was set so that the lowest delta split point that
	 * avoids appending a heap TID will be chosen).  Maximizing the number of
	 * attributes that can be truncated away is not a goal of the many
	 * duplicates strategy.
	 *
	 * Single value strategy is used when it is impossible to avoid appending
	 * a heap TID.  It arranges to leave the left page very full.  This
	 * maximizes space utilization in cases where tuples with the same
	 * attribute values span many pages.  Newly inserted duplicates will tend
	 * to have higher heap TID values, so we'll end up splitting to the right
	 * consistently.  (Single value strategy is harmless though not
	 * particularly useful with !heapkeyspace indexes.)
	 */
	else if (strategy == SPLIT_MANY_DUPLICATES)
	{
		Assert(state.is_leaf);
		/* Shouldn't try to truncate away extra user attributes */
		Assert(perfectpenalty ==
			   IndexRelationGetNumberOfKeyAttributes(state.rel));
		/* No need to resort splits -- no change in fillfactormult/deltas */
		state.interval = state.nsplits;
	}
	else if (strategy == SPLIT_SINGLE_VALUE)
	{
		Assert(state.is_leaf);
		/* Split near the end of the page */
		usemult = true;
		fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0;
		/* Resort split points with new delta */
		_bt_deltasortsplits(&state, fillfactormult, usemult);
		/* Appending a heap TID is unavoidable, so interval of 1 is fine */
		state.interval = 1;
	}

	/*
	 * Search among acceptable split points (using final split interval) for
	 * the entry that has the lowest penalty, and is therefore expected to
	 * maximize fan-out.  Sets *newitemonleft for us.
	 */
	foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
									   strategy);
	pfree(state.splits);

	return foundfirstright;
}
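
/*
 * A minimal usage sketch (hypothetical caller, shown only for illustration;
 * the real caller lives in the nbtree page split code):
 *
 *		bool		newitemonleft;
 *		OffsetNumber firstright;
 *
 *		firstright = _bt_findsplitloc(rel, page, newitemoff, itemsz,
 *									  newitem, &newitemonleft);
 *
 * Items at offsets >= firstright (plus newitem, when !newitemonleft) would
 * then be moved to the new right sibling page.
 */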

/*
 * Subroutine to record a particular point between two tuples (possibly the
 * new item) on page (ie, combination of firstright and newitemonleft
 * settings) in *state for later analysis.  This is also a convenient point
 * to check if the split is legal (if it isn't, it won't be recorded).
 *
 * firstoldonright is the offset of the first item on the original page that
 * goes to the right page, and firstoldonrightsz is the size of that tuple.
 * firstoldonright can be > max offset, which means that all the old items go
 * to the left page and only the new item goes to the right page.  In that
 * case, firstoldonrightsz is not used.
 *
 * olddataitemstoleft is the total size of all old items to the left of the
 * split point that is recorded here when legal.  Should not include
 * newitemsz, since that is handled here.
 */
static void
_bt_recsplitloc(FindSplitData *state,
				OffsetNumber firstoldonright,
				bool newitemonleft,
				int olddataitemstoleft,
				Size firstoldonrightsz)
{
	int16		leftfree,
				rightfree;
	Size		firstrightitemsz;
	bool		newitemisfirstonright;

	/* Is the new item going to be the first item on the right page? */
	newitemisfirstonright = (firstoldonright == state->newitemoff
							 && !newitemonleft);

	if (newitemisfirstonright)
		firstrightitemsz = state->newitemsz;
	else
		firstrightitemsz = firstoldonrightsz;

	/* Account for all the old tuples */
	leftfree = state->leftspace - olddataitemstoleft;
	rightfree = state->rightspace -
		(state->olddataitemstotal - olddataitemstoleft);

	/*
	 * The first item on the right page becomes the high key of the left
	 * page; therefore it counts against left space as well as right space
	 * (we cannot assume that suffix truncation will make it any smaller).
	 * When index has included attributes, then those attributes of left page
	 * high key will be truncated leaving that page with slightly more free
	 * space.  However, that shouldn't affect our ability to find valid split
	 * location, since we err in the direction of being pessimistic about
	 * free space on the left half.  Besides, even when suffix truncation of
	 * non-TID attributes occurs, the new high key often won't even be a
	 * single MAXALIGN() quantum smaller than the firstright tuple it's based
	 * on.
	 *
	 * If we are on the leaf level, assume that suffix truncation cannot
	 * avoid adding a heap TID to the left half's new high key when splitting
	 * at the leaf level.  In practice the new high key will often be smaller
	 * and will rarely be larger, but conservatively assume the worst case.
	 */
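	/*
	 * (On a typical 64-bit platform -- an assumed figure -- the heap TID
	 * allowance below is MAXALIGN(6) = 8 bytes on top of firstrightitemsz.)
	 */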
	if (state->is_leaf)
		leftfree -= (int16) (firstrightitemsz +
							 MAXALIGN(sizeof(ItemPointerData)));
	else
		leftfree -= (int16) firstrightitemsz;

	/* account for the new item */
	if (newitemonleft)
		leftfree -= (int16) state->newitemsz;
	else
		rightfree -= (int16) state->newitemsz;

	/*
	 * If we are not on the leaf level, we will be able to discard the key
	 * data from the first item that winds up on the right page.
	 */
	if (!state->is_leaf)
		rightfree += (int16) firstrightitemsz -
			(int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData));

	/* Record split if legal */
	if (leftfree >= 0 && rightfree >= 0)
	{
		Assert(state->nsplits < state->maxsplits);

		/* Determine smallest firstright item size on page */
		state->minfirstrightsz = Min(state->minfirstrightsz, firstrightitemsz);

		state->splits[state->nsplits].curdelta = 0;
		state->splits[state->nsplits].leftfree = leftfree;
		state->splits[state->nsplits].rightfree = rightfree;
		state->splits[state->nsplits].firstoldonright = firstoldonright;
		state->splits[state->nsplits].newitemonleft = newitemonleft;
		state->nsplits++;
	}
}

/*
 * Subroutine to assign space deltas to materialized array of candidate split
 * points based on current fillfactor, and to sort array using that fillfactor
 */
static void
_bt_deltasortsplits(FindSplitData *state, double fillfactormult,
					bool usemult)
{
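	/*
	 * Worked example (assumed figures): with usemult and fillfactormult =
	 * 0.90, a candidate leaving leftfree = 800 and rightfree = 7200 scores
	 * delta |0.90 * 800 - 0.10 * 7200| = 0 (ideal), while an even split
	 * with leftfree = rightfree = 4000 scores |0.90 * 4000 - 0.10 * 4000| =
	 * 3200.
	 */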
	for (int i = 0; i < state->nsplits; i++)
	{
		SplitPoint *split = state->splits + i;
		int16		delta;

		if (usemult)
			delta = fillfactormult * split->leftfree -
				(1.0 - fillfactormult) * split->rightfree;
		else
			delta = split->leftfree - split->rightfree;

		if (delta < 0)
			delta = -delta;

		/* Save delta */
		split->curdelta = delta;
	}

	qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp);
}

/*
 * qsort-style comparator used by _bt_deltasortsplits()
 */
static int
_bt_splitcmp(const void *arg1, const void *arg2)
{
	SplitPoint *split1 = (SplitPoint *) arg1;
	SplitPoint *split2 = (SplitPoint *) arg2;

	if (split1->curdelta > split2->curdelta)
		return 1;
	if (split1->curdelta < split2->curdelta)
		return -1;

	return 0;
}

/*
 * Subroutine to determine whether or not a non-rightmost leaf page should be
 * split immediately after the would-be original page offset for the
 * new/incoming tuple (or should have leaf fillfactor applied when new item
 * is to the right on original page).  This is appropriate when there is a
 * pattern of localized monotonically increasing insertions into a composite
 * index, where leading attribute values form local groupings, and we
 * anticipate further insertions of the same/current grouping (new item's
 * grouping) in the near future.  This can be thought of as a variation on
 * applying leaf fillfactor during rightmost leaf page splits, since cases
 * that benefit will converge on packing leaf pages leaffillfactor% full over
 * time.
 *
 * We may leave extra free space remaining on the rightmost page of a "most
 * significant column" grouping of tuples if that grouping never ends up
 * having future insertions that use the free space.  That effect is
 * self-limiting; a future grouping that becomes the "nearest on the right"
 * grouping of the affected grouping usually puts the extra free space to
 * good use.
 *
 * Caller uses optimization when routine returns true, though the exact
 * action taken by caller varies.  Caller uses original leaf page fillfactor
 * in standard way rather than using the new item offset directly when
 * *usemult was also set to true here.  Otherwise, caller applies
 * optimization by locating the legal split point that makes the new tuple
 * the very last tuple on the left side of the split.
 */
static bool
_bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
					int leaffillfactor, bool *usemult)
{
	int16		nkeyatts;
	ItemId		itemid;
	IndexTuple	tup;
	int			keepnatts;

	Assert(state->is_leaf && !state->is_rightmost);

	nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

	/* Single key indexes not considered here */
	if (nkeyatts == 1)
		return false;

	/* Ascending insertion pattern never inferred when new item is first */
	if (state->newitemoff == P_FIRSTKEY)
		return false;

	/*
	 * Only apply optimization on pages with equisized tuples, since ordinal
	 * keys are likely to be fixed-width.  Testing if the new tuple is
	 * variable width directly might also work, but that fails to apply the
	 * optimization to indexes with a numeric_ops attribute.
	 *
	 * Conclude that page has equisized tuples when the new item is the same
	 * width as the smallest item observed during pass over page, and other
	 * non-pivot tuples must be the same width as well.  (Note that the
	 * possibly-truncated existing high key isn't counted in
	 * olddataitemstotal, and must be subtracted from maxoff.)
	 */
	if (state->newitemsz != state->minfirstrightsz)
		return false;
	if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal)
		return false;

	/*
	 * Avoid applying optimization when tuples are wider than a tuple
	 * consisting of two non-NULL int8/int64 attributes (or four non-NULL
	 * int4/int32 attributes)
	 */
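	/*
	 * (With 8 byte maximum alignment -- an assumed figure -- the cap below
	 * works out to MAXALIGN(8 + 16) + 4 = 28 bytes, line pointer included.)
	 */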
	if (state->newitemsz >
		MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) +
		sizeof(ItemIdData))
		return false;

	/*
	 * At least the first attribute's value must be equal to the
	 * corresponding value in previous tuple to apply optimization.  New item
	 * cannot be a duplicate, either.
	 *
	 * Handle case where new item is to the right of all items on the
	 * existing page.  This is suggestive of monotonically increasing
	 * insertions in itself, so the "heap TID adjacency" test is not applied
	 * here.
	 */
	if (state->newitemoff > maxoff)
	{
		itemid = PageGetItemId(state->page, maxoff);
		tup = (IndexTuple) PageGetItem(state->page, itemid);
		keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

		if (keepnatts > 1 && keepnatts <= nkeyatts)
		{
			*usemult = true;
			return true;
		}

		return false;
	}

	/*
	 * "Low cardinality leading column, high cardinality suffix column"
	 * indexes with a random insertion pattern (e.g., an index with a boolean
	 * column, such as an index on '(book_is_in_print, book_isbn)') present
	 * us with a risk of consistently misapplying the optimization.  We're
	 * willing to accept very occasional misapplication of the optimization,
	 * provided the cases where we get it wrong are rare and self-limiting.
	 *
	 * Heap TID adjacency strongly suggests that the item just to the left
	 * was inserted very recently, which limits overapplication of the
	 * optimization.  Besides, all inappropriate cases triggered here will
	 * still split in the middle of the page on average.
	 */
	itemid = PageGetItemId(state->page, OffsetNumberPrev(state->newitemoff));
	tup = (IndexTuple) PageGetItem(state->page, itemid);
	/* Do cheaper test first */
	if (!_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid))
		return false;
	/* Check same conditions as rightmost item case, too */
	keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem);

	if (keepnatts > 1 && keepnatts <= nkeyatts)
	{
		double		interp = (double) state->newitemoff / ((double) maxoff + 1);
		double		leaffillfactormult = (double) leaffillfactor / 100.0;

		/*
		 * Don't allow caller to split after a new item when it will result
		 * in a split point to the right of the point that a leaf fillfactor
		 * split would use -- have caller apply leaf fillfactor instead
		 */
		*usemult = interp > leaffillfactormult;

		return true;
	}

	return false;
}

/*
 * Subroutine for determining if two heap TIDs are "adjacent".
 *
 * Adjacent means that the high TID is very likely to have been inserted into
 * heap relation immediately after the low TID, probably during the current
 * transaction.
 */
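/*
 * Example (assumed TIDs): (42,7) followed by anything else in heap block 42
 * is treated as adjacent, as is (42,7) followed by (43,1); (42,7) followed
 * by (44,1), or by (43,2), is not.
 */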
static bool
_bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
{
	BlockNumber lowblk,
				highblk;

	lowblk = ItemPointerGetBlockNumber(lowhtid);
	highblk = ItemPointerGetBlockNumber(highhtid);

	/* Make optimistic assumption of adjacency when heap blocks match */
	if (lowblk == highblk)
		return true;

	/* When heap block one up, second offset should be FirstOffsetNumber */
	if (lowblk + 1 == highblk &&
		ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber)
		return true;

	return false;
}

/*
 * Subroutine to find the "best" split point among candidate split points.
 * The best split point is the split point with the lowest penalty among
 * split points that fall within current/final split interval.  Penalty is
 * an abstract score, with a definition that varies depending on whether
 * we're splitting a leaf page or an internal page.  See _bt_split_penalty()
 * for details.
 *
 * "perfectpenalty" is assumed to be the lowest possible penalty among
 * candidate split points.  This allows us to return early without wasting
 * cycles on calculating the first differing attribute for all candidate
 * splits when that clearly cannot improve our choice (or when we only want
 * a minimally distinguishing split point, and don't want to make the split
 * any more unbalanced than is necessary).
 *
 * We return the index of the first existing tuple that should go on the
 * right page, plus a boolean indicating if new item is on left of split
 * point.
 */
static OffsetNumber
_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
				 bool *newitemonleft, FindSplitStrat strategy)
{
	int			bestpenalty,
				lowsplit;
	int			highsplit = Min(state->interval, state->nsplits);
	SplitPoint *final;

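	/*
	 * Only the first highsplit entries of the delta-sorted splits array
	 * compete on penalty; e.g. (assumed figures) with interval = 9 and
	 * nsplits = 200, just the 9 most balanced candidates are examined.
	 */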
	bestpenalty = INT_MAX;
	lowsplit = 0;
	for (int i = lowsplit; i < highsplit; i++)
	{
		int			penalty;

		penalty = _bt_split_penalty(state, state->splits + i);

		if (penalty <= perfectpenalty)
		{
			bestpenalty = penalty;
			lowsplit = i;
			break;
		}

		if (penalty < bestpenalty)
		{
			bestpenalty = penalty;
			lowsplit = i;
		}
	}

	final = &state->splits[lowsplit];

	/*
	 * There is a risk that the "many duplicates" strategy will repeatedly do
	 * the wrong thing when there are monotonically decreasing insertions to
	 * the right of a large group of duplicates.  Repeated splits could leave
	 * a succession of right half pages with free space that can never be
	 * used.  This must be avoided.
	 *
	 * Consider the example of the leftmost page in a single integer
	 * attribute NULLS FIRST index which is almost filled with NULLs.
	 * Monotonically decreasing integer insertions might cause the same
	 * leftmost page to split repeatedly at the same point.  Each split
	 * derives its new high key from the lowest current value to the
	 * immediate right of the large group of NULLs, which will always be
	 * higher than all future integer insertions, directing all future
	 * integer insertions to the same leftmost page.
	 */
	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
		!final->newitemonleft && final->firstoldonright >= state->newitemoff &&
		final->firstoldonright < state->newitemoff + MAX_LEAF_INTERVAL)
	{
		/*
		 * Avoid the problem by performing a 50:50 split when the new item is
		 * just to the right of the would-be "many duplicates" split point.
		 */
		final = &state->splits[0];
	}

	*newitemonleft = final->newitemonleft;
	return final->firstoldonright;
}

/*
 * Subroutine to decide whether split should use default strategy/initial
 * split interval, or whether it should finish splitting the page using
 * alternative strategies (this is only possible with leaf pages).
 *
 * Caller uses alternative strategy (or sticks with default strategy) based
 * on how *strategy is set here.  Return value is "perfect penalty", which is
 * passed to _bt_bestsplitloc() as a final constraint on how far caller is
 * willing to go to avoid appending a heap TID when using the many duplicates
 * strategy (it also saves _bt_bestsplitloc() useless cycles).
 */
static int
_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
			 SplitPoint *rightpage, FindSplitStrat *strategy)
{
	IndexTuple	leftmost,
				rightmost;
	SplitPoint *leftinterval,
			   *rightinterval;
	int			perfectpenalty;
	int			indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel);

	/* Assume that alternative strategy won't be used for now */
	*strategy = SPLIT_DEFAULT;

	/*
	 * Use smallest observed first right item size for entire page as perfect
	 * penalty on internal pages.  This can save cycles in the common case
	 * where most or all splits (not just splits within interval) have first
	 * right tuples that are the same size.
	 */
	if (!state->is_leaf)
		return state->minfirstrightsz;

	/*
	 * Use leftmost and rightmost tuples from leftmost and rightmost splits
	 * in current split interval
	 */
	_bt_interval_edges(state, &leftinterval, &rightinterval);
	leftmost = _bt_split_lastleft(state, leftinterval);
	rightmost = _bt_split_firstright(state, rightinterval);

	/*
	 * If initial split interval can produce a split point that will at least
	 * avoid appending a heap TID in new high key, we're done.  Finish split
	 * with default strategy and initial split interval.
	 */
	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
	if (perfectpenalty <= indnkeyatts)
		return perfectpenalty;

	/*
	 * Work out how caller should finish split when even their "perfect"
	 * penalty for initial/default split interval indicates that the interval
	 * does not contain even a single split that avoids appending a heap TID.
	 *
	 * Use the leftmost split's lastleft tuple and the rightmost split's
	 * firstright tuple to assess every possible split.
	 */
	leftmost = _bt_split_lastleft(state, leftpage);
	rightmost = _bt_split_firstright(state, rightpage);

	/*
	 * If page (including new item) has many duplicates but is not entirely
	 * full of duplicates, a many duplicates strategy split will be
	 * performed.  If page is entirely full of duplicates, a single value
	 * strategy split will be performed.
	 */
	perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost);
	if (perfectpenalty <= indnkeyatts)
	{
		*strategy = SPLIT_MANY_DUPLICATES;

		/*
		 * Many duplicates strategy should split at either side of the group
		 * of duplicates that enclose the delta-optimal split point.  Return
		 * indnkeyatts rather than the true perfect penalty to make that
		 * happen.  (If perfectpenalty was returned here then low cardinality
		 * composite indexes could have continual unbalanced splits.)
		 *
		 * Note that caller won't go through with a many duplicates split in
		 * rare cases where it looks like there are ever-decreasing
		 * insertions to the immediate right of the split point.  This must
		 * happen just before a final decision is made, within
		 * _bt_bestsplitloc().
		 */
		return indnkeyatts;
	}

	/*
	 * Single value strategy is only appropriate with ever-increasing heap
	 * TIDs; otherwise, original default strategy split should proceed to
	 * avoid pathological performance.  Use page high key to infer if this is
	 * the rightmost page among pages that store the same duplicate value.
	 * This should not prevent insertions of heap TIDs that are slightly out
	 * of order from using single value strategy, since that's expected with
	 * concurrent inserters of the same duplicate value.
	 */
	else if (state->is_rightmost)
		*strategy = SPLIT_SINGLE_VALUE;
	else
	{
		ItemId		itemid;
		IndexTuple	hikey;

		itemid = PageGetItemId(state->page, P_HIKEY);
		hikey = (IndexTuple) PageGetItem(state->page, itemid);
		perfectpenalty = _bt_keep_natts_fast(state->rel, hikey,
											 state->newitem);
		if (perfectpenalty <= indnkeyatts)
			*strategy = SPLIT_SINGLE_VALUE;
		else
		{
			/*
			 * Have caller finish split using default strategy, since page
			 * does not appear to be the rightmost page for duplicates of the
			 * value the page is filled with
			 */
		}
	}

	return perfectpenalty;
}

/*
 * Subroutine to locate leftmost and rightmost splits for current/default
 * split interval.  Note that it will be the same split iff there is only
 * one split in interval.
 */
static void
_bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval,
				   SplitPoint **rightinterval)
{
	int			highsplit = Min(state->interval, state->nsplits);
	SplitPoint *deltaoptimal;

	deltaoptimal = state->splits;
	*leftinterval = NULL;
	*rightinterval = NULL;

	/*
	 * Delta is an absolute distance to optimal split point, so both the
	 * leftmost and rightmost split point will usually be at the end of the
	 * array
	 */
	for (int i = highsplit - 1; i >= 0; i--)
	{
		SplitPoint *distant = state->splits + i;

		if (distant->firstoldonright < deltaoptimal->firstoldonright)
		{
			if (*leftinterval == NULL)
				*leftinterval = distant;
		}
		else if (distant->firstoldonright > deltaoptimal->firstoldonright)
		{
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}
		else if (!distant->newitemonleft && deltaoptimal->newitemonleft)
		{
			/*
			 * "incoming tuple will become first on right page" (distant) is
			 * to the left of "incoming tuple will become last on left page"
			 * (delta-optimal)
			 */
			Assert(distant->firstoldonright == state->newitemoff);
			if (*leftinterval == NULL)
				*leftinterval = distant;
		}
		else if (distant->newitemonleft && !deltaoptimal->newitemonleft)
		{
			/*
			 * "incoming tuple will become last on left page" (distant) is to
			 * the right of "incoming tuple will become first on right page"
			 * (delta-optimal)
			 */
			Assert(distant->firstoldonright == state->newitemoff);
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}
		else
		{
			/* There were only one or two splits in initial split interval */
			Assert(distant == deltaoptimal);
			if (*leftinterval == NULL)
				*leftinterval = distant;
			if (*rightinterval == NULL)
				*rightinterval = distant;
		}

		if (*leftinterval && *rightinterval)
			return;
	}

	Assert(false);
}

/*
 * Subroutine to find penalty for caller's candidate split point.
 *
 * On leaf pages, penalty is the attribute number that distinguishes each
 * side of a split.  It's the last attribute that needs to be included in
 * new high key for left page.  It can be greater than the number of key
 * attributes in cases where a heap TID will need to be appended during
 * truncation.
 *
 * On internal pages, penalty is simply the size of the first item on the
 * right half of the split (including line pointer overhead).  This tuple
 * will become the new high key for the left page.
 */
static inline int
_bt_split_penalty(FindSplitData *state, SplitPoint *split)
{
	IndexTuple	lastleftuple;
	IndexTuple	firstrighttuple;

	if (!state->is_leaf)
	{
		ItemId		itemid;

		if (!split->newitemonleft &&
			split->firstoldonright == state->newitemoff)
			return state->newitemsz;

		itemid = PageGetItemId(state->page, split->firstoldonright);

		return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
	}

	lastleftuple = _bt_split_lastleft(state, split);
	firstrighttuple = _bt_split_firstright(state, split);

	Assert(lastleftuple != firstrighttuple);
	return _bt_keep_natts_fast(state->rel, lastleftuple, firstrighttuple);
}

/*
 * Subroutine to get a lastleft IndexTuple for a split point from page
 */
static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
	ItemId		itemid;

	if (split->newitemonleft && split->firstoldonright == state->newitemoff)
		return state->newitem;

	itemid = PageGetItemId(state->page,
						   OffsetNumberPrev(split->firstoldonright));
	return (IndexTuple) PageGetItem(state->page, itemid);
}

/*
 * Subroutine to get a firstright IndexTuple for a split point from page
 */
static inline IndexTuple
_bt_split_firstright(FindSplitData *state, SplitPoint *split)
{
	ItemId		itemid;

	if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
		return state->newitem;

	itemid = PageGetItemId(state->page, split->firstoldonright);
	return (IndexTuple) PageGetItem(state->page, itemid);
}