xref: /original-bsd/lib/libc/db/btree/bt_split.c (revision b21da0a0)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_split.c	5.2 (Berkeley) 02/22/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 #include <sys/types.h>
16 #include <db.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include "btree.h"
20 
21 /*
22  *  _BT_SPLIT -- Split a page into two pages.
23  *
24  *	Splits are caused by insertions, and propogate up the tree in
25  *	the usual way.  The root page is always page 1 in the file on
26  *	disk, so root splits are handled specially.  On entry to this
27  *	routine, t->bt_curpage is the page to be split.
28  *
29  *	Parameters:
30  *		t -- btree in which to do split.
31  *
32  *	Returns:
33  *		RET_SUCCESS, RET_ERROR.
34  *
35  *	Side Effects:
36  *		Changes the notion of the current page.
37  */
38 
39 int
40 _bt_split(t)
41 	BTREE_P t;
42 {
43 	BTHEADER *h;
44 	BTHEADER *left, *right;
45 	pgno_t nextpgno, parent;
46 	int nbytes, len;
47 	IDATUM *id;
48 	DATUM *d;
49 	char *src;
50 	IDATUM *new;
51 	pgno_t oldchain;
52 	u_char flags;
53 
54 	h = (BTHEADER *) t->bt_curpage;
55 
56 	/* split root page specially, since it must remain page 1 */
57 	if (h->h_pgno == P_ROOT) {
58 		return (_bt_splitroot(t));
59 	}
60 
61 	/*
62 	 *  This is a little complicated.  We go to some trouble to
63 	 *  figure out which of the three possible cases -- in-memory tree,
64 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
65 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
66 	 *  have to do some extra copying, though, since the cache code
67 	 *  manages buffers externally to this code.
68 	 */
69 
70 	if (ISDISK(t) && ISCACHE(t)) {
71 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
72 		    == (BTHEADER *) NULL)
73 			return (RET_ERROR);
74 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
75 		left->h_flags = t->bt_curpage->h_flags;
76 		left->h_lower = (index_t)
77 			  (((char *) &(left->h_linp[0])) - ((char *) left));
78 		left->h_upper = t->bt_psize;
79 
80 	} else {
81 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
82 			return (RET_ERROR);
83 	}
84 	left->h_pgno = h->h_pgno;
85 
86 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
87 		return (RET_ERROR);
88 	right->h_pgno = ++(t->bt_npages);
89 
90 	/* now do the split */
91 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
92 		return (RET_ERROR);
93 
94 	right->h_prevpg = left->h_pgno;
95 	nextpgno = right->h_nextpg = h->h_nextpg;
96 	left->h_nextpg = right->h_pgno;
97 	left->h_prevpg = h->h_prevpg;
98 
99 	/* okay, now use the left half of the page as the new page */
100 	if (ISDISK(t) && ISCACHE(t)) {
101 		(void) bcopy((char *) left, (char *) t->bt_curpage,
102 			     (int) t->bt_psize);
103 		(void) free ((char *) left);
104 		left = t->bt_curpage;
105 	} else {
106 		(void) free((char *) t->bt_curpage);
107 		t->bt_curpage = left;
108 	}
109 
110 	/*
111 	 *  Write the new pages out.  We need them to stay where they are
112 	 *  until we're done updating the parent pages.
113 	 */
114 
115 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
116 		return (RET_ERROR);
117 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
118 		return (RET_ERROR);
119 
120 	/* update 'prev' pointer of old neighbor of left */
121 	if (nextpgno != P_NONE) {
122 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
123 			return (RET_ERROR);
124 		h = t->bt_curpage;
125 		h->h_prevpg = right->h_pgno;
126 		h->h_flags |= F_DIRTY;
127 	}
128 
129 	if ((parent = _bt_pop(t)) != P_NONE) {
130 		if (right->h_flags & F_LEAF) {
131 			d = (DATUM *) GETDATUM(right, 0);
132 			len = d->d_ksize;
133 			if (d->d_flags & D_BIGKEY) {
134 				bcopy(&(d->d_bytes[0]),
135 				      (char *) &oldchain,
136 				      sizeof(oldchain));
137 				if (_bt_markchain(t, oldchain) == RET_ERROR)
138 					return (RET_ERROR);
139 				src = (char *) &oldchain;
140 				flags = D_BIGKEY;
141 			} else {
142 				src = (char *) &(d->d_bytes[0]);
143 				flags = 0;
144 			}
145 		} else {
146 			id = (IDATUM *) GETDATUM(right, 0);
147 			len = id->i_size;
148 			flags = id->i_flags;
149 			src = (char *) &(id->i_bytes[0]);
150 		}
151 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
152 		new = (IDATUM *) malloc((unsigned) nbytes);
153 		if (new == (IDATUM *) NULL)
154 			return (RET_ERROR);
155 		new->i_size = len;
156 		new->i_pgno = right->h_pgno;
157 		new->i_flags = flags;
158 		if (len > 0)
159 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
160 
161 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
162 		if (_bt_getpage(t, parent) == RET_ERROR)
163 			return (RET_ERROR);
164 
165 		h = t->bt_curpage;
166 
167 		/*
168 		 *  Split the parent if we need to, then reposition the
169 		 *  tree's current page pointer for the new datum.
170 		 */
171 		if ((h->h_upper - h->h_lower) < nbytes) {
172 			if (_bt_split(t) == RET_ERROR)
173 				return (RET_ERROR);
174 			if (_bt_reposition(t, new, parent, right->h_prevpg)
175 			      == RET_ERROR)
176 				return (RET_ERROR);
177 		}
178 
179 		/* okay, now insert the new idatum */
180 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
181 			return (RET_ERROR);
182 	}
183 
184 	/*
185 	 *  Okay, split is done; don't need right page stapled down anymore.
186 	 *  The page we call 'left' above is the new version of the old
187 	 *  (split) page, so we can't release it.
188 	 */
189 
190 	if (_bt_release(t, right) == RET_ERROR)
191 		return (RET_ERROR);
192 	if (ISDISK(t) && !ISCACHE(t))
193 		(void) free((char *) right);
194 
195 	return (RET_SUCCESS);
196 }
197 
198 /*
199  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
200  *
201  *	After splitting a node in the tree in order to make room for
202  *	an insertion, we need to figure out which page after the split
203  *	should get the item we want to insert.  This routine positions
204  *	the tree's current page pointer appropriately.
205  *
206  *	Parameters:
207  *		t -- tree to position
208  *		new -- the item we want to insert
209  *		parent -- parent of the node that we just split
210  *		prev -- page number of item directly to the left of
211  *			new's position in the tree.
212  *
213  *	Returns:
214  *		RET_SUCCESS, RET_ERROR.
215  *
216  *	Side Effects:
217  *		None.
218  */
219 
220 int
221 _bt_reposition(t, new, parent, prev)
222 	BTREE_P t;
223 	IDATUM *new;
224 	pgno_t parent;
225 	pgno_t prev;
226 {
227 	index_t i, next;
228 	IDATUM *idx;
229 
230 	if (parent == P_ROOT) {
231 
232 		/*
233 		 *  If we just split the root page, then there are guaranteed
234 		 *  to be exactly two IDATUMs on it.  Look at both of them
235 		 *  to see if they point to the page that we want.
236 		 */
237 
238 		if (_bt_getpage(t, parent) == RET_ERROR)
239 			return (RET_ERROR);
240 
241 		next = NEXTINDEX(t->bt_curpage);
242 		for (i = 0; i < next; i++) {
243 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
244 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
245 				return (RET_ERROR);
246 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
247 				return (RET_SUCCESS);
248 			if (_bt_getpage(t, parent) == RET_ERROR)
249 				return (RET_ERROR);
250 		}
251 	} else {
252 
253 		/*
254 		 *  Get the parent page -- which is where the new item would
255 		 *  have gone -- and figure out whether the new item now goes
256 		 *  on the parent, or the page immediately to the right of
257 		 *  the parent.
258 		 */
259 
260 		if (_bt_getpage(t, parent) == RET_ERROR)
261 			return (RET_ERROR);
262 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
263 			return (RET_SUCCESS);
264 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
265 			return (RET_ERROR);
266 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
267 			return (RET_SUCCESS);
268 	}
269 	return (RET_ERROR);
270 }
271 
272 /*
273  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
274  *
275  *	This routine is used by _bt_reposition to decide whether the current
276  *	page is the correct one on which to insert a new item.
277  *
278  *	Parameters:
279  *		t -- tree to check
280  *		new -- the item that will be inserted (used for binary search)
281  *		prev -- page number of page whose IDATUM is immediately to
282  *			the left of new's position in the tree.
283  *
284  *	Returns:
285  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
286  */
287 
288 int
289 _bt_isonpage(t, new, prev)
290 	BTREE_P t;
291 	IDATUM *new;
292 	pgno_t prev;
293 {
294 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
295 	index_t i, next;
296 	IDATUM *idx;
297 
298 	i = _bt_binsrch(t, &(new->i_bytes[0]));
299 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
300 		--i;
301 	next = NEXTINDEX(h);
302 	idx = (IDATUM *) GETDATUM(h, i);
303 	while (i < next && idx->i_pgno != prev) {
304 		i++;
305 		idx = (IDATUM *) GETDATUM(h, i);
306 	}
307 	if (idx->i_pgno == prev)
308 		return (RET_SUCCESS);
309 	else
310 		return (RET_ERROR);
311 }
312 
313 /*
314  *  _BT_SPLITROOT -- Split the root of a btree.
315  *
316  *	The root page for a btree is always page one.  This means that in
317  *	order to split the root, we need to do extra work.
318  *
319  *	Parameters:
320  *		t -- tree to split
321  *
322  *	Returns:
323  *		RET_SUCCESS, RET_ERROR.
324  *
325  *	Side Effects:
326  *		Splits root upward in the usual way, adding two new pages
327  *		to the tree (rather than just one, as in usual splits).
328  */
329 
330 int
331 _bt_splitroot(t)
332 	BTREE_P t;
333 {
334 	BTHEADER *h = t->bt_curpage;
335 	BTHEADER *left, *right;
336 	IDATUM *id;
337 	BTHEADER *where_h;
338 	char *src, *dest;
339 	int len, nbytes;
340 	u_long was_leaf;
341 	pgno_t oldchain;
342 	u_char flags;
343 
344 	/* get two new pages for the split */
345 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
346 		return (RET_ERROR);
347 	left->h_pgno = ++(t->bt_npages);
348 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
349 		return (RET_ERROR);
350 	right->h_pgno = ++(t->bt_npages);
351 
352 	/* do the split */
353 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
354 		return (RET_ERROR);
355 
356 	/* connect the new pages correctly */
357 	right->h_prevpg = left->h_pgno;
358 	left->h_nextpg = right->h_pgno;
359 
360 	/*
361 	 *  Write the child pages out now.  We need them to remain
362 	 *  where they are until we finish updating parent pages,
363 	 *  however.
364 	 */
365 
366 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
367 		return (RET_ERROR);
368 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
369 		return (RET_ERROR);
370 
371 	/* now change the root page into an internal page */
372 	was_leaf = (h->h_flags & F_LEAF);
373 	h->h_flags &= ~F_LEAF;
374 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
375 	h->h_upper = (index_t) t->bt_psize;
376 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
377 
378 	/* put two new keys on root page */
379 	where_h = left;
380 	while (where_h) {
381 		DATUM *data;
382 		IDATUM *idata;
383 
384 		if (was_leaf) {
385 			data = (DATUM *) GETDATUM(where_h, 0);
386 
387 			if (where_h == left) {
388 				len = 0;	/* first key in tree is null */
389 			} else {
390 				if (data->d_flags & D_BIGKEY) {
391 					bcopy(&(data->d_bytes[0]),
392 					      (char *) &oldchain,
393 					      sizeof(oldchain));
394 					if (_bt_markchain(t, oldchain) == RET_ERROR)
395 						return (RET_ERROR);
396 					src = (char *) &oldchain;
397 					flags = D_BIGKEY;
398 				} else {
399 					src = (char *) &(data->d_bytes[0]);
400 					flags = 0;
401 				}
402 				len = data->d_ksize;
403 			}
404 		} else {
405 			idata = (IDATUM *) GETDATUM(where_h, 0);
406 			len = idata->i_size;
407 			flags = idata->i_flags;
408 			src = &(idata->i_bytes[0]);
409 		}
410 		dest = ((char *) h) + h->h_upper;
411 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
412 		dest -= LONGALIGN(nbytes);
413 		id = (IDATUM *) dest;
414 		id->i_size = len;
415 		id->i_pgno = where_h->h_pgno;
416 		id->i_flags = flags;
417 		if (len > 0)
418 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
419 		dest -= ((int) h);
420 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
421 		h->h_upper = (index_t) dest;
422 		h->h_lower += sizeof(index_t);
423 
424 		/* next page */
425 		if (where_h == left)
426 			where_h = right;
427 		else
428 			where_h = (BTHEADER *) NULL;
429 	}
430 
431 	if (_bt_release(t, left) == RET_ERROR)
432 		return (RET_ERROR);
433 	if (_bt_release(t, right) == RET_ERROR)
434 		return (RET_ERROR);
435 
436 	/*
437 	 *  That's it, split is done.  If we're doing a non-cached disk
438 	 *  btree, we can free up the pages we allocated, as they're on
439 	 *  disk, now.
440 	 */
441 
442 	if (ISDISK(t) && !ISCACHE(t)) {
443 		(void) free ((char *) left);
444 		(void) free ((char *) right);
445 	}
446 
447 	h->h_flags |= F_DIRTY;
448 
449 	return (RET_SUCCESS);
450 }
451 
452 /*
453  *  _BT_DOPSPLIT -- Do the work of splitting a page
454  *
455  *	This routine takes two page pointers and splits the data on the
456  *	current page of the btree between them.
457  *
458  *	We do a lot of work here to handle duplicate keys on a page; we
459  *	have to place these keys carefully to guarantee that later searches
460  *	will find them correctly.  See comments in the code below for details.
461  *
462  *	Parameters:
463  *		t -- tree to split
464  *		left -- pointer to page to get left half of the data
465  *		right -- pointer to page to get right half of the data
466  *
467  *	Returns:
468  *		None.
469  */
470 
471 int
472 _bt_dopsplit(t, left, right)
473 	BTREE_P t;
474 	BTHEADER *left;
475 	BTHEADER *right;
476 {
477 	BTHEADER *h = t->bt_curpage;
478 	size_t psize;
479 	char *where;
480 	BTHEADER *where_h;
481 	index_t where_i;
482 	int nbytes, dsize, fixedsize, freespc;
483 	index_t i;
484 	index_t save_lower, save_upper, save_i;
485 	index_t switch_i;
486 	char *save_key;
487 	DATUM *d;
488 	CURSOR *c;
489 	index_t top;
490 	int free_save;
491 	pgno_t chain;
492 	int ignore;
493 
494 	/*
495 	 *  Our strategy is to put half the bytes on each page.  We figure
496 	 *  out how many bytes we have total, and then move items until
497 	 *  the last item moved put at least 50% of the data on the left
498 	 *  page.
499 	 */
500 	save_key = (char *) NULL;
501 	psize = (int) t->bt_psize;
502 	where = ((char *) left) + psize;
503 	where_h = left;
504 	where_i = 0;
505 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
506 	freespc = nbytes;
507 
508 	top = NEXTINDEX(h);
509 	if (h->h_flags & F_LEAF)
510 		fixedsize = (sizeof(DATUM) - sizeof(char));
511 	else
512 		fixedsize = (sizeof(IDATUM) - sizeof(char));
513 
514 	save_key = (char *) NULL;
515 
516 	/* move data */
517 	for (i = 0; i < top; i++) {
518 
519 		/*
520 		 *  Internal and leaf pages have different layouts for
521 		 *  data items, but in both cases the first entry in the
522 		 *  data item is a size_t.
523 		 */
524 		d = (DATUM *) GETDATUM(h,i);
525 		if (h->h_flags & F_LEAF) {
526 			dsize = d->d_ksize + d->d_dsize + fixedsize;
527 		} else {
528 			dsize = d->d_ksize + fixedsize;
529 		}
530 
531 		/*
532 		 *  If a page contains duplicate keys, we have to be
533 		 *  careful about splits.  A sequence of duplicate keys
534 		 *  may not begin in the middle of one page and end in
535 		 *  the middle of another; it must begin on a page boundary,
536 		 *  in order for searches on the internal nodes to work
537 		 *  correctly.
538 		 */
539 		if (where_h == left) {
540 			if (save_key == (char *) NULL) {
541 				if (h->h_flags & F_LEAF) {
542 					if (d->d_flags & D_BIGKEY) {
543 						free_save = TRUE;
544 						bcopy(&(d->d_bytes[0]),
545 						     (char *) &chain,
546 						     sizeof(chain));
547 						if (_bt_getbig(t, chain,
548 							       &save_key,
549 							       &ignore)
550 						    == RET_ERROR)
551 							return (RET_ERROR);
552 					} else {
553 						free_save = FALSE;
554 						save_key = (char *) &(d->d_bytes[0]);
555 					}
556 				} else {
557 					IDATUM *id = (IDATUM *) d;
558 
559 					if (id->i_flags & D_BIGKEY) {
560 						free_save = TRUE;
561 						bcopy(&(id->i_bytes[0]),
562 						     (char *) &chain,
563 						     sizeof(chain));
564 						if (_bt_getbig(t, chain,
565 							       &save_key,
566 							       &ignore)
567 						    == RET_ERROR)
568 							return (RET_ERROR);
569 					} else {
570 						free_save = FALSE;
571 						save_key = (char *)
572 							&(id->i_bytes[0]);
573 					}
574 				}
575 				save_i = 0;
576 				save_lower = where_h->h_lower;
577 				save_upper = where_h->h_upper;
578 			} else {
579 				if (_bt_cmp(t, save_key, i) != 0) {
580 					save_lower = where_h->h_lower;
581 					save_upper = where_h->h_upper;
582 					save_i = i;
583 				}
584 				if (h->h_flags & F_LEAF) {
585 					if (free_save)
586 						(void) free(save_key);
587 					if (d->d_flags & D_BIGKEY) {
588 						free_save = TRUE;
589 						bcopy(&(d->d_bytes[0]),
590 						     (char *) &chain,
591 						     sizeof(chain));
592 						if (_bt_getbig(t, chain,
593 							       &save_key,
594 							       &ignore)
595 						    == RET_ERROR)
596 							return (RET_ERROR);
597 					} else {
598 						free_save = FALSE;
599 						save_key = (char *) &(d->d_bytes[0]);
600 					}
601 				} else {
602 					IDATUM *id = (IDATUM *) d;
603 
604 					if (id->i_flags & D_BIGKEY) {
605 						free_save = TRUE;
606 						bcopy(&(id->i_bytes[0]),
607 						     (char *) &chain,
608 						     sizeof(chain));
609 						if (_bt_getbig(t, chain,
610 							       &save_key,
611 							       &ignore)
612 						    == RET_ERROR)
613 							return (RET_ERROR);
614 					} else {
615 						free_save = FALSE;
616 						save_key = (char *)
617 							&(id->i_bytes[0]);
618 					}
619 				}
620 			}
621 		}
622 
623 		/* copy data and update page state */
624 		where -= LONGALIGN(dsize);
625 		(void) bcopy((char *) d, (char *) where, dsize);
626 		where_h->h_upper = where_h->h_linp[where_i] =
627 			(index_t) (where - (int) where_h);
628 		where_h->h_lower += sizeof(index_t);
629 		where_i++;
630 
631 		/* if we've moved half, switch to the right-hand page */
632 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
633 		if ((freespc - nbytes) > nbytes) {
634 			nbytes = 2 * freespc;
635 
636 			/* identical keys go on the same page */
637 			if (save_i > 0) {
638 				/* i gets incremented at loop bottom... */
639 				i = save_i - 1;
640 				where_h->h_lower = save_lower;
641 				where_h->h_upper = save_upper;
642 			}
643 			where = ((char *) right) + psize;
644 			where_h = right;
645 			switch_i = where_i;
646 			where_i = 0;
647 		}
648 	}
649 
650 	/*
651 	 *  If there was an active scan on the database, and we just
652 	 *  split the page that the cursor was on, we may need to
653 	 *  adjust the cursor to point to the same entry as before the
654 	 *  split.
655 	 */
656 
657 	c = &(t->bt_cursor);
658 	if ((t->bt_flags & BTF_SEQINIT)
659 	    && (c->c_pgno == h->h_pgno)
660 	    && (c->c_index >= switch_i)) {
661 		c->c_pgno = where_h->h_pgno;
662 		c->c_index -= where_i;
663 	}
664 	return (RET_SUCCESS);
665 }
666