xref: /original-bsd/lib/libc/db/btree/bt_split.c (revision 31e799e3)
1 /*-
2  * Copyright (c) 1990 The Regents of the University of California.
3  * All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * %sccs.include.redist.c%
9  */
10 
11 #if defined(LIBC_SCCS) && !defined(lint)
12 static char sccsid[] = "@(#)bt_split.c	5.1 (Berkeley) 01/23/91";
13 #endif /* LIBC_SCCS and not lint */
14 
15 #include <sys/types.h>
16 #include <db.h>
17 #include "btree.h"
18 
19 /*
20  *  _BT_SPLIT -- Split a page into two pages.
21  *
22  *	Splits are caused by insertions, and propogate up the tree in
23  *	the usual way.  The root page is always page 1 in the file on
24  *	disk, so root splits are handled specially.  On entry to this
25  *	routine, t->bt_curpage is the page to be split.
26  *
27  *	Parameters:
28  *		t -- btree in which to do split.
29  *
30  *	Returns:
31  *		RET_SUCCESS, RET_ERROR.
32  *
33  *	Side Effects:
34  *		Changes the notion of the current page.
35  */
36 
37 int
38 _bt_split(t)
39 	BTREE_P t;
40 {
41 	BTHEADER *h;
42 	BTHEADER *left, *right;
43 	pgno_t nextpgno, parent;
44 	int nbytes, len;
45 	IDATUM *id;
46 	DATUM *d;
47 	char *src;
48 	IDATUM *new;
49 	pgno_t oldchain;
50 	u_char flags;
51 
52 	h = (BTHEADER *) t->bt_curpage;
53 
54 	/* split root page specially, since it must remain page 1 */
55 	if (h->h_pgno == P_ROOT) {
56 		return (_bt_splitroot(t));
57 	}
58 
59 	/*
60 	 *  This is a little complicated.  We go to some trouble to
61 	 *  figure out which of the three possible cases -- in-memory tree,
62 	 *  disk tree (no cache), and disk tree (cache) -- we have, in order
63 	 *  to avoid unnecessary copying.  If we have a disk cache, then we
64 	 *  have to do some extra copying, though, since the cache code
65 	 *  manages buffers externally to this code.
66 	 */
67 
68 	if (ISDISK(t) && ISCACHE(t)) {
69 		if ((left = (BTHEADER *) malloc((unsigned) t->bt_psize))
70 		    == (BTHEADER *) NULL)
71 			return (RET_ERROR);
72 		left->h_pgno = left->h_prevpg = left->h_nextpg = P_NONE;
73 		left->h_flags = t->bt_curpage->h_flags;
74 		left->h_lower = (index_t)
75 			  (((char *) &(left->h_linp[0])) - ((char *) left));
76 		left->h_upper = t->bt_psize;
77 
78 	} else {
79 		if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
80 			return (RET_ERROR);
81 	}
82 	left->h_pgno = h->h_pgno;
83 
84 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
85 		return (RET_ERROR);
86 	right->h_pgno = ++(t->bt_npages);
87 
88 	/* now do the split */
89 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
90 		return (RET_ERROR);
91 
92 	right->h_prevpg = left->h_pgno;
93 	nextpgno = right->h_nextpg = h->h_nextpg;
94 	left->h_nextpg = right->h_pgno;
95 	left->h_prevpg = h->h_prevpg;
96 
97 	/* okay, now use the left half of the page as the new page */
98 	if (ISDISK(t) && ISCACHE(t)) {
99 		(void) bcopy((char *) left, (char *) t->bt_curpage,
100 			     (int) t->bt_psize);
101 		(void) free ((char *) left);
102 		left = t->bt_curpage;
103 	} else {
104 		(void) free((char *) t->bt_curpage);
105 		t->bt_curpage = left;
106 	}
107 
108 	/*
109 	 *  Write the new pages out.  We need them to stay where they are
110 	 *  until we're done updating the parent pages.
111 	 */
112 
113 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
114 		return (RET_ERROR);
115 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
116 		return (RET_ERROR);
117 
118 	/* update 'prev' pointer of old neighbor of left */
119 	if (nextpgno != P_NONE) {
120 		if (_bt_getpage(t, nextpgno) == RET_ERROR)
121 			return (RET_ERROR);
122 		h = t->bt_curpage;
123 		h->h_prevpg = right->h_pgno;
124 		h->h_flags |= F_DIRTY;
125 	}
126 
127 	if ((parent = _bt_pop(t)) != P_NONE) {
128 		if (right->h_flags & F_LEAF) {
129 			d = (DATUM *) GETDATUM(right, 0);
130 			len = d->d_ksize;
131 			if (d->d_flags & D_BIGKEY) {
132 				bcopy(&(d->d_bytes[0]),
133 				      (char *) &oldchain,
134 				      sizeof(oldchain));
135 				if (_bt_markchain(t, oldchain) == RET_ERROR)
136 					return (RET_ERROR);
137 				src = (char *) &oldchain;
138 				flags = D_BIGKEY;
139 			} else {
140 				src = (char *) &(d->d_bytes[0]);
141 				flags = 0;
142 			}
143 		} else {
144 			id = (IDATUM *) GETDATUM(right, 0);
145 			len = id->i_size;
146 			flags = id->i_flags;
147 			src = (char *) &(id->i_bytes[0]);
148 		}
149 		nbytes = len + (sizeof(IDATUM) - sizeof(char));
150 		new = (IDATUM *) malloc((unsigned) nbytes);
151 		if (new == (IDATUM *) NULL)
152 			return (RET_ERROR);
153 		new->i_size = len;
154 		new->i_pgno = right->h_pgno;
155 		new->i_flags = flags;
156 		if (len > 0)
157 			(void) bcopy(src, (char *) &(new->i_bytes[0]), len);
158 
159 		nbytes = LONGALIGN(nbytes) + sizeof(index_t);
160 		if (_bt_getpage(t, parent) == RET_ERROR)
161 			return (RET_ERROR);
162 
163 		h = t->bt_curpage;
164 
165 		/*
166 		 *  Split the parent if we need to, then reposition the
167 		 *  tree's current page pointer for the new datum.
168 		 */
169 		if ((h->h_upper - h->h_lower) < nbytes) {
170 			if (_bt_split(t) == RET_ERROR)
171 				return (RET_ERROR);
172 			if (_bt_reposition(t, new, parent, right->h_prevpg)
173 			      == RET_ERROR)
174 				return (RET_ERROR);
175 		}
176 
177 		/* okay, now insert the new idatum */
178 		if (_bt_inserti(t, new, right->h_prevpg) == RET_ERROR)
179 			return (RET_ERROR);
180 	}
181 
182 	/*
183 	 *  Okay, split is done; don't need right page stapled down anymore.
184 	 *  The page we call 'left' above is the new version of the old
185 	 *  (split) page, so we can't release it.
186 	 */
187 
188 	if (_bt_release(t, right) == RET_ERROR)
189 		return (RET_ERROR);
190 	if (ISDISK(t) && !ISCACHE(t))
191 		(void) free((char *) right);
192 
193 	return (RET_SUCCESS);
194 }
195 
196 /*
197  *  _BT_REPOSITION -- Reposition the current page pointer of a btree.
198  *
199  *	After splitting a node in the tree in order to make room for
200  *	an insertion, we need to figure out which page after the split
201  *	should get the item we want to insert.  This routine positions
202  *	the tree's current page pointer appropriately.
203  *
204  *	Parameters:
205  *		t -- tree to position
206  *		new -- the item we want to insert
207  *		parent -- parent of the node that we just split
208  *		prev -- page number of item directly to the left of
209  *			new's position in the tree.
210  *
211  *	Returns:
212  *		RET_SUCCESS, RET_ERROR.
213  *
214  *	Side Effects:
215  *		None.
216  */
217 
218 int
219 _bt_reposition(t, new, parent, prev)
220 	BTREE_P t;
221 	IDATUM *new;
222 	pgno_t parent;
223 	pgno_t prev;
224 {
225 	index_t i, next;
226 	IDATUM *idx;
227 
228 	if (parent == P_ROOT) {
229 
230 		/*
231 		 *  If we just split the root page, then there are guaranteed
232 		 *  to be exactly two IDATUMs on it.  Look at both of them
233 		 *  to see if they point to the page that we want.
234 		 */
235 
236 		if (_bt_getpage(t, parent) == RET_ERROR)
237 			return (RET_ERROR);
238 
239 		next = NEXTINDEX(t->bt_curpage);
240 		for (i = 0; i < next; i++) {
241 			idx = (IDATUM *) GETDATUM(t->bt_curpage, i);
242 			if (_bt_getpage(t, idx->i_pgno) == RET_ERROR)
243 				return (RET_ERROR);
244 			if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
245 				return (RET_SUCCESS);
246 			if (_bt_getpage(t, parent) == RET_ERROR)
247 				return (RET_ERROR);
248 		}
249 	} else {
250 
251 		/*
252 		 *  Get the parent page -- which is where the new item would
253 		 *  have gone -- and figure out whether the new item now goes
254 		 *  on the parent, or the page immediately to the right of
255 		 *  the parent.
256 		 */
257 
258 		if (_bt_getpage(t, parent) == RET_ERROR)
259 			return (RET_ERROR);
260 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
261 			return (RET_SUCCESS);
262 		if (_bt_getpage(t, t->bt_curpage->h_nextpg) == RET_ERROR)
263 			return (RET_ERROR);
264 		if (_bt_isonpage(t, new, prev) == RET_SUCCESS)
265 			return (RET_SUCCESS);
266 	}
267 	return (RET_ERROR);
268 }
269 
270 /*
271  *  _BT_ISONPAGE -- Is the IDATUM for a given page number on the current page?
272  *
273  *	This routine is used by _bt_reposition to decide whether the current
274  *	page is the correct one on which to insert a new item.
275  *
276  *	Parameters:
277  *		t -- tree to check
278  *		new -- the item that will be inserted (used for binary search)
279  *		prev -- page number of page whose IDATUM is immediately to
280  *			the left of new's position in the tree.
281  *
282  *	Returns:
283  *		RET_SUCCESS, RET_ERROR (corresponding to TRUE, FALSE).
284  */
285 
286 int
287 _bt_isonpage(t, new, prev)
288 	BTREE_P t;
289 	IDATUM *new;
290 	pgno_t prev;
291 {
292 	BTHEADER *h = (BTHEADER *) t->bt_curpage;
293 	index_t i, next;
294 	IDATUM *idx;
295 
296 	i = _bt_binsrch(t, &(new->i_bytes[0]));
297 	while (i != 0 && _bt_cmp(t, &(new->i_bytes[0]), i) == 0)
298 		--i;
299 	next = NEXTINDEX(h);
300 	idx = (IDATUM *) GETDATUM(h, i);
301 	while (i < next && idx->i_pgno != prev) {
302 		i++;
303 		idx = (IDATUM *) GETDATUM(h, i);
304 	}
305 	if (idx->i_pgno == prev)
306 		return (RET_SUCCESS);
307 	else
308 		return (RET_ERROR);
309 }
310 
311 /*
312  *  _BT_SPLITROOT -- Split the root of a btree.
313  *
314  *	The root page for a btree is always page one.  This means that in
315  *	order to split the root, we need to do extra work.
316  *
317  *	Parameters:
318  *		t -- tree to split
319  *
320  *	Returns:
321  *		RET_SUCCESS, RET_ERROR.
322  *
323  *	Side Effects:
324  *		Splits root upward in the usual way, adding two new pages
325  *		to the tree (rather than just one, as in usual splits).
326  */
327 
328 int
329 _bt_splitroot(t)
330 	BTREE_P t;
331 {
332 	BTHEADER *h = t->bt_curpage;
333 	BTHEADER *left, *right;
334 	IDATUM *id;
335 	BTHEADER *where_h;
336 	char *src, *dest;
337 	int len, nbytes;
338 	u_long was_leaf;
339 	pgno_t oldchain;
340 	u_char flags;
341 
342 	/* get two new pages for the split */
343 	if ((left = _bt_allocpg(t)) == (BTHEADER *) NULL)
344 		return (RET_ERROR);
345 	left->h_pgno = ++(t->bt_npages);
346 	if ((right = _bt_allocpg(t)) == (BTHEADER *) NULL)
347 		return (RET_ERROR);
348 	right->h_pgno = ++(t->bt_npages);
349 
350 	/* do the split */
351 	if (_bt_dopsplit(t, left, right) == RET_ERROR)
352 		return (RET_ERROR);
353 
354 	/* connect the new pages correctly */
355 	right->h_prevpg = left->h_pgno;
356 	left->h_nextpg = right->h_pgno;
357 
358 	/*
359 	 *  Write the child pages out now.  We need them to remain
360 	 *  where they are until we finish updating parent pages,
361 	 *  however.
362 	 */
363 
364 	if (_bt_write(t, left, NORELEASE) == RET_ERROR)
365 		return (RET_ERROR);
366 	if (_bt_write(t, right, NORELEASE) == RET_ERROR)
367 		return (RET_ERROR);
368 
369 	/* now change the root page into an internal page */
370 	was_leaf = (h->h_flags & F_LEAF);
371 	h->h_flags &= ~F_LEAF;
372 	h->h_lower = (index_t) (((char *) (&(h->h_linp[0]))) - ((char *) h));
373 	h->h_upper = (index_t) t->bt_psize;
374 	(void) bzero((char *) &(h->h_linp[0]), (int) (h->h_upper - h->h_lower));
375 
376 	/* put two new keys on root page */
377 	where_h = left;
378 	while (where_h) {
379 		DATUM *data;
380 		IDATUM *idata;
381 
382 		if (was_leaf) {
383 			data = (DATUM *) GETDATUM(where_h, 0);
384 
385 			if (where_h == left) {
386 				len = 0;	/* first key in tree is null */
387 			} else {
388 				if (data->d_flags & D_BIGKEY) {
389 					bcopy(&(data->d_bytes[0]),
390 					      (char *) &oldchain,
391 					      sizeof(oldchain));
392 					if (_bt_markchain(t, oldchain) == RET_ERROR)
393 						return (RET_ERROR);
394 					src = (char *) &oldchain;
395 					flags = D_BIGKEY;
396 				} else {
397 					src = (char *) &(data->d_bytes[0]);
398 					flags = 0;
399 				}
400 				len = data->d_ksize;
401 			}
402 		} else {
403 			idata = (IDATUM *) GETDATUM(where_h, 0);
404 			len = idata->i_size;
405 			flags = idata->i_flags;
406 			src = &(idata->i_bytes[0]);
407 		}
408 		dest = ((char *) h) + h->h_upper;
409 		nbytes = len + (sizeof (IDATUM) - sizeof(char));
410 		dest -= LONGALIGN(nbytes);
411 		id = (IDATUM *) dest;
412 		id->i_size = len;
413 		id->i_pgno = where_h->h_pgno;
414 		id->i_flags = flags;
415 		if (len > 0)
416 			(void) bcopy((char *) src, (char *) &(id->i_bytes[0]), len);
417 		dest -= ((int) h);
418 		h->h_linp[NEXTINDEX(h)] = (index_t) dest;
419 		h->h_upper = (index_t) dest;
420 		h->h_lower += sizeof(index_t);
421 
422 		/* next page */
423 		if (where_h == left)
424 			where_h = right;
425 		else
426 			where_h = (BTHEADER *) NULL;
427 	}
428 
429 	if (_bt_release(t, left) == RET_ERROR)
430 		return (RET_ERROR);
431 	if (_bt_release(t, right) == RET_ERROR)
432 		return (RET_ERROR);
433 
434 	/*
435 	 *  That's it, split is done.  If we're doing a non-cached disk
436 	 *  btree, we can free up the pages we allocated, as they're on
437 	 *  disk, now.
438 	 */
439 
440 	if (ISDISK(t) && !ISCACHE(t)) {
441 		(void) free ((char *) left);
442 		(void) free ((char *) right);
443 	}
444 
445 	h->h_flags |= F_DIRTY;
446 
447 	return (RET_SUCCESS);
448 }
449 
450 /*
451  *  _BT_DOPSPLIT -- Do the work of splitting a page
452  *
453  *	This routine takes two page pointers and splits the data on the
454  *	current page of the btree between them.
455  *
456  *	We do a lot of work here to handle duplicate keys on a page; we
457  *	have to place these keys carefully to guarantee that later searches
458  *	will find them correctly.  See comments in the code below for details.
459  *
460  *	Parameters:
461  *		t -- tree to split
462  *		left -- pointer to page to get left half of the data
463  *		right -- pointer to page to get right half of the data
464  *
465  *	Returns:
466  *		None.
467  */
468 
469 int
470 _bt_dopsplit(t, left, right)
471 	BTREE_P t;
472 	BTHEADER *left;
473 	BTHEADER *right;
474 {
475 	BTHEADER *h = t->bt_curpage;
476 	size_t psize;
477 	char *where;
478 	BTHEADER *where_h;
479 	index_t where_i;
480 	int nbytes, dsize, fixedsize, freespc;
481 	index_t i;
482 	index_t save_lower, save_upper, save_i;
483 	index_t switch_i;
484 	char *save_key;
485 	DATUM *d;
486 	CURSOR *c;
487 	index_t top;
488 	int free_save;
489 	pgno_t chain;
490 	int ignore;
491 
492 	/*
493 	 *  Our strategy is to put half the bytes on each page.  We figure
494 	 *  out how many bytes we have total, and then move items until
495 	 *  the last item moved put at least 50% of the data on the left
496 	 *  page.
497 	 */
498 	save_key = (char *) NULL;
499 	psize = (int) t->bt_psize;
500 	where = ((char *) left) + psize;
501 	where_h = left;
502 	where_i = 0;
503 	nbytes = psize - (int) ((char *) &(h->h_linp[0]) - ((char *) h));
504 	freespc = nbytes;
505 
506 	top = NEXTINDEX(h);
507 	if (h->h_flags & F_LEAF)
508 		fixedsize = (sizeof(DATUM) - sizeof(char));
509 	else
510 		fixedsize = (sizeof(IDATUM) - sizeof(char));
511 
512 	save_key = (char *) NULL;
513 
514 	/* move data */
515 	for (i = 0; i < top; i++) {
516 
517 		/*
518 		 *  Internal and leaf pages have different layouts for
519 		 *  data items, but in both cases the first entry in the
520 		 *  data item is a size_t.
521 		 */
522 		d = (DATUM *) GETDATUM(h,i);
523 		if (h->h_flags & F_LEAF) {
524 			dsize = d->d_ksize + d->d_dsize + fixedsize;
525 		} else {
526 			dsize = d->d_ksize + fixedsize;
527 		}
528 
529 		/*
530 		 *  If a page contains duplicate keys, we have to be
531 		 *  careful about splits.  A sequence of duplicate keys
532 		 *  may not begin in the middle of one page and end in
533 		 *  the middle of another; it must begin on a page boundary,
534 		 *  in order for searches on the internal nodes to work
535 		 *  correctly.
536 		 */
537 		if (where_h == left) {
538 			if (save_key == (char *) NULL) {
539 				if (h->h_flags & F_LEAF) {
540 					if (d->d_flags & D_BIGKEY) {
541 						free_save = TRUE;
542 						bcopy(&(d->d_bytes[0]),
543 						     (char *) &chain,
544 						     sizeof(chain));
545 						if (_bt_getbig(t, chain,
546 							       &save_key,
547 							       &ignore)
548 						    == RET_ERROR)
549 							return (RET_ERROR);
550 					} else {
551 						free_save = FALSE;
552 						save_key = (char *) &(d->d_bytes[0]);
553 					}
554 				} else {
555 					IDATUM *id = (IDATUM *) d;
556 
557 					if (id->i_flags & D_BIGKEY) {
558 						free_save = TRUE;
559 						bcopy(&(id->i_bytes[0]),
560 						     (char *) &chain,
561 						     sizeof(chain));
562 						if (_bt_getbig(t, chain,
563 							       &save_key,
564 							       &ignore)
565 						    == RET_ERROR)
566 							return (RET_ERROR);
567 					} else {
568 						free_save = FALSE;
569 						save_key = (char *)
570 							&(id->i_bytes[0]);
571 					}
572 				}
573 				save_i = 0;
574 				save_lower = where_h->h_lower;
575 				save_upper = where_h->h_upper;
576 			} else {
577 				if (_bt_cmp(t, save_key, i) != 0) {
578 					save_lower = where_h->h_lower;
579 					save_upper = where_h->h_upper;
580 					save_i = i;
581 				}
582 				if (h->h_flags & F_LEAF) {
583 					if (free_save)
584 						(void) free(save_key);
585 					if (d->d_flags & D_BIGKEY) {
586 						free_save = TRUE;
587 						bcopy(&(d->d_bytes[0]),
588 						     (char *) &chain,
589 						     sizeof(chain));
590 						if (_bt_getbig(t, chain,
591 							       &save_key,
592 							       &ignore)
593 						    == RET_ERROR)
594 							return (RET_ERROR);
595 					} else {
596 						free_save = FALSE;
597 						save_key = (char *) &(d->d_bytes[0]);
598 					}
599 				} else {
600 					IDATUM *id = (IDATUM *) d;
601 
602 					if (id->i_flags & D_BIGKEY) {
603 						free_save = TRUE;
604 						bcopy(&(id->i_bytes[0]),
605 						     (char *) &chain,
606 						     sizeof(chain));
607 						if (_bt_getbig(t, chain,
608 							       &save_key,
609 							       &ignore)
610 						    == RET_ERROR)
611 							return (RET_ERROR);
612 					} else {
613 						free_save = FALSE;
614 						save_key = (char *)
615 							&(id->i_bytes[0]);
616 					}
617 				}
618 			}
619 		}
620 
621 		/* copy data and update page state */
622 		where -= LONGALIGN(dsize);
623 		(void) bcopy((char *) d, (char *) where, dsize);
624 		where_h->h_upper = where_h->h_linp[where_i] =
625 			(index_t) (where - (int) where_h);
626 		where_h->h_lower += sizeof(index_t);
627 		where_i++;
628 
629 		/* if we've moved half, switch to the right-hand page */
630 		nbytes -= LONGALIGN(dsize) + sizeof(index_t);
631 		if ((freespc - nbytes) > nbytes) {
632 			nbytes = 2 * freespc;
633 
634 			/* identical keys go on the same page */
635 			if (save_i > 0) {
636 				/* i gets incremented at loop bottom... */
637 				i = save_i - 1;
638 				where_h->h_lower = save_lower;
639 				where_h->h_upper = save_upper;
640 			}
641 			where = ((char *) right) + psize;
642 			where_h = right;
643 			switch_i = where_i;
644 			where_i = 0;
645 		}
646 	}
647 
648 	/*
649 	 *  If there was an active scan on the database, and we just
650 	 *  split the page that the cursor was on, we may need to
651 	 *  adjust the cursor to point to the same entry as before the
652 	 *  split.
653 	 */
654 
655 	c = &(t->bt_cursor);
656 	if ((t->bt_flags & BTF_SEQINIT)
657 	    && (c->c_pgno == h->h_pgno)
658 	    && (c->c_index >= switch_i)) {
659 		c->c_pgno = where_h->h_pgno;
660 		c->c_index -= where_i;
661 	}
662 	return (RET_SUCCESS);
663 }
664