xref: /openbsd/usr.sbin/ldapd/btree.c (revision 36e226d9)
1*36e226d9Smartinh /*	$OpenBSD: btree.c,v 1.15 2010/06/29 04:27:15 martinh Exp $ */
25d465952Smartinh 
35d465952Smartinh /*
45d465952Smartinh  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
55d465952Smartinh  *
65d465952Smartinh  * Permission to use, copy, modify, and distribute this software for any
75d465952Smartinh  * purpose with or without fee is hereby granted, provided that the above
85d465952Smartinh  * copyright notice and this permission notice appear in all copies.
95d465952Smartinh  *
105d465952Smartinh  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
115d465952Smartinh  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
125d465952Smartinh  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
135d465952Smartinh  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
145d465952Smartinh  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
155d465952Smartinh  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
165d465952Smartinh  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
175d465952Smartinh  */
185d465952Smartinh 
195d465952Smartinh #include <sys/types.h>
205d465952Smartinh #include <sys/tree.h>
2137ee45a1Smartinh #include <sys/stat.h>
225d465952Smartinh #include <sys/queue.h>
235d465952Smartinh #include <sys/param.h>
245d465952Smartinh #include <sys/uio.h>
255d465952Smartinh 
265d465952Smartinh #include <assert.h>
275d465952Smartinh #include <err.h>
285d465952Smartinh #include <errno.h>
295d465952Smartinh #include <fcntl.h>
305d465952Smartinh #include <stddef.h>
315d465952Smartinh #include <stdint.h>
325d465952Smartinh #include <stdio.h>
335d465952Smartinh #include <stdlib.h>
345d465952Smartinh #include <string.h>
355d465952Smartinh #include <time.h>
365d465952Smartinh #include <unistd.h>
375d465952Smartinh 
385d465952Smartinh #include "btree.h"
395d465952Smartinh 
405d465952Smartinh /* #define DEBUG */
415d465952Smartinh 
425d465952Smartinh #ifdef DEBUG
435d465952Smartinh # define DPRINTF(...)	do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
445d465952Smartinh 			     fprintf(stderr, __VA_ARGS__); \
455d465952Smartinh 			     fprintf(stderr, "\n"); } while(0)
465d465952Smartinh #else
475d465952Smartinh # define DPRINTF(...)
485d465952Smartinh #endif
495d465952Smartinh 
505d465952Smartinh #define PAGESIZE	 4096
515d465952Smartinh #define BT_MINKEYS	 4
525d465952Smartinh #define BT_MAGIC	 0xB3DBB3DB
5337ee45a1Smartinh #define BT_VERSION	 4
545d465952Smartinh #define MAXKEYSIZE	 255
555d465952Smartinh 
565d465952Smartinh #define P_INVALID	 0xFFFFFFFF
575d465952Smartinh 
585d465952Smartinh #define F_ISSET(w, f)	 (((w) & (f)) == (f))
595d465952Smartinh 
60176c8cbcSmartinh typedef uint32_t	 pgno_t;
61176c8cbcSmartinh typedef uint16_t	 indx_t;
62176c8cbcSmartinh 
635d465952Smartinh /* There are four page types: meta, index, leaf and overflow.
645d465952Smartinh  * They all share the same page header.
655d465952Smartinh  */
665d465952Smartinh struct page {				/* represents an on-disk page */
6737ee45a1Smartinh 	pgno_t		 pgno;		/* page number */
685d465952Smartinh #define	P_BRANCH	 0x01		/* branch page */
695d465952Smartinh #define	P_LEAF		 0x02		/* leaf page */
705d465952Smartinh #define	P_OVERFLOW	 0x04		/* overflow page */
715d465952Smartinh #define	P_META		 0x08		/* meta page */
7237ee45a1Smartinh #define	P_HEAD		 0x10		/* header page */
735d465952Smartinh 	uint32_t	 flags;
745d465952Smartinh #define lower		 b.fb.fb_lower
755d465952Smartinh #define upper		 b.fb.fb_upper
765d465952Smartinh #define p_next_pgno	 b.pb_next_pgno
775d465952Smartinh 	union page_bounds {
785d465952Smartinh 		struct {
795d465952Smartinh 			indx_t	 fb_lower;	/* lower bound of free space */
805d465952Smartinh 			indx_t	 fb_upper;	/* upper bound of free space */
815d465952Smartinh 		} fb;
825d465952Smartinh 		pgno_t		 pb_next_pgno;	/* overflow page linked list */
835d465952Smartinh 	} b;
8437ee45a1Smartinh 	indx_t		 ptrs[1];		/* dynamic size */
85bf245c3bSmartinh } __packed;
865d465952Smartinh 
8737ee45a1Smartinh #define PAGEHDRSZ	 offsetof(struct page, ptrs)
885d465952Smartinh 
8937ee45a1Smartinh #define NUMKEYSP(p)	 (((p)->lower - PAGEHDRSZ) >> 1)
9037ee45a1Smartinh #define NUMKEYS(mp)	 (((mp)->page->lower - PAGEHDRSZ) >> 1)
9137ee45a1Smartinh #define SIZELEFT(mp)	 (indx_t)((mp)->page->upper - (mp)->page->lower)
9237ee45a1Smartinh #define PAGEFILL(bt, mp) ((float)((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \
9337ee45a1Smartinh 				((bt)->head.psize - PAGEHDRSZ))
9437ee45a1Smartinh #define IS_LEAF(mp)	 F_ISSET((mp)->page->flags, P_LEAF)
9537ee45a1Smartinh #define IS_BRANCH(mp)	 F_ISSET((mp)->page->flags, P_BRANCH)
9637ee45a1Smartinh #define IS_OVERFLOW(mp)	 F_ISSET((mp)->page->flags, P_OVERFLOW)
9737ee45a1Smartinh 
9837ee45a1Smartinh struct bt_head {				/* header page content */
9937ee45a1Smartinh 	uint32_t	 magic;
1005d465952Smartinh 	uint32_t	 version;
1015d465952Smartinh 	uint32_t	 flags;
1025d465952Smartinh 	uint32_t	 psize;			/* page size */
10337ee45a1Smartinh } __packed;
10437ee45a1Smartinh 
10537ee45a1Smartinh struct bt_meta {				/* meta (footer) page content */
10637ee45a1Smartinh #define BT_TOMBSTONE	 0x01			/* file is replaced */
10737ee45a1Smartinh 	uint32_t	 flags;
1085d465952Smartinh 	pgno_t		 root;			/* page number of root page */
1095d465952Smartinh 	pgno_t		 prev_meta;		/* previous meta page number */
1105d465952Smartinh 	time_t		 created_at;
1115d465952Smartinh 	uint32_t	 branch_pages;
1125d465952Smartinh 	uint32_t	 leaf_pages;
1135d465952Smartinh 	uint32_t	 overflow_pages;
1145d465952Smartinh 	uint32_t	 revisions;
1155d465952Smartinh 	uint32_t	 depth;
1165d465952Smartinh 	uint64_t	 entries;
1175d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
118bf245c3bSmartinh } __packed;
1195d465952Smartinh 
1205d465952Smartinh struct btkey {
1215d465952Smartinh 	size_t			 len;
1225d465952Smartinh 	char			 str[MAXKEYSIZE];
1235d465952Smartinh };
1245d465952Smartinh 
1255d465952Smartinh struct mpage {					/* an in-memory cached page */
1265d465952Smartinh 	RB_ENTRY(mpage)		 entry;		/* page cache entry */
1275d465952Smartinh 	SIMPLEQ_ENTRY(mpage)	 next;		/* queue of dirty pages */
1285d465952Smartinh 	TAILQ_ENTRY(mpage)	 lru_next;	/* LRU queue */
1295d465952Smartinh 	struct mpage		*parent;	/* NULL if root */
1305d465952Smartinh 	unsigned int		 parent_index;	/* keep track of node index */
1315d465952Smartinh 	struct btkey		 prefix;
13237ee45a1Smartinh 	struct page		*page;
1335d465952Smartinh 	pgno_t			 pgno;		/* copy of page->pgno */
1345d465952Smartinh 	short			 ref;		/* increased by cursors */
1355d465952Smartinh 	short			 dirty;		/* 1 if on dirty queue */
1365d465952Smartinh };
1375d465952Smartinh RB_HEAD(page_cache, mpage);
1385d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage);
1395d465952Smartinh TAILQ_HEAD(lru_queue, mpage);
1405d465952Smartinh 
1415d465952Smartinh static int		 mpage_cmp(struct mpage *a, struct mpage *b);
1425d465952Smartinh static struct mpage	*mpage_lookup(struct btree *bt, pgno_t pgno);
1435d465952Smartinh static void		 mpage_add(struct btree *bt, struct mpage *mp);
14437ee45a1Smartinh static void		 mpage_free(struct mpage *mp);
1455d465952Smartinh static void		 mpage_del(struct btree *bt, struct mpage *mp);
146e14f8e24Smartinh static void		 mpage_flush(struct btree *bt);
14737ee45a1Smartinh static struct mpage	*mpage_copy(struct btree *bt, struct mpage *mp);
1485d465952Smartinh static void		 mpage_prune(struct btree *bt);
1495d465952Smartinh static void		 mpage_dirty(struct btree *bt, struct mpage *mp);
1505d465952Smartinh static void		 mpage_touch(struct btree *bt, struct mpage *mp);
1515d465952Smartinh 
1525d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);
1535d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp);
1545d465952Smartinh 
1555d465952Smartinh struct ppage {					/* ordered list of pages */
1565d465952Smartinh 	SLIST_ENTRY(ppage)	 entry;
1575d465952Smartinh 	struct mpage		*mpage;
1585d465952Smartinh 	unsigned int		 ki;		/* cursor index on page */
1595d465952Smartinh };
1605d465952Smartinh SLIST_HEAD(page_stack, ppage);
1615d465952Smartinh 
1625d465952Smartinh #define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->stack)
1635d465952Smartinh #define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->stack)
1645d465952Smartinh #define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->stack, entry)
1655d465952Smartinh #define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->stack, p, entry)
1665d465952Smartinh 
1675d465952Smartinh struct cursor {
1685d465952Smartinh 	struct btree		*bt;
1695d465952Smartinh 	struct btree_txn	*txn;
1705d465952Smartinh 	struct page_stack	 stack;		/* stack of parent pages */
1715d465952Smartinh 	short			 initialized;	/* 1 if initialized */
1725d465952Smartinh 	short			 eof;		/* 1 if end is reached */
1735d465952Smartinh };
1745d465952Smartinh 
1755d465952Smartinh #define METAHASHLEN	 offsetof(struct bt_meta, hash)
17637ee45a1Smartinh #define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))
1775d465952Smartinh 
1785d465952Smartinh struct node {
1795d465952Smartinh #define n_pgno		 p.np_pgno
1805d465952Smartinh #define n_dsize		 p.np_dsize
1815d465952Smartinh 	union {
1825d465952Smartinh 		pgno_t		 np_pgno;	/* child page number */
1835d465952Smartinh 		uint32_t	 np_dsize;	/* leaf data size */
1845d465952Smartinh 	}		 p;
1855d465952Smartinh 	uint16_t	 ksize;			/* key size */
1865d465952Smartinh #define F_BIGDATA	 0x01			/* data put on overflow page */
1875d465952Smartinh 	uint8_t		 flags;
188a33d639aSmartinh 	char		 data[1];
189bf245c3bSmartinh } __packed;
1905d465952Smartinh 
1915d465952Smartinh struct btree_txn {
1925d465952Smartinh 	pgno_t			 root;		/* current / new root page */
1935d465952Smartinh 	pgno_t			 next_pgno;	/* next unallocated page */
1945d465952Smartinh 	struct btree		*bt;		/* btree is ref'd */
1955d465952Smartinh 	struct dirty_queue	*dirty_queue;	/* modified pages */
1965d465952Smartinh #define BT_TXN_RDONLY		 0x01		/* read-only transaction */
1975d465952Smartinh #define BT_TXN_ERROR		 0x02		/* an error has occurred */
1985d465952Smartinh 	unsigned int		 flags;
1995d465952Smartinh };
2005d465952Smartinh 
2015d465952Smartinh struct btree {
2025d465952Smartinh 	int			 fd;
2035d465952Smartinh 	char			*path;
2045d465952Smartinh #define BT_FIXPADDING		 0x01		/* internal */
2055d465952Smartinh 	unsigned int		 flags;
2065d465952Smartinh 	bt_cmp_func		 cmp;		/* user compare function */
20737ee45a1Smartinh 	struct bt_head		 head;
20837ee45a1Smartinh 	struct bt_meta		 meta;
2095d465952Smartinh 	struct page_cache	*page_cache;
2105d465952Smartinh 	struct lru_queue	*lru_queue;
2115d465952Smartinh 	struct btree_txn	*txn;		/* current write transaction */
212e14f8e24Smartinh 	int			 ref;		/* increased by cursors & txn */
2135d465952Smartinh 	struct btree_stat	 stat;
2145d465952Smartinh 	off_t			 size;		/* current file size */
2155d465952Smartinh };
2165d465952Smartinh 
2175d465952Smartinh #define NODESIZE	 offsetof(struct node, data)
2185d465952Smartinh 
2195d465952Smartinh #define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->size))
2205d465952Smartinh #define LEAFSIZE(k, d)	 (NODESIZE + (k)->size + (d)->size)
2215d465952Smartinh #define NODEPTRP(p, i)	 ((struct node *)((char *)(p) + (p)->ptrs[i]))
22237ee45a1Smartinh #define NODEPTR(mp, i)	 NODEPTRP((mp)->page, i)
2235d465952Smartinh #define NODEKEY(node)	 (void *)((node)->data)
2245d465952Smartinh #define NODEDATA(node)	 (void *)((char *)(node)->data + (node)->ksize)
2255d465952Smartinh #define NODEPGNO(node)	 ((node)->p.np_pgno)
2265d465952Smartinh #define NODEDSZ(node)	 ((node)->p.np_dsize)
2275d465952Smartinh 
2285d465952Smartinh #define BT_COMMIT_PAGES	 64	/* max number of pages to write in one commit */
2295d465952Smartinh #define BT_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */
2305d465952Smartinh 
2315d465952Smartinh static int		 btree_read_page(struct btree *bt, pgno_t pgno,
2325d465952Smartinh 			    struct page *page);
23337ee45a1Smartinh static struct mpage	*btree_get_mpage(struct btree *bt, pgno_t pgno);
2345d465952Smartinh static int		 btree_search_page_root(struct btree *bt,
2355d465952Smartinh 			    struct mpage *root, struct btval *key,
2365d465952Smartinh 			    struct cursor *cursor, int modify,
2375d465952Smartinh 			    struct mpage **mpp);
2385d465952Smartinh static int		 btree_search_page(struct btree *bt,
2395d465952Smartinh 			    struct btree_txn *txn, struct btval *key,
2405d465952Smartinh 			    struct cursor *cursor, int modify,
2415d465952Smartinh 			    struct mpage **mpp);
2425d465952Smartinh 
24337ee45a1Smartinh static int		 btree_write_header(struct btree *bt, int fd);
24437ee45a1Smartinh static int		 btree_read_header(struct btree *bt);
2455d465952Smartinh static int		 btree_is_meta_page(struct page *p);
2465d465952Smartinh static int		 btree_read_meta(struct btree *bt, pgno_t *p_next);
247e14f8e24Smartinh static int		 btree_write_meta(struct btree *bt, pgno_t root,
248e14f8e24Smartinh 			    unsigned int flags);
249408be415Smartinh static void		 btree_ref(struct btree *bt);
2505d465952Smartinh 
2515d465952Smartinh static struct node	*btree_search_node(struct btree *bt, struct mpage *mp,
2525d465952Smartinh 			    struct btval *key, int *exactp, unsigned int *kip);
2535d465952Smartinh static int		 btree_add_node(struct btree *bt, struct mpage *mp,
2545d465952Smartinh 			    indx_t indx, struct btval *key, struct btval *data,
2555d465952Smartinh 			    pgno_t pgno, uint8_t flags);
2565d465952Smartinh static void		 btree_del_node(struct btree *bt, struct mpage *mp,
2575d465952Smartinh 			    indx_t indx);
2585d465952Smartinh static int		 btree_read_data(struct btree *bt, struct mpage *mp,
2595d465952Smartinh 			    struct node *leaf, struct btval *data);
2605d465952Smartinh 
2615d465952Smartinh static int		 btree_rebalance(struct btree *bt, struct mpage *mp);
2625d465952Smartinh static int		 btree_update_key(struct btree *bt, struct mpage *mp,
2635d465952Smartinh 			    indx_t indx, struct btval *key);
2645d465952Smartinh static int		 btree_adjust_prefix(struct btree *bt,
2655d465952Smartinh 			    struct mpage *src, int delta);
2665d465952Smartinh static int		 btree_move_node(struct btree *bt, struct mpage *src,
2675d465952Smartinh 			    indx_t srcindx, struct mpage *dst, indx_t dstindx);
2685d465952Smartinh static int		 btree_merge(struct btree *bt, struct mpage *src,
2695d465952Smartinh 			    struct mpage *dst);
2705d465952Smartinh static int		 btree_split(struct btree *bt, struct mpage **mpp,
2715d465952Smartinh 			    unsigned int *newindxp, struct btval *newkey,
2725d465952Smartinh 			    struct btval *newdata, pgno_t newpgno);
2735d465952Smartinh static struct mpage	*btree_new_page(struct btree *bt, uint32_t flags);
2745d465952Smartinh static int		 btree_write_overflow_data(struct btree *bt,
2755d465952Smartinh 			    struct page *p, struct btval *data);
2765d465952Smartinh 
2775d465952Smartinh static void		 cursor_pop_page(struct cursor *cursor);
2785d465952Smartinh static struct ppage	 *cursor_push_page(struct cursor *cursor,
2795d465952Smartinh 			    struct mpage *mp);
2805d465952Smartinh 
2815d465952Smartinh static int		 bt_set_key(struct btree *bt, struct mpage *mp,
2825d465952Smartinh 			    struct node *node, struct btval *key);
2835d465952Smartinh static int		 btree_sibling(struct cursor *cursor, int move_right);
2845d465952Smartinh static int		 btree_cursor_next(struct cursor *cursor,
2855d465952Smartinh 			    struct btval *key, struct btval *data);
2865d465952Smartinh static int		 btree_cursor_set(struct cursor *cursor,
2875d465952Smartinh 			    struct btval *key, struct btval *data);
2885d465952Smartinh static int		 btree_cursor_first(struct cursor *cursor,
2895d465952Smartinh 			    struct btval *key, struct btval *data);
2905d465952Smartinh 
2915d465952Smartinh static void		 bt_reduce_separator(struct btree *bt, struct node *min,
2925d465952Smartinh 			    struct btval *sep);
2935d465952Smartinh static void		 remove_prefix(struct btree *bt, struct btval *key,
2945d465952Smartinh 			    size_t pfxlen);
2955d465952Smartinh static void		 expand_prefix(struct btree *bt, struct mpage *mp,
2965d465952Smartinh 			    indx_t indx, struct btkey *expkey);
2975d465952Smartinh static void		 concat_prefix(struct btree *bt, char *s1, size_t n1,
2985d465952Smartinh 			    char *s2, size_t n2, char *cs, size_t *cn);
2995d465952Smartinh static void		 common_prefix(struct btree *bt, struct btkey *min,
3005d465952Smartinh 			    struct btkey *max, struct btkey *pfx);
3015d465952Smartinh static void		 find_common_prefix(struct btree *bt, struct mpage *mp);
3025d465952Smartinh 
30337ee45a1Smartinh static size_t		 bt_leaf_size(struct btree *bt, struct btval *key,
30437ee45a1Smartinh 			    struct btval *data);
30537ee45a1Smartinh static size_t		 bt_branch_size(struct btree *bt, struct btval *key);
3065d465952Smartinh 
3075d465952Smartinh static pgno_t		 btree_compact_tree(struct btree *bt, pgno_t pgno,
30837ee45a1Smartinh 			    struct btree *btc);
3095d465952Smartinh 
3105d465952Smartinh static int		 memncmp(const void *s1, size_t n1,
3115d465952Smartinh 				 const void *s2, size_t n2);
3125d465952Smartinh static int		 memnrcmp(const void *s1, size_t n1,
3135d465952Smartinh 				  const void *s2, size_t n2);
3145d465952Smartinh 
3155d465952Smartinh static int
3165d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
3175d465952Smartinh {
3185d465952Smartinh 	if (n1 < n2) {
3195d465952Smartinh 		if (memcmp(s1, s2, n1) == 0)
3205d465952Smartinh 			return -1;
3215d465952Smartinh 	}
3225d465952Smartinh 	else if (n1 > n2) {
3235d465952Smartinh 		if (memcmp(s1, s2, n2) == 0)
3245d465952Smartinh 			return 1;
3255d465952Smartinh 	}
3265d465952Smartinh 	return memcmp(s1, s2, n1);
3275d465952Smartinh }
3285d465952Smartinh 
3295d465952Smartinh static int
3305d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
3315d465952Smartinh {
3325d465952Smartinh 	const unsigned char	*p1;
3335d465952Smartinh 	const unsigned char	*p2;
3345d465952Smartinh 
3355d465952Smartinh 	if (n1 == 0)
3365d465952Smartinh 		return n2 == 0 ? 0 : -1;
3375d465952Smartinh 
3385d465952Smartinh 	if (n2 == 0)
3395d465952Smartinh 		return n1 == 0 ? 0 : 1;
3405d465952Smartinh 
3415d465952Smartinh 	p1 = (const unsigned char *)s1 + n1 - 1;
3425d465952Smartinh 	p2 = (const unsigned char *)s2 + n2 - 1;
3435d465952Smartinh 
3445d465952Smartinh 	while (*p1 == *p2) {
3455d465952Smartinh 		if (p1 == s1)
3465d465952Smartinh 			return (p2 == s2) ? 0 : -1;
3475d465952Smartinh 		if (p2 == s2)
3485d465952Smartinh 			return (p1 == p2) ? 0 : 1;
3495d465952Smartinh 		p1--;
3505d465952Smartinh 		p2--;
3515d465952Smartinh 	}
3525d465952Smartinh 	return *p1 - *p2;
3535d465952Smartinh }
3545d465952Smartinh 
3555d465952Smartinh int
3565d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)
3575d465952Smartinh {
3585d465952Smartinh 	return bt->cmp(a, b);
3595d465952Smartinh }
3605d465952Smartinh 
3615d465952Smartinh static void
3625d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,
3635d465952Smartinh     struct btkey *pfx)
3645d465952Smartinh {
3655d465952Smartinh 	size_t		 n = 0;
3665d465952Smartinh 	char		*p1;
3675d465952Smartinh 	char		*p2;
3685d465952Smartinh 
3695d465952Smartinh 	if (min->len == 0 || max->len == 0) {
3705d465952Smartinh 		pfx->len = 0;
3715d465952Smartinh 		return;
3725d465952Smartinh 	}
3735d465952Smartinh 
3745d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
3755d465952Smartinh 		p1 = min->str + min->len - 1;
3765d465952Smartinh 		p2 = max->str + max->len - 1;
3775d465952Smartinh 
3785d465952Smartinh 		while (*p1 == *p2) {
3795d465952Smartinh 			if (p1 < min->str || p2 < max->str)
3805d465952Smartinh 				break;
3815d465952Smartinh 			p1--;
3825d465952Smartinh 			p2--;
3835d465952Smartinh 			n++;
3845d465952Smartinh 		}
3855d465952Smartinh 
3865d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
3875d465952Smartinh 		pfx->len = n;
3885d465952Smartinh 		bcopy(p2 + 1, pfx->str, n);
3895d465952Smartinh 	} else {
3905d465952Smartinh 		p1 = min->str;
3915d465952Smartinh 		p2 = max->str;
3925d465952Smartinh 
3935d465952Smartinh 		while (*p1 == *p2) {
3945d465952Smartinh 			if (n == min->len || n == max->len)
3955d465952Smartinh 				break;
3965d465952Smartinh 			p1++;
3975d465952Smartinh 			p2++;
3985d465952Smartinh 			n++;
3995d465952Smartinh 		}
4005d465952Smartinh 
4015d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
4025d465952Smartinh 		pfx->len = n;
4035d465952Smartinh 		bcopy(max->str, pfx->str, n);
4045d465952Smartinh 	}
4055d465952Smartinh }
4065d465952Smartinh 
4075d465952Smartinh static void
4085d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)
4095d465952Smartinh {
4105d465952Smartinh 	if (pfxlen == 0 || bt->cmp != NULL)
4115d465952Smartinh 		return;
4125d465952Smartinh 
4135d465952Smartinh 	DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen,
4145d465952Smartinh 	    (int)key->size, (char *)key->data);
4155d465952Smartinh 	assert(pfxlen <= key->size);
4165d465952Smartinh 	key->size -= pfxlen;
4175d465952Smartinh 	if (!F_ISSET(bt->flags, BT_REVERSEKEY))
4185d465952Smartinh 		key->data = (char *)key->data + pfxlen;
4195d465952Smartinh }
4205d465952Smartinh 
4215d465952Smartinh static void
4225d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,
4235d465952Smartinh     struct btkey *expkey)
4245d465952Smartinh {
4255d465952Smartinh 	struct node	*node;
4265d465952Smartinh 
4275d465952Smartinh 	node = NODEPTR(mp, indx);
4285d465952Smartinh 	expkey->len = sizeof(expkey->str);
4295d465952Smartinh 	concat_prefix(bt, mp->prefix.str, mp->prefix.len,
4305d465952Smartinh 	    NODEKEY(node), node->ksize, expkey->str, &expkey->len);
4315d465952Smartinh }
4325d465952Smartinh 
4335d465952Smartinh static int
4345d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,
4355d465952Smartinh     struct btkey *pfx)
4365d465952Smartinh {
4375d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY))
4385d465952Smartinh 		return memnrcmp(key1->data, key1->size - pfx->len,
4395d465952Smartinh 		    key2->data, key2->size);
4405d465952Smartinh 	else
4415d465952Smartinh 		return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,
4425d465952Smartinh 		    key2->data, key2->size);
4435d465952Smartinh }
4445d465952Smartinh 
4455d465952Smartinh void
4465d465952Smartinh btval_reset(struct btval *btv)
4475d465952Smartinh {
4485d465952Smartinh 	if (btv) {
4495d465952Smartinh 		if (btv->mp)
4505d465952Smartinh 			btv->mp->ref--;
4515d465952Smartinh 		if (btv->free_data)
4525d465952Smartinh 			free(btv->data);
4535d465952Smartinh 		bzero(btv, sizeof(*btv));
4545d465952Smartinh 	}
4555d465952Smartinh }
4565d465952Smartinh 
4575d465952Smartinh static int
4585d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b)
4595d465952Smartinh {
4605d465952Smartinh 	if (a->pgno > b->pgno)
4615d465952Smartinh 		return 1;
4625d465952Smartinh 	if (a->pgno < b->pgno)
4635d465952Smartinh 		return -1;
4645d465952Smartinh 	return 0;
4655d465952Smartinh }
4665d465952Smartinh 
4675d465952Smartinh static struct mpage *
4685d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno)
4695d465952Smartinh {
4705d465952Smartinh 	struct mpage	 find, *mp;
4715d465952Smartinh 
4725d465952Smartinh 	find.pgno = pgno;
4735d465952Smartinh 	mp = RB_FIND(page_cache, bt->page_cache, &find);
4745d465952Smartinh 	if (mp) {
4755d465952Smartinh 		bt->stat.hits++;
4765d465952Smartinh 		/* Update LRU queue. Move page to the end. */
4775d465952Smartinh 		TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
4785d465952Smartinh 		TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
4795d465952Smartinh 	}
4805d465952Smartinh 	return mp;
4815d465952Smartinh }
4825d465952Smartinh 
4835d465952Smartinh static void
4845d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp)
4855d465952Smartinh {
4865d465952Smartinh 	assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL);
4875d465952Smartinh 	bt->stat.cache_size++;
4885d465952Smartinh 	TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
48937ee45a1Smartinh 	mpage_prune(bt);
49037ee45a1Smartinh }
49137ee45a1Smartinh 
49237ee45a1Smartinh static void
49337ee45a1Smartinh mpage_free(struct mpage *mp)
49437ee45a1Smartinh {
49537ee45a1Smartinh 	if (mp != NULL) {
49637ee45a1Smartinh 		free(mp->page);
49737ee45a1Smartinh 		free(mp);
49837ee45a1Smartinh 	}
4995d465952Smartinh }
5005d465952Smartinh 
5015d465952Smartinh static void
5025d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp)
5035d465952Smartinh {
5045d465952Smartinh 	assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp);
5055d465952Smartinh 	assert(bt->stat.cache_size > 0);
5065d465952Smartinh 	bt->stat.cache_size--;
5075d465952Smartinh 	TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
5085d465952Smartinh }
5095d465952Smartinh 
510e14f8e24Smartinh static void
511e14f8e24Smartinh mpage_flush(struct btree *bt)
512e14f8e24Smartinh {
513e14f8e24Smartinh 	struct mpage	*mp;
514e14f8e24Smartinh 
515e14f8e24Smartinh 	while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) {
516e14f8e24Smartinh 		mpage_del(bt, mp);
51737ee45a1Smartinh 		mpage_free(mp);
518e14f8e24Smartinh 	}
519e14f8e24Smartinh }
520e14f8e24Smartinh 
5215d465952Smartinh static struct mpage *
52237ee45a1Smartinh mpage_copy(struct btree *bt, struct mpage *mp)
5235d465952Smartinh {
5245d465952Smartinh 	struct mpage	*copy;
5255d465952Smartinh 
5265d465952Smartinh 	if ((copy = calloc(1, sizeof(*copy))) == NULL)
5275d465952Smartinh 		return NULL;
52837ee45a1Smartinh 	if ((copy->page = malloc(bt->head.psize)) == NULL) {
52937ee45a1Smartinh 		free(copy);
53037ee45a1Smartinh 		return NULL;
53137ee45a1Smartinh 	}
53237ee45a1Smartinh 	bcopy(mp->page, copy->page, bt->head.psize);
5335d465952Smartinh 	bcopy(&mp->prefix, &copy->prefix, sizeof(mp->prefix));
5345d465952Smartinh 	copy->parent = mp->parent;
5355d465952Smartinh 	copy->parent_index = mp->parent_index;
5365d465952Smartinh 	copy->pgno = mp->pgno;
5375d465952Smartinh 
5385d465952Smartinh 	return copy;
5395d465952Smartinh }
5405d465952Smartinh 
5415d465952Smartinh /* Remove the least recently used memory pages until the cache size is
5425d465952Smartinh  * within the configured bounds. Pages referenced by cursors or returned
5435d465952Smartinh  * key/data are not pruned.
5445d465952Smartinh  */
5455d465952Smartinh static void
5465d465952Smartinh mpage_prune(struct btree *bt)
5475d465952Smartinh {
5485d465952Smartinh 	struct mpage	*mp, *next;
5495d465952Smartinh 
5505d465952Smartinh 	for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) {
5515d465952Smartinh 		if (bt->stat.cache_size <= bt->stat.max_cache)
5525d465952Smartinh 			break;
5535d465952Smartinh 		next = TAILQ_NEXT(mp, lru_next);
5545d465952Smartinh 		if (!mp->dirty && mp->ref <= 0) {
5555d465952Smartinh 			mpage_del(bt, mp);
55637ee45a1Smartinh 			mpage_free(mp);
5575d465952Smartinh 		}
5585d465952Smartinh 	}
5595d465952Smartinh }
5605d465952Smartinh 
5615d465952Smartinh /* Mark a page as dirty and push it on the dirty queue.
5625d465952Smartinh  */
5635d465952Smartinh static void
5645d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp)
5655d465952Smartinh {
5665d465952Smartinh 	assert(bt != NULL);
5675d465952Smartinh 	assert(bt->txn != NULL);
5685d465952Smartinh 
5695d465952Smartinh 	if (!mp->dirty) {
5705d465952Smartinh 		mp->dirty = 1;
5715d465952Smartinh 		SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
5725d465952Smartinh 	}
5735d465952Smartinh }
5745d465952Smartinh 
5755d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno.
5765d465952Smartinh  */
5775d465952Smartinh static void
5785d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp)
5795d465952Smartinh {
5805d465952Smartinh 	assert(bt != NULL);
5815d465952Smartinh 	assert(bt->txn != NULL);
5825d465952Smartinh 	assert(mp != NULL);
5835d465952Smartinh 
5845d465952Smartinh 	if (!mp->dirty) {
5855d465952Smartinh 		DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
5865d465952Smartinh 		if (mp->ref == 0)
5875d465952Smartinh 			mpage_del(bt, mp);
5885d465952Smartinh 		else
58937ee45a1Smartinh 			mp = mpage_copy(bt, mp);
59037ee45a1Smartinh 		mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
5915d465952Smartinh 		mpage_dirty(bt, mp);
5925d465952Smartinh 		mpage_add(bt, mp);
5935d465952Smartinh 
5945d465952Smartinh 		/* Update the page number to new touched page. */
5955d465952Smartinh 		if (mp->parent != NULL)
5965d465952Smartinh 			NODEPGNO(NODEPTR(mp->parent,
5975d465952Smartinh 			    mp->parent_index)) = mp->pgno;
5985d465952Smartinh 	}
5995d465952Smartinh }
6005d465952Smartinh 
6015d465952Smartinh static int
6025d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
6035d465952Smartinh {
6045d465952Smartinh 	ssize_t		 rc;
6055d465952Smartinh 
6065d465952Smartinh 	DPRINTF("reading page %u", pgno);
6075d465952Smartinh 	bt->stat.reads++;
60837ee45a1Smartinh 	if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) {
6095d465952Smartinh 		DPRINTF("page %u doesn't exist", pgno);
6100279e526Smartinh 		errno = ENOENT;
6110279e526Smartinh 		return BT_FAIL;
612*36e226d9Smartinh 	} else if (rc != (ssize_t)bt->head.psize) {
61337ee45a1Smartinh 		if (rc > 0)
61437ee45a1Smartinh 			errno = EINVAL;
6155d465952Smartinh 		DPRINTF("read: %s", strerror(errno));
6165d465952Smartinh 		return BT_FAIL;
6175d465952Smartinh 	}
6185d465952Smartinh 
6195d465952Smartinh 	if (page->pgno != pgno) {
6205d465952Smartinh 		DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
6215d465952Smartinh 		errno = EINVAL;
6225d465952Smartinh 		return BT_FAIL;
6235d465952Smartinh 	}
6245d465952Smartinh 
62537ee45a1Smartinh 	DPRINTF("page %u has flags 0x%X", pgno, page->flags);
62637ee45a1Smartinh 
6275d465952Smartinh 	return BT_SUCCESS;
6285d465952Smartinh }
6295d465952Smartinh 
6305d465952Smartinh int
6315d465952Smartinh btree_sync(struct btree *bt)
6325d465952Smartinh {
6335d465952Smartinh 	if (!F_ISSET(bt->flags, BT_NOSYNC))
6345d465952Smartinh 		return fsync(bt->fd);
6355d465952Smartinh 	return 0;
6365d465952Smartinh }
6375d465952Smartinh 
6385d465952Smartinh struct btree_txn *
6395d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly)
6405d465952Smartinh {
6415d465952Smartinh 	struct btree_txn	*txn;
6425d465952Smartinh 
6435d465952Smartinh 	if (!rdonly && bt->txn != NULL) {
6445d465952Smartinh 		DPRINTF("write transaction already begun");
6450279e526Smartinh 		errno = EBUSY;
6465d465952Smartinh 		return NULL;
6475d465952Smartinh 	}
6485d465952Smartinh 
6495d465952Smartinh 	if ((txn = calloc(1, sizeof(*txn))) == NULL) {
6505d465952Smartinh 		DPRINTF("calloc: %s", strerror(errno));
6515d465952Smartinh 		return NULL;
6525d465952Smartinh 	}
6535d465952Smartinh 
6545d465952Smartinh 	if (rdonly) {
6555d465952Smartinh 		txn->flags |= BT_TXN_RDONLY;
6565d465952Smartinh 	} else {
6575d465952Smartinh 		txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
6585d465952Smartinh 		if (txn->dirty_queue == NULL) {
6595d465952Smartinh 			free(txn);
6605d465952Smartinh 			return NULL;
6615d465952Smartinh 		}
6625d465952Smartinh 		SIMPLEQ_INIT(txn->dirty_queue);
6635d465952Smartinh 
6645d465952Smartinh 		DPRINTF("taking write lock on txn %p", txn);
6655d465952Smartinh 		if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
6660279e526Smartinh 			DPRINTF("flock: %s", strerror(errno));
6670279e526Smartinh 			errno = EBUSY;
6685d465952Smartinh 			free(txn->dirty_queue);
6695d465952Smartinh 			free(txn);
6705d465952Smartinh 			return NULL;
6715d465952Smartinh 		}
6725d465952Smartinh 		bt->txn = txn;
6735d465952Smartinh 	}
6745d465952Smartinh 
6755d465952Smartinh 	txn->bt = bt;
676408be415Smartinh 	btree_ref(bt);
6775d465952Smartinh 
6780279e526Smartinh 	if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
6795d465952Smartinh 		btree_txn_abort(txn);
6805d465952Smartinh 		return NULL;
6815d465952Smartinh 	}
6825d465952Smartinh 
68337ee45a1Smartinh 	txn->root = bt->meta.root;
6845d465952Smartinh 	DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);
6855d465952Smartinh 
6865d465952Smartinh 	return txn;
6875d465952Smartinh }
6885d465952Smartinh 
6895d465952Smartinh void
6905d465952Smartinh btree_txn_abort(struct btree_txn *txn)
6915d465952Smartinh {
6925d465952Smartinh 	struct mpage	*mp;
6935d465952Smartinh 	struct btree	*bt;
6945d465952Smartinh 
6955d465952Smartinh 	if (txn == NULL)
6965d465952Smartinh 		return;
6975d465952Smartinh 
6985d465952Smartinh 	bt = txn->bt;
6995d465952Smartinh 	DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);
7005d465952Smartinh 
701f189d82aSmartinh 	if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7025d465952Smartinh 		/* Discard all dirty pages.
7035d465952Smartinh 		 */
7045d465952Smartinh 		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
7055d465952Smartinh 			mp = SIMPLEQ_FIRST(txn->dirty_queue);
7065d465952Smartinh 			assert(mp->ref == 0);	/* cursors should be closed */
7075d465952Smartinh 			mpage_del(bt, mp);
7085d465952Smartinh 			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
7095d465952Smartinh 		}
7105d465952Smartinh 
7115d465952Smartinh 		DPRINTF("releasing write lock on txn %p", txn);
7125d465952Smartinh 		txn->bt->txn = NULL;
7135d465952Smartinh 		if (flock(txn->bt->fd, LOCK_UN) != 0) {
7145d465952Smartinh 			DPRINTF("failed to unlock fd %d: %s",
7155d465952Smartinh 			    txn->bt->fd, strerror(errno));
7165d465952Smartinh 		}
7175d465952Smartinh 		free(txn->dirty_queue);
7185d465952Smartinh 	}
7195d465952Smartinh 
7205d465952Smartinh 	btree_close(txn->bt);
7215d465952Smartinh 	free(txn);
7225d465952Smartinh }
7235d465952Smartinh 
7245d465952Smartinh int
7255d465952Smartinh btree_txn_commit(struct btree_txn *txn)
7265d465952Smartinh {
7275d465952Smartinh 	int		 n, done;
7285d465952Smartinh 	ssize_t		 rc;
7295d465952Smartinh 	off_t		 size;
7305d465952Smartinh 	struct mpage	*mp;
7315d465952Smartinh 	struct btree	*bt;
7325d465952Smartinh 	struct iovec	 iov[BT_COMMIT_PAGES];
7335d465952Smartinh 
7345d465952Smartinh 	assert(txn != NULL);
7355d465952Smartinh 	assert(txn->bt != NULL);
7365d465952Smartinh 
7375d465952Smartinh 	bt = txn->bt;
7385d465952Smartinh 
7395d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7405d465952Smartinh 		DPRINTF("attempt to commit read-only transaction");
7415d465952Smartinh 		btree_txn_abort(txn);
7420279e526Smartinh 		errno = EPERM;
7435d465952Smartinh 		return BT_FAIL;
7445d465952Smartinh 	}
7455d465952Smartinh 
7465d465952Smartinh 	if (txn != bt->txn) {
7475d465952Smartinh 		DPRINTF("attempt to commit unknown transaction");
7485d465952Smartinh 		btree_txn_abort(txn);
7490279e526Smartinh 		errno = EINVAL;
7505d465952Smartinh 		return BT_FAIL;
7515d465952Smartinh 	}
7525d465952Smartinh 
7535d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
7545d465952Smartinh 		DPRINTF("error flag is set, can't commit");
7555d465952Smartinh 		btree_txn_abort(txn);
7560279e526Smartinh 		errno = EINVAL;
7575d465952Smartinh 		return BT_FAIL;
7585d465952Smartinh 	}
7595d465952Smartinh 
7605d465952Smartinh 	if (SIMPLEQ_EMPTY(txn->dirty_queue))
7615d465952Smartinh 		goto done;
7625d465952Smartinh 
7635d465952Smartinh 	if (F_ISSET(bt->flags, BT_FIXPADDING)) {
7645d465952Smartinh 		size = lseek(bt->fd, 0, SEEK_END);
76537ee45a1Smartinh 		size += bt->head.psize - (size % bt->head.psize);
7665d465952Smartinh 		DPRINTF("extending to multiple of page size: %llu", size);
7675d465952Smartinh 		if (ftruncate(bt->fd, size) != 0) {
7685d465952Smartinh 			DPRINTF("ftruncate: %s", strerror(errno));
7695d465952Smartinh 			btree_txn_abort(txn);
7705d465952Smartinh 			return BT_FAIL;
7715d465952Smartinh 		}
7725d465952Smartinh 		bt->flags &= ~BT_FIXPADDING;
7735d465952Smartinh 	}
7745d465952Smartinh 
7755d465952Smartinh 	DPRINTF("committing transaction on btree %p, root page %u",
7765d465952Smartinh 	    bt, txn->root);
7775d465952Smartinh 
7785d465952Smartinh 	/* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
7795d465952Smartinh 	 */
7805d465952Smartinh 	do {
7815d465952Smartinh 		n = 0;
7825d465952Smartinh 		done = 1;
7835d465952Smartinh 		SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
7845d465952Smartinh 			DPRINTF("commiting page %u", mp->pgno);
78537ee45a1Smartinh 			iov[n].iov_len = bt->head.psize;
78637ee45a1Smartinh 			iov[n].iov_base = mp->page;
7875d465952Smartinh 			if (++n >= BT_COMMIT_PAGES) {
7885d465952Smartinh 				done = 0;
7895d465952Smartinh 				break;
7905d465952Smartinh 			}
7915d465952Smartinh 		}
7925d465952Smartinh 
7935d465952Smartinh 		if (n == 0)
7945d465952Smartinh 			break;
7955d465952Smartinh 
7965d465952Smartinh 		DPRINTF("commiting %u dirty pages", n);
7975d465952Smartinh 		rc = writev(bt->fd, iov, n);
798*36e226d9Smartinh 		if (rc != (ssize_t)bt->head.psize*n) {
7995d465952Smartinh 			if (rc > 0)
8005d465952Smartinh 				DPRINTF("short write, filesystem full?");
8015d465952Smartinh 			else
8025d465952Smartinh 				DPRINTF("writev: %s", strerror(errno));
8035d465952Smartinh 			btree_txn_abort(txn);
8045d465952Smartinh 			return BT_FAIL;
8055d465952Smartinh 		}
8065d465952Smartinh 
8075d465952Smartinh 		/* Remove the dirty flag from the written pages.
8085d465952Smartinh 		 */
8095d465952Smartinh 		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
8105d465952Smartinh 			mp = SIMPLEQ_FIRST(txn->dirty_queue);
8115d465952Smartinh 			mp->dirty = 0;
8125d465952Smartinh 			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
8135d465952Smartinh 			if (--n == 0)
8145d465952Smartinh 				break;
8155d465952Smartinh 		}
8165d465952Smartinh 	} while (!done);
8175d465952Smartinh 
8185d465952Smartinh 	if (btree_sync(bt) != 0 ||
819e14f8e24Smartinh 	    btree_write_meta(bt, txn->root, 0) != BT_SUCCESS ||
8205d465952Smartinh 	    btree_sync(bt) != 0) {
8215d465952Smartinh 		btree_txn_abort(txn);
8225d465952Smartinh 		return BT_FAIL;
8235d465952Smartinh 	}
8245d465952Smartinh 
8255d465952Smartinh done:
8265d465952Smartinh 	mpage_prune(bt);
8275d465952Smartinh 	btree_txn_abort(txn);
8285d465952Smartinh 
8295d465952Smartinh 	return BT_SUCCESS;
8305d465952Smartinh }
8315d465952Smartinh 
8325d465952Smartinh static int
83337ee45a1Smartinh btree_write_header(struct btree *bt, int fd)
83437ee45a1Smartinh {
83537ee45a1Smartinh 	struct stat	 sb;
83637ee45a1Smartinh 	struct bt_head	*h;
83737ee45a1Smartinh 	struct page	*p;
83837ee45a1Smartinh 	ssize_t		 rc;
83937ee45a1Smartinh 	unsigned int	 psize;
84037ee45a1Smartinh 
84137ee45a1Smartinh 	DPRINTF("writing header page");
84237ee45a1Smartinh 	assert(bt != NULL);
84337ee45a1Smartinh 
84437ee45a1Smartinh 	/* Ask stat for 'optimal blocksize for I/O'.
84537ee45a1Smartinh 	 */
84637ee45a1Smartinh 	if (fstat(fd, &sb) == 0)
84737ee45a1Smartinh 		psize = sb.st_blksize;
84837ee45a1Smartinh 	else
84937ee45a1Smartinh 		psize = PAGESIZE;
85037ee45a1Smartinh 
85137ee45a1Smartinh 	if ((p = calloc(1, psize)) == NULL)
85237ee45a1Smartinh 		return -1;
85337ee45a1Smartinh 	p->flags = P_HEAD;
85437ee45a1Smartinh 
85537ee45a1Smartinh 	h = METADATA(p);
85637ee45a1Smartinh 	h->magic = BT_MAGIC;
85737ee45a1Smartinh 	h->version = BT_VERSION;
85837ee45a1Smartinh 	h->psize = psize;
85937ee45a1Smartinh 	bcopy(h, &bt->head, sizeof(*h));
86037ee45a1Smartinh 
86137ee45a1Smartinh 	rc = write(fd, p, bt->head.psize);
86237ee45a1Smartinh 	free(p);
863*36e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize) {
86437ee45a1Smartinh 		if (rc > 0)
86537ee45a1Smartinh 			DPRINTF("short write, filesystem full?");
86637ee45a1Smartinh 		return BT_FAIL;
86737ee45a1Smartinh 	}
86837ee45a1Smartinh 
86937ee45a1Smartinh 	return BT_SUCCESS;
87037ee45a1Smartinh }
87137ee45a1Smartinh 
87237ee45a1Smartinh static int
87337ee45a1Smartinh btree_read_header(struct btree *bt)
87437ee45a1Smartinh {
87537ee45a1Smartinh 	char		 page[PAGESIZE];
87637ee45a1Smartinh 	struct page	*p;
87737ee45a1Smartinh 	struct bt_head	*h;
87837ee45a1Smartinh 	int		 rc;
87937ee45a1Smartinh 
88037ee45a1Smartinh 	assert(bt != NULL);
88137ee45a1Smartinh 
88237ee45a1Smartinh 	/* We don't know the page size yet, so use a minimum value.
88337ee45a1Smartinh 	 */
88437ee45a1Smartinh 
88537ee45a1Smartinh 	if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) {
88637ee45a1Smartinh 		errno = ENOENT;
88737ee45a1Smartinh 		return -1;
88837ee45a1Smartinh 	} else if (rc != PAGESIZE) {
88937ee45a1Smartinh 		if (rc > 0)
89037ee45a1Smartinh 			errno = EINVAL;
89137ee45a1Smartinh 		DPRINTF("read: %s", strerror(errno));
89237ee45a1Smartinh 		return -1;
89337ee45a1Smartinh 	}
89437ee45a1Smartinh 
89537ee45a1Smartinh 	p = (struct page *)page;
89637ee45a1Smartinh 
89737ee45a1Smartinh 	if (!F_ISSET(p->flags, P_HEAD)) {
89837ee45a1Smartinh 		DPRINTF("page %d not a header page", p->pgno);
89937ee45a1Smartinh 		errno = EINVAL;
90037ee45a1Smartinh 		return -1;
90137ee45a1Smartinh 	}
90237ee45a1Smartinh 
90337ee45a1Smartinh 	h = METADATA(p);
90437ee45a1Smartinh 	if (h->magic != BT_MAGIC) {
90537ee45a1Smartinh 		DPRINTF("header has invalid magic");
90637ee45a1Smartinh 		errno = EINVAL;
90737ee45a1Smartinh 		return -1;
90837ee45a1Smartinh 	}
90937ee45a1Smartinh 
91037ee45a1Smartinh 	if (h->version != BT_VERSION) {
91137ee45a1Smartinh 		DPRINTF("database is version %u, expected version %u",
91237ee45a1Smartinh 		    bt->head.version, BT_VERSION);
91337ee45a1Smartinh 		errno = EINVAL;
91437ee45a1Smartinh 		return -1;
91537ee45a1Smartinh 	}
91637ee45a1Smartinh 
91737ee45a1Smartinh 	bcopy(h, &bt->head, sizeof(*h));
91837ee45a1Smartinh 	return 0;
91937ee45a1Smartinh }
92037ee45a1Smartinh 
92137ee45a1Smartinh static int
922e14f8e24Smartinh btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags)
9235d465952Smartinh {
92437ee45a1Smartinh 	struct mpage	*mp;
92537ee45a1Smartinh 	struct bt_meta	*meta;
9265d465952Smartinh 	ssize_t		 rc;
9275d465952Smartinh 
9285d465952Smartinh 	DPRINTF("writing meta page for root page %u", root);
9295d465952Smartinh 
9305d465952Smartinh 	assert(bt != NULL);
9315d465952Smartinh 	assert(bt->txn != NULL);
9325d465952Smartinh 
93337ee45a1Smartinh 	if ((mp = btree_new_page(bt, P_META)) == NULL)
93437ee45a1Smartinh 		return -1;
9355d465952Smartinh 
93637ee45a1Smartinh 	bt->meta.prev_meta = bt->meta.root;
93737ee45a1Smartinh 	bt->meta.root = root;
93837ee45a1Smartinh 	bt->meta.flags = flags;
93937ee45a1Smartinh 	bt->meta.created_at = time(0);
94037ee45a1Smartinh 	bt->meta.revisions++;
94137ee45a1Smartinh 	SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash);
9425d465952Smartinh 
94337ee45a1Smartinh 	/* Copy the meta data changes to the new meta page. */
94437ee45a1Smartinh 	meta = METADATA(mp->page);
94537ee45a1Smartinh 	bcopy(&bt->meta, meta, sizeof(*meta));
94637ee45a1Smartinh 
947*36e226d9Smartinh 	rc = write(bt->fd, mp->page, bt->head.psize);
948*36e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize) {
9495d465952Smartinh 		if (rc > 0)
9505d465952Smartinh 			DPRINTF("short write, filesystem full?");
9515d465952Smartinh 		return BT_FAIL;
9525d465952Smartinh 	}
9535d465952Smartinh 
9545d465952Smartinh 	if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
9555d465952Smartinh 		DPRINTF("failed to update file size: %s", strerror(errno));
9565d465952Smartinh 		bt->size = 0;
9575d465952Smartinh 	}
9585d465952Smartinh 
9595d465952Smartinh 	return BT_SUCCESS;
9605d465952Smartinh }
9615d465952Smartinh 
9625d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise.
9635d465952Smartinh  */
9645d465952Smartinh static int
9655d465952Smartinh btree_is_meta_page(struct page *p)
9665d465952Smartinh {
9675d465952Smartinh 	struct bt_meta	*m;
9685d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
9695d465952Smartinh 
9705d465952Smartinh 	m = METADATA(p);
9715d465952Smartinh 	if (!F_ISSET(p->flags, P_META)) {
9725d465952Smartinh 		DPRINTF("page %d not a meta page", p->pgno);
97337ee45a1Smartinh 		errno = EINVAL;
9745d465952Smartinh 		return 0;
9755d465952Smartinh 	}
9765d465952Smartinh 
9775d465952Smartinh 	if (m->root >= p->pgno && m->root != P_INVALID) {
9785d465952Smartinh 		DPRINTF("page %d points to an invalid root page", p->pgno);
97937ee45a1Smartinh 		errno = EINVAL;
9805d465952Smartinh 		return 0;
9815d465952Smartinh 	}
9825d465952Smartinh 
9835d465952Smartinh 	SHA1((unsigned char *)m, METAHASHLEN, hash);
9845d465952Smartinh 	if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) {
9855d465952Smartinh 		DPRINTF("page %d has an invalid digest", p->pgno);
98637ee45a1Smartinh 		errno = EINVAL;
9875d465952Smartinh 		return 0;
9885d465952Smartinh 	}
9895d465952Smartinh 
9905d465952Smartinh 	return 1;
9915d465952Smartinh }
9925d465952Smartinh 
9935d465952Smartinh static int
9945d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next)
9955d465952Smartinh {
99637ee45a1Smartinh 	struct mpage	*mp;
99737ee45a1Smartinh 	struct bt_meta	*meta;
9985d465952Smartinh 	pgno_t		 meta_pgno, next_pgno;
9995d465952Smartinh 	off_t		 size;
10005d465952Smartinh 
10015d465952Smartinh 	assert(bt != NULL);
10025d465952Smartinh 
10035d465952Smartinh 	if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
10045d465952Smartinh 		goto fail;
10055d465952Smartinh 
10065d465952Smartinh 	DPRINTF("btree_read_meta: size = %llu", size);
10075d465952Smartinh 
10085d465952Smartinh 	if (size < bt->size) {
10095d465952Smartinh 		DPRINTF("file has shrunk!");
10105d465952Smartinh 		errno = EIO;
10115d465952Smartinh 		goto fail;
10125d465952Smartinh 	}
10135d465952Smartinh 
101437ee45a1Smartinh 	if (size == bt->head.psize) {		/* there is only the header */
10155d465952Smartinh 		if (p_next != NULL)
101637ee45a1Smartinh 			*p_next = 1;
10170279e526Smartinh 		return BT_SUCCESS;		/* new file */
10185d465952Smartinh 	}
10195d465952Smartinh 
102037ee45a1Smartinh 	next_pgno = size / bt->head.psize;
10215d465952Smartinh 	if (next_pgno == 0) {
10225d465952Smartinh 		DPRINTF("corrupt file");
10235d465952Smartinh 		errno = EIO;
10245d465952Smartinh 		goto fail;
10255d465952Smartinh 	}
10265d465952Smartinh 
10275d465952Smartinh 	meta_pgno = next_pgno - 1;
10285d465952Smartinh 
102937ee45a1Smartinh 	if (size % bt->head.psize != 0) {
10305d465952Smartinh 		DPRINTF("filesize not a multiple of the page size!");
10315d465952Smartinh 		bt->flags |= BT_FIXPADDING;
10325d465952Smartinh 		next_pgno++;
10335d465952Smartinh 	}
10345d465952Smartinh 
10355d465952Smartinh 	if (p_next != NULL)
10365d465952Smartinh 		*p_next = next_pgno;
10375d465952Smartinh 
10385d465952Smartinh 	if (size == bt->size) {
10395d465952Smartinh 		DPRINTF("size unchanged, keeping current meta page");
104037ee45a1Smartinh 		if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
104162436105Smartinh 			DPRINTF("file is dead");
10420279e526Smartinh 			errno = ESTALE;
10430279e526Smartinh 			return BT_FAIL;
104462436105Smartinh 		} else
10455d465952Smartinh 			return BT_SUCCESS;
10465d465952Smartinh 	}
10475d465952Smartinh 	bt->size = size;
10485d465952Smartinh 
10495d465952Smartinh 	while (meta_pgno > 0) {
105037ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)
105137ee45a1Smartinh 			break;
105237ee45a1Smartinh 		if (btree_is_meta_page(mp->page)) {
105337ee45a1Smartinh 			meta = METADATA(mp->page);
105437ee45a1Smartinh 			DPRINTF("flags = 0x%x", meta->flags);
105537ee45a1Smartinh 			if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
1056e14f8e24Smartinh 				DPRINTF("file is dead");
10570279e526Smartinh 				errno = ESTALE;
10580279e526Smartinh 				return BT_FAIL;
105937ee45a1Smartinh 			} else {
106037ee45a1Smartinh 				/* Make copy of last meta page. */
106137ee45a1Smartinh 				bcopy(meta, &bt->meta, sizeof(bt->meta));
10625d465952Smartinh 				return BT_SUCCESS;
10635d465952Smartinh 			}
106437ee45a1Smartinh 		}
10655d465952Smartinh 		--meta_pgno;	/* scan backwards to first valid meta page */
10665d465952Smartinh 	}
10675d465952Smartinh 
10685d465952Smartinh 	errno = EIO;
10695d465952Smartinh fail:
10705d465952Smartinh 	if (p_next != NULL)
10715d465952Smartinh 		*p_next = P_INVALID;
10725d465952Smartinh 	return BT_FAIL;
10735d465952Smartinh }
10745d465952Smartinh 
10755d465952Smartinh struct btree *
1076176c8cbcSmartinh btree_open_fd(int fd, unsigned int flags)
10775d465952Smartinh {
10785d465952Smartinh 	struct btree	*bt;
107937ee45a1Smartinh 	int		 fl;
10805d465952Smartinh 
10815d465952Smartinh 	fl = fcntl(fd, F_GETFL, 0);
10825d465952Smartinh 	if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)
10835d465952Smartinh 		return NULL;
10845d465952Smartinh 
10855d465952Smartinh 	if ((bt = calloc(1, sizeof(*bt))) == NULL)
10865d465952Smartinh 		return NULL;
10875d465952Smartinh 	bt->fd = fd;
10885d465952Smartinh 	bt->flags = flags;
10895d465952Smartinh 	bt->flags &= ~BT_FIXPADDING;
1090408be415Smartinh 	bt->ref = 1;
109137ee45a1Smartinh 	bt->meta.root = P_INVALID;
10925d465952Smartinh 
10935d465952Smartinh 	if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)
10945d465952Smartinh 		goto fail;
10955d465952Smartinh 	bt->stat.max_cache = BT_MAXCACHE_DEF;
10965d465952Smartinh 	RB_INIT(bt->page_cache);
10975d465952Smartinh 
10985d465952Smartinh 	if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)
10995d465952Smartinh 		goto fail;
11005d465952Smartinh 	TAILQ_INIT(bt->lru_queue);
11015d465952Smartinh 
110237ee45a1Smartinh 	if (btree_read_header(bt) != 0) {
110337ee45a1Smartinh 		if (errno != ENOENT)
11045d465952Smartinh 			goto fail;
110537ee45a1Smartinh 		DPRINTF("new database");
110637ee45a1Smartinh 		btree_write_header(bt, bt->fd);
110737ee45a1Smartinh 	}
11085d465952Smartinh 
11090279e526Smartinh 	if (btree_read_meta(bt, NULL) != 0)
11105d465952Smartinh 		goto fail;
11115d465952Smartinh 
11125d465952Smartinh 	DPRINTF("opened database version %u, pagesize %u",
111337ee45a1Smartinh 	    bt->head.version, bt->head.psize);
111437ee45a1Smartinh 	DPRINTF("timestamp: %s", ctime(&bt->meta.created_at));
111537ee45a1Smartinh 	DPRINTF("depth: %u", bt->meta.depth);
111637ee45a1Smartinh 	DPRINTF("entries: %llu", bt->meta.entries);
111737ee45a1Smartinh 	DPRINTF("revisions: %u", bt->meta.revisions);
111837ee45a1Smartinh 	DPRINTF("branch pages: %u", bt->meta.branch_pages);
111937ee45a1Smartinh 	DPRINTF("leaf pages: %u", bt->meta.leaf_pages);
112037ee45a1Smartinh 	DPRINTF("overflow pages: %u", bt->meta.overflow_pages);
112137ee45a1Smartinh 	DPRINTF("root: %u", bt->meta.root);
112237ee45a1Smartinh 	DPRINTF("previous meta page: %u", bt->meta.prev_meta);
11235d465952Smartinh 
11245d465952Smartinh 	return bt;
11255d465952Smartinh 
11265d465952Smartinh fail:
11275d465952Smartinh 	free(bt->lru_queue);
11285d465952Smartinh 	free(bt->page_cache);
11295d465952Smartinh 	free(bt);
11305d465952Smartinh 	return NULL;
11315d465952Smartinh }
11325d465952Smartinh 
11335d465952Smartinh struct btree *
1134176c8cbcSmartinh btree_open(const char *path, unsigned int flags, mode_t mode)
11355d465952Smartinh {
11365d465952Smartinh 	int		 fd, oflags;
11375d465952Smartinh 	struct btree	*bt;
11385d465952Smartinh 
11395d465952Smartinh 	if (F_ISSET(flags, BT_RDONLY))
11405d465952Smartinh 		oflags = O_RDONLY;
11415d465952Smartinh 	else
11425d465952Smartinh 		oflags = O_RDWR | O_CREAT | O_APPEND;
11435d465952Smartinh 
11445d465952Smartinh 	if ((fd = open(path, oflags, mode)) == -1)
11455d465952Smartinh 		return NULL;
11465d465952Smartinh 
11475d465952Smartinh 	if ((bt = btree_open_fd(fd, flags)) == NULL)
11485d465952Smartinh 		close(fd);
11495d465952Smartinh 	else {
11505d465952Smartinh 		bt->path = strdup(path);
11515d465952Smartinh 		DPRINTF("opened btree %p", bt);
11525d465952Smartinh 	}
11535d465952Smartinh 
11545d465952Smartinh 	return bt;
11555d465952Smartinh }
11565d465952Smartinh 
1157408be415Smartinh static void
1158408be415Smartinh btree_ref(struct btree *bt)
1159408be415Smartinh {
1160408be415Smartinh 	bt->ref++;
1161408be415Smartinh 	DPRINTF("ref is now %d on btree %p", bt->ref, bt);
1162408be415Smartinh }
1163408be415Smartinh 
11645d465952Smartinh void
11655d465952Smartinh btree_close(struct btree *bt)
11665d465952Smartinh {
1167e14f8e24Smartinh 	if (bt == NULL)
1168e14f8e24Smartinh 		return;
11695d465952Smartinh 
1170e14f8e24Smartinh 	if (--bt->ref == 0) {
11715d465952Smartinh 		DPRINTF("ref is zero, closing btree %p", bt);
11725d465952Smartinh 		close(bt->fd);
1173e14f8e24Smartinh 		mpage_flush(bt);
11745d465952Smartinh 		free(bt->page_cache);
11755d465952Smartinh 		free(bt);
1176e14f8e24Smartinh 	} else
1177408be415Smartinh 		DPRINTF("ref is now %d on btree %p", bt->ref, bt);
11785d465952Smartinh }
11795d465952Smartinh 
11805d465952Smartinh /* Search for key within a leaf page, using binary search.
11815d465952Smartinh  * Returns the smallest entry larger or equal to the key.
11825d465952Smartinh  * If exactp is non-null, stores whether the found entry was an exact match
11835d465952Smartinh  * in *exactp (1 or 0).
11845d465952Smartinh  * If kip is non-null, stores the index of the found entry in *kip.
11855d465952Smartinh  * If no entry larger of equal to the key is found, returns NULL.
11865d465952Smartinh  */
11875d465952Smartinh static struct node *
11885d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
11895d465952Smartinh     int *exactp, unsigned int *kip)
11905d465952Smartinh {
11915d465952Smartinh 	unsigned int	 i = 0;
11925d465952Smartinh 	int		 low, high;
11935d465952Smartinh 	int		 rc = 0;
11945d465952Smartinh 	struct node	*node;
11955d465952Smartinh 	struct btval	 nodekey;
11965d465952Smartinh 
11975d465952Smartinh 	DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]",
11985d465952Smartinh 	    NUMKEYS(mp),
11995d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
12005d465952Smartinh 	    mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);
12015d465952Smartinh 
12025d465952Smartinh 	assert(NUMKEYS(mp) > 0);
12035d465952Smartinh 
12045d465952Smartinh 	bzero(&nodekey, sizeof(nodekey));
12055d465952Smartinh 
12065d465952Smartinh 	low = IS_LEAF(mp) ? 0 : 1;
12075d465952Smartinh 	high = NUMKEYS(mp) - 1;
12085d465952Smartinh 	while (low <= high) {
12095d465952Smartinh 		i = (low + high) >> 1;
12105d465952Smartinh 		node = NODEPTR(mp, i);
12115d465952Smartinh 
12125d465952Smartinh 		nodekey.size = node->ksize;
12135d465952Smartinh 		nodekey.data = NODEKEY(node);
12145d465952Smartinh 
12155d465952Smartinh 		if (bt->cmp)
12165d465952Smartinh 			rc = bt->cmp(key, &nodekey);
12175d465952Smartinh 		else
12185d465952Smartinh 			rc = bt_cmp(bt, key, &nodekey, &mp->prefix);
12195d465952Smartinh 
12205d465952Smartinh 		if (IS_LEAF(mp))
12215d465952Smartinh 			DPRINTF("found leaf index %u [%.*s], rc = %i",
12225d465952Smartinh 			    i, (int)nodekey.size, (char *)nodekey.data, rc);
12235d465952Smartinh 		else
12245d465952Smartinh 			DPRINTF("found branch index %u [%.*s -> %u], rc = %i",
12255d465952Smartinh 			    i, (int)node->ksize, (char *)NODEKEY(node),
12265d465952Smartinh 			    node->n_pgno, rc);
12275d465952Smartinh 
12285d465952Smartinh 		if (rc == 0)
12295d465952Smartinh 			break;
12305d465952Smartinh 		if (rc > 0)
12315d465952Smartinh 			low = i + 1;
12325d465952Smartinh 		else
12335d465952Smartinh 			high = i - 1;
12345d465952Smartinh 	}
12355d465952Smartinh 
12365d465952Smartinh 	if (rc > 0) {	/* Found entry is less than the key. */
12375d465952Smartinh 		i++;	/* Skip to get the smallest entry larger than key. */
12385d465952Smartinh 		if (i >= NUMKEYS(mp))
12395d465952Smartinh 			/* There is no entry larger or equal to the key. */
12405d465952Smartinh 			return NULL;
12415d465952Smartinh 	}
12425d465952Smartinh 	if (exactp)
12435d465952Smartinh 		*exactp = (rc == 0);
12445d465952Smartinh 	if (kip)	/* Store the key index if requested. */
12455d465952Smartinh 		*kip = i;
12465d465952Smartinh 
12475d465952Smartinh 	return NODEPTR(mp, i);
12485d465952Smartinh }
12495d465952Smartinh 
12505d465952Smartinh static void
12515d465952Smartinh cursor_pop_page(struct cursor *cursor)
12525d465952Smartinh {
12535d465952Smartinh 	struct ppage	*top;
12545d465952Smartinh 
12555d465952Smartinh 	top = CURSOR_TOP(cursor);
12565d465952Smartinh 	CURSOR_POP(cursor);
12575d465952Smartinh 	top->mpage->ref--;
12585d465952Smartinh 
12595d465952Smartinh 	DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);
12605d465952Smartinh 
12615d465952Smartinh 	free(top);
12625d465952Smartinh }
12635d465952Smartinh 
12645d465952Smartinh static struct ppage *
12655d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp)
12665d465952Smartinh {
12675d465952Smartinh 	struct ppage	*ppage;
12685d465952Smartinh 
12695d465952Smartinh 	DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);
12705d465952Smartinh 
12715d465952Smartinh 	if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
12725d465952Smartinh 		return NULL;
12735d465952Smartinh 	ppage->mpage = mp;
12745d465952Smartinh 	mp->ref++;
12755d465952Smartinh 	CURSOR_PUSH(cursor, ppage);
12765d465952Smartinh 	return ppage;
12775d465952Smartinh }
12785d465952Smartinh 
127937ee45a1Smartinh static struct mpage *
128037ee45a1Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno)
12815d465952Smartinh {
12825d465952Smartinh 	int		 rc;
12835d465952Smartinh 	struct mpage	*mp;
12845d465952Smartinh 
12855d465952Smartinh 	mp = mpage_lookup(bt, pgno);
12865d465952Smartinh 	if (mp == NULL) {
12875d465952Smartinh 		if ((mp = calloc(1, sizeof(*mp))) == NULL)
128837ee45a1Smartinh 			return NULL;
128937ee45a1Smartinh 		if ((mp->page = malloc(bt->head.psize)) == NULL) {
12905d465952Smartinh 			free(mp);
129137ee45a1Smartinh 			return NULL;
129237ee45a1Smartinh 		}
129337ee45a1Smartinh 		if ((rc = btree_read_page(bt, pgno, mp->page)) != BT_SUCCESS) {
129437ee45a1Smartinh 			mpage_free(mp);
129537ee45a1Smartinh 			return NULL;
12965d465952Smartinh 		}
12975d465952Smartinh 		mp->pgno = pgno;
12985d465952Smartinh 		mpage_add(bt, mp);
12995d465952Smartinh 	} else
13005d465952Smartinh 		DPRINTF("returning page %u from cache", pgno);
13015d465952Smartinh 
130237ee45a1Smartinh 	return mp;
13035d465952Smartinh }
13045d465952Smartinh 
13055d465952Smartinh static void
13065d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
13075d465952Smartinh     char *cs, size_t *cn)
13085d465952Smartinh {
13095d465952Smartinh 	assert(*cn >= n1 + n2);
13105d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
13115d465952Smartinh 		bcopy(s2, cs, n2);
13125d465952Smartinh 		bcopy(s1, cs + n2, n1);
13135d465952Smartinh 	} else {
13145d465952Smartinh 		bcopy(s1, cs, n1);
13155d465952Smartinh 		bcopy(s2, cs + n1, n2);
13165d465952Smartinh 	}
13175d465952Smartinh 	*cn = n1 + n2;
13185d465952Smartinh }
13195d465952Smartinh 
13205d465952Smartinh static void
13215d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp)
13225d465952Smartinh {
13235d465952Smartinh 	indx_t			 lbound = 0, ubound = 0;
13245d465952Smartinh 	struct mpage		*lp, *up;
13255d465952Smartinh 	struct btkey		 lprefix, uprefix;
13265d465952Smartinh 
13275d465952Smartinh 	mp->prefix.len = 0;
13285d465952Smartinh 	if (bt->cmp != NULL)
13295d465952Smartinh 		return;
13305d465952Smartinh 
13315d465952Smartinh 	lp = mp;
13325d465952Smartinh 	while (lp->parent != NULL) {
13335d465952Smartinh 		if (lp->parent_index > 0) {
13345d465952Smartinh 			lbound = lp->parent_index;
13355d465952Smartinh 			break;
13365d465952Smartinh 		}
13375d465952Smartinh 		lp = lp->parent;
13385d465952Smartinh 	}
13395d465952Smartinh 
13405d465952Smartinh 	up = mp;
13415d465952Smartinh 	while (up->parent != NULL) {
13425d465952Smartinh 		if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
13435d465952Smartinh 			ubound = up->parent_index + 1;
13445d465952Smartinh 			break;
13455d465952Smartinh 		}
13465d465952Smartinh 		up = up->parent;
13475d465952Smartinh 	}
13485d465952Smartinh 
13495d465952Smartinh 	if (lp->parent != NULL && up->parent != NULL) {
13505d465952Smartinh 		expand_prefix(bt, lp->parent, lbound, &lprefix);
13515d465952Smartinh 		expand_prefix(bt, up->parent, ubound, &uprefix);
13525d465952Smartinh 		common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
13535d465952Smartinh 	}
13545d465952Smartinh 	else if (mp->parent)
13555d465952Smartinh 		bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));
13565d465952Smartinh 
13575d465952Smartinh 	DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
13585d465952Smartinh 	    (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
13595d465952Smartinh }
13605d465952Smartinh 
13615d465952Smartinh static int
13625d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
13635d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
13645d465952Smartinh {
13655d465952Smartinh 	struct mpage	*mp, *parent;
13665d465952Smartinh 
13675d465952Smartinh 	if (cursor && cursor_push_page(cursor, root) == NULL)
13685d465952Smartinh 		return BT_FAIL;
13695d465952Smartinh 
13705d465952Smartinh 	mp = root;
13715d465952Smartinh 	while (IS_BRANCH(mp)) {
13725d465952Smartinh 		unsigned int	 i = 0;
13735d465952Smartinh 		struct node	*node;
13745d465952Smartinh 
13755d465952Smartinh 		DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
13765d465952Smartinh 		assert(NUMKEYS(mp) > 1);
13775d465952Smartinh 		node = NODEPTR(mp, 0);
13785d465952Smartinh 		DPRINTF("found index 0 to page %u", NODEPGNO(node));
13795d465952Smartinh 
13805d465952Smartinh 		if (key == NULL)	/* Initialize cursor to first page. */
13815d465952Smartinh 			i = 0;
13825d465952Smartinh 		else {
13835d465952Smartinh 			int	 exact;
13845d465952Smartinh 			node = btree_search_node(bt, mp, key, &exact, &i);
13855d465952Smartinh 			if (node == NULL)
13865d465952Smartinh 				i = NUMKEYS(mp) - 1;
13875d465952Smartinh 			else if (!exact) {
13885d465952Smartinh 				assert(i > 0);
13895d465952Smartinh 				i--;
13905d465952Smartinh 			}
13915d465952Smartinh 		}
13925d465952Smartinh 
13935d465952Smartinh 		if (key)
13945d465952Smartinh 			DPRINTF("following index %u for key %.*s",
13955d465952Smartinh 			    i, (int)key->size, (char *)key->data);
13965d465952Smartinh 		assert(i >= 0 && i < NUMKEYS(mp));
13975d465952Smartinh 		node = NODEPTR(mp, i);
13985d465952Smartinh 
13995d465952Smartinh 		if (cursor)
14005d465952Smartinh 			CURSOR_TOP(cursor)->ki = i;
14015d465952Smartinh 
14025d465952Smartinh 		parent = mp;
140337ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
14045d465952Smartinh 			return BT_FAIL;
14055d465952Smartinh 		mp->parent = parent;
14065d465952Smartinh 		mp->parent_index = i;
14075d465952Smartinh 		find_common_prefix(bt, mp);
14085d465952Smartinh 
14095d465952Smartinh 		if (cursor && cursor_push_page(cursor, mp) == NULL)
14105d465952Smartinh 			return BT_FAIL;
14115d465952Smartinh 
14125d465952Smartinh 		if (modify)
14135d465952Smartinh 			mpage_touch(bt, mp);
14145d465952Smartinh 	}
14155d465952Smartinh 
14165d465952Smartinh 	if (!IS_LEAF(mp)) {
14175d465952Smartinh 		DPRINTF("internal error, index points to a %02X page!?",
141837ee45a1Smartinh 		    mp->page->flags);
14195d465952Smartinh 		return BT_FAIL;
14205d465952Smartinh 	}
14215d465952Smartinh 
14225d465952Smartinh 	DPRINTF("found leaf page %u for key %.*s", mp->pgno,
14235d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL);
14245d465952Smartinh 
14255d465952Smartinh 	*mpp = mp;
14265d465952Smartinh 	return BT_SUCCESS;
14275d465952Smartinh }
14285d465952Smartinh 
14295d465952Smartinh /* Search for the page a given key should be in.
14305d465952Smartinh  * Stores a pointer to the found page in *mpp.
14315d465952Smartinh  * If key is NULL, search for the lowest page (used by btree_cursor_first).
14325d465952Smartinh  * If cursor is non-null, pushes parent pages on the cursor stack.
14335d465952Smartinh  * If modify is true, visited pages are updated with new page numbers.
14345d465952Smartinh  */
14355d465952Smartinh static int
14365d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
14375d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
14385d465952Smartinh {
14395d465952Smartinh 	int		 rc;
14405d465952Smartinh 	pgno_t		 root;
14415d465952Smartinh 	struct mpage	*mp;
14425d465952Smartinh 
14435d465952Smartinh 	/* Can't modify pages outside a transaction. */
14445d465952Smartinh 	if (txn == NULL && modify) {
14455d465952Smartinh 		errno = EINVAL;
14465d465952Smartinh 		return BT_FAIL;
14475d465952Smartinh 	}
14485d465952Smartinh 
14495d465952Smartinh 	/* Choose which root page to start with. If a transaction is given
14505d465952Smartinh          * use the root page from the transaction, otherwise read the last
14515d465952Smartinh          * committed root page.
14525d465952Smartinh 	 */
14535d465952Smartinh 	if (txn == NULL) {
14545d465952Smartinh 		if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)
14555d465952Smartinh 			return rc;
145637ee45a1Smartinh 		root = bt->meta.root;
14575d465952Smartinh 	} else if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
14585d465952Smartinh 		DPRINTF("transaction has failed, must abort");
14590279e526Smartinh 		errno = EINVAL;
14605d465952Smartinh 		return BT_FAIL;
14615d465952Smartinh 	} else
14625d465952Smartinh 		root = txn->root;
14635d465952Smartinh 
14640279e526Smartinh 	if (root == P_INVALID) {		/* Tree is empty. */
146537ee45a1Smartinh 		DPRINTF("tree is empty");
14660279e526Smartinh 		errno = ENOENT;
14670279e526Smartinh 		return BT_FAIL;
14680279e526Smartinh 	}
14695d465952Smartinh 
147037ee45a1Smartinh 	if ((mp = btree_get_mpage(bt, root)) == NULL)
147137ee45a1Smartinh 		return BT_FAIL;
147237ee45a1Smartinh 
147337ee45a1Smartinh 	DPRINTF("root page has flags 0x%X", mp->page->flags);
14745d465952Smartinh 
14755d465952Smartinh 	assert(mp->parent == NULL);
14765d465952Smartinh 	assert(mp->prefix.len == 0);
14775d465952Smartinh 
14785d465952Smartinh 	if (modify && !mp->dirty) {
14795d465952Smartinh 		mpage_touch(bt, mp);
14805d465952Smartinh 		txn->root = mp->pgno;
14815d465952Smartinh 	}
14825d465952Smartinh 
14835d465952Smartinh 	return btree_search_page_root(bt, mp, key, cursor, modify, mpp);
14845d465952Smartinh }
14855d465952Smartinh 
14865d465952Smartinh static int
14875d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
14885d465952Smartinh     struct btval *data)
14895d465952Smartinh {
149037ee45a1Smartinh 	struct mpage	*omp;		/* overflow mpage */
14915d465952Smartinh 	size_t		 psz;
14925d465952Smartinh 	size_t		 max;
14935d465952Smartinh 	size_t		 sz = 0;
14945d465952Smartinh 	pgno_t		 pgno;
14955d465952Smartinh 
14965d465952Smartinh 	bzero(data, sizeof(*data));
149737ee45a1Smartinh 	max = bt->head.psize - PAGEHDRSZ;
14985d465952Smartinh 
14995d465952Smartinh 	if (!F_ISSET(leaf->flags, F_BIGDATA)) {
15005d465952Smartinh 		data->size = leaf->n_dsize;
15015d465952Smartinh 		if (data->size > 0) {
15025d465952Smartinh 			if (mp == NULL) {
15035d465952Smartinh 				if ((data->data = malloc(data->size)) == NULL)
15045d465952Smartinh 					return BT_FAIL;
15055d465952Smartinh 				bcopy(NODEDATA(leaf), data->data, data->size);
15065d465952Smartinh 				data->free_data = 1;
15075d465952Smartinh 				data->mp = NULL;
15085d465952Smartinh 			} else {
15095d465952Smartinh 				data->data = NODEDATA(leaf);
15105d465952Smartinh 				data->free_data = 0;
15115d465952Smartinh 				data->mp = mp;
15125d465952Smartinh 				mp->ref++;
15135d465952Smartinh 			}
15145d465952Smartinh 		}
15155d465952Smartinh 		return BT_SUCCESS;
15165d465952Smartinh 	}
15175d465952Smartinh 
15185d465952Smartinh 	/* Read overflow data.
15195d465952Smartinh 	 */
15205d465952Smartinh 	DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
15215d465952Smartinh 	if ((data->data = malloc(leaf->n_dsize)) == NULL)
15225d465952Smartinh 		return BT_FAIL;
15235d465952Smartinh 	data->size = leaf->n_dsize;
15245d465952Smartinh 	data->free_data = 1;
15255d465952Smartinh 	data->mp = NULL;
15265d465952Smartinh 	pgno = *(pgno_t *)NODEDATA(leaf);	/* XXX: alignment? */
15275d465952Smartinh 	for (sz = 0; sz < data->size; ) {
152837ee45a1Smartinh 		if ((omp = btree_get_mpage(bt, pgno)) == NULL ||
152937ee45a1Smartinh 		    !F_ISSET(omp->page->flags, P_OVERFLOW)) {
153037ee45a1Smartinh 			DPRINTF("read overflow page failed (%02x)", omp->page->flags);
15315d465952Smartinh 			free(data->data);
153237ee45a1Smartinh 			mpage_free(omp);
15335d465952Smartinh 			return BT_FAIL;
15345d465952Smartinh 		}
15355d465952Smartinh 		psz = data->size - sz;
15365d465952Smartinh 		if (psz > max)
15375d465952Smartinh 			psz = max;
153837ee45a1Smartinh 		bcopy(omp->page->ptrs, (char *)data->data + sz, psz);
15395d465952Smartinh 		sz += psz;
154037ee45a1Smartinh 		pgno = omp->page->p_next_pgno;
15415d465952Smartinh 	}
15425d465952Smartinh 
15435d465952Smartinh 	return BT_SUCCESS;
15445d465952Smartinh }
15455d465952Smartinh 
15465d465952Smartinh int
15475d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn,
15485d465952Smartinh     struct btval *key, struct btval *data)
15495d465952Smartinh {
15505d465952Smartinh 	int		 rc, exact;
15515d465952Smartinh 	struct node	*leaf;
15525d465952Smartinh 	struct mpage	*mp;
15535d465952Smartinh 
15545d465952Smartinh 	assert(key);
15555d465952Smartinh 	assert(data);
15565d465952Smartinh 	DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);
15575d465952Smartinh 
155859916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
155959916c99Smartinh 		errno = EINVAL;
156059916c99Smartinh 		return BT_FAIL;
156159916c99Smartinh 	}
156259916c99Smartinh 
156359916c99Smartinh 	if (bt == NULL) {
156459916c99Smartinh 		if (txn == NULL) {
156559916c99Smartinh 			errno = EINVAL;
156659916c99Smartinh 			return BT_FAIL;
156759916c99Smartinh 		}
156859916c99Smartinh 		bt = txn->bt;
156959916c99Smartinh 	}
157059916c99Smartinh 
15715d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
15725d465952Smartinh 		errno = EINVAL;
15735d465952Smartinh 		return BT_FAIL;
15745d465952Smartinh 	}
15755d465952Smartinh 
15765d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)
15775d465952Smartinh 		return rc;
15785d465952Smartinh 
15795d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, NULL);
15805d465952Smartinh 	if (leaf && exact)
15815d465952Smartinh 		rc = btree_read_data(bt, mp, leaf, data);
15820279e526Smartinh 	else {
15830279e526Smartinh 		errno = ENOENT;
15840279e526Smartinh 		rc = BT_FAIL;
15850279e526Smartinh 	}
15865d465952Smartinh 
15875d465952Smartinh 	mpage_prune(bt);
15885d465952Smartinh 	return rc;
15895d465952Smartinh }
15905d465952Smartinh 
15915d465952Smartinh static int
15925d465952Smartinh btree_sibling(struct cursor *cursor, int move_right)
15935d465952Smartinh {
15945d465952Smartinh 	int		 rc;
15955d465952Smartinh 	struct node	*indx;
15965d465952Smartinh 	struct ppage	*parent, *top;
15975d465952Smartinh 	struct mpage	*mp;
15985d465952Smartinh 
15995d465952Smartinh 	top = CURSOR_TOP(cursor);
16000279e526Smartinh 	if ((parent = SLIST_NEXT(top, entry)) == NULL) {
16010279e526Smartinh 		errno = ENOENT;
16020279e526Smartinh 		return BT_FAIL;			/* root has no siblings */
16030279e526Smartinh 	}
16045d465952Smartinh 
16055d465952Smartinh 	DPRINTF("parent page is page %u, index %u",
16065d465952Smartinh 	    parent->mpage->pgno, parent->ki);
16075d465952Smartinh 
16085d465952Smartinh 	cursor_pop_page(cursor);
16095d465952Smartinh 	if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
16105d465952Smartinh 		       : (parent->ki == 0)) {
16115d465952Smartinh 		DPRINTF("no more keys left, moving to %s sibling",
16125d465952Smartinh 		    move_right ? "right" : "left");
16135d465952Smartinh 		if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)
16145d465952Smartinh 			return rc;
16155d465952Smartinh 		parent = CURSOR_TOP(cursor);
16165d465952Smartinh 	} else {
16175d465952Smartinh 		if (move_right)
16185d465952Smartinh 			parent->ki++;
16195d465952Smartinh 		else
16205d465952Smartinh 			parent->ki--;
16215d465952Smartinh 		DPRINTF("just moving to %s index key %u",
16225d465952Smartinh 		    move_right ? "right" : "left", parent->ki);
16235d465952Smartinh 	}
16245d465952Smartinh 	assert(IS_BRANCH(parent->mpage));
16255d465952Smartinh 
16265d465952Smartinh 	indx = NODEPTR(parent->mpage, parent->ki);
162737ee45a1Smartinh 	if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL)
16285d465952Smartinh 		return BT_FAIL;
16295d465952Smartinh 	mp->parent = parent->mpage;
16305d465952Smartinh 	mp->parent_index = parent->ki;
16315d465952Smartinh 
16325d465952Smartinh 	cursor_push_page(cursor, mp);
16335d465952Smartinh 	find_common_prefix(cursor->bt, mp);
16345d465952Smartinh 
16355d465952Smartinh 	return BT_SUCCESS;
16365d465952Smartinh }
16375d465952Smartinh 
16385d465952Smartinh static int
16395d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,
16405d465952Smartinh     struct btval *key)
16415d465952Smartinh {
16425d465952Smartinh 	if (key == NULL)
16435d465952Smartinh 		return 0;
16445d465952Smartinh 
16455d465952Smartinh 	if (mp->prefix.len > 0) {
16465d465952Smartinh 		key->size = node->ksize + mp->prefix.len;
16475d465952Smartinh 		key->data = malloc(key->size);
16485d465952Smartinh 		if (key->data == NULL)
16495d465952Smartinh 			return -1;
16505d465952Smartinh 		concat_prefix(bt,
16515d465952Smartinh 		    mp->prefix.str, mp->prefix.len,
16525d465952Smartinh 		    NODEKEY(node), node->ksize,
16535d465952Smartinh 		    key->data, &key->size);
16545d465952Smartinh 		key->free_data = 1;
16555d465952Smartinh 	} else {
16565d465952Smartinh 		key->size = node->ksize;
16575d465952Smartinh 		key->data = NODEKEY(node);
16585d465952Smartinh 		key->free_data = 0;
16595d465952Smartinh 		key->mp = mp;
16605d465952Smartinh 		mp->ref++;
16615d465952Smartinh 	}
16625d465952Smartinh 
16635d465952Smartinh 	return 0;
16645d465952Smartinh }
16655d465952Smartinh 
16665d465952Smartinh static int
16675d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)
16685d465952Smartinh {
16695d465952Smartinh 	struct ppage	*top;
16705d465952Smartinh 	struct mpage	*mp;
16715d465952Smartinh 	struct node	*leaf;
16725d465952Smartinh 
16730279e526Smartinh 	if (cursor->eof) {
16740279e526Smartinh 		errno = ENOENT;
16750279e526Smartinh 		return BT_FAIL;
16760279e526Smartinh 	}
16775d465952Smartinh 
16785d465952Smartinh 	assert(cursor->initialized);
16795d465952Smartinh 
16805d465952Smartinh 	top = CURSOR_TOP(cursor);
16815d465952Smartinh 	mp = top->mpage;
16825d465952Smartinh 
16835d465952Smartinh 	DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor);
16845d465952Smartinh 
16855d465952Smartinh 	if (top->ki + 1 >= NUMKEYS(mp)) {
16865d465952Smartinh 		DPRINTF("=====> move to next sibling page");
16875d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS) {
16885d465952Smartinh 			cursor->eof = 1;
16890279e526Smartinh 			return BT_FAIL;
16905d465952Smartinh 		}
16915d465952Smartinh 		top = CURSOR_TOP(cursor);
16925d465952Smartinh 		mp = top->mpage;
16935d465952Smartinh 		DPRINTF("next page is %u, key index %u", mp->pgno, top->ki);
16945d465952Smartinh 	} else
16955d465952Smartinh 		top->ki++;
16965d465952Smartinh 
16975d465952Smartinh 	DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
16985d465952Smartinh 	    mp->pgno, NUMKEYS(mp), top->ki);
16995d465952Smartinh 
17005d465952Smartinh 	assert(IS_LEAF(mp));
17015d465952Smartinh 	leaf = NODEPTR(mp, top->ki);
17025d465952Smartinh 
17035d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17045d465952Smartinh 		return BT_FAIL;
17055d465952Smartinh 
17065d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17075d465952Smartinh 		return BT_FAIL;
17085d465952Smartinh 
17095d465952Smartinh 	return BT_SUCCESS;
17105d465952Smartinh }
17115d465952Smartinh 
17125d465952Smartinh static int
17135d465952Smartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data)
17145d465952Smartinh {
17155d465952Smartinh 	int		 rc;
17165d465952Smartinh 	struct node	*leaf;
17175d465952Smartinh 	struct mpage	*mp;
17185d465952Smartinh 	struct ppage	*top;
17195d465952Smartinh 
17205d465952Smartinh 	assert(cursor);
17215d465952Smartinh 	assert(key);
17225d465952Smartinh 	assert(key->size > 0);
17235d465952Smartinh 
17245d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);
17255d465952Smartinh 	if (rc != BT_SUCCESS)
17265d465952Smartinh 		return rc;
17275d465952Smartinh 	assert(IS_LEAF(mp));
17285d465952Smartinh 
17295d465952Smartinh 	top = CURSOR_TOP(cursor);
17305d465952Smartinh 	leaf = btree_search_node(cursor->bt, mp, key, NULL, &top->ki);
17315d465952Smartinh 	if (leaf == NULL) {
17325d465952Smartinh 		DPRINTF("===> inexact leaf not found, goto sibling");
17335d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS)
17340279e526Smartinh 			return BT_FAIL;		/* no entries matched */
17355d465952Smartinh 		top = CURSOR_TOP(cursor);
17365d465952Smartinh 		top->ki = 0;
17375d465952Smartinh 		mp = top->mpage;
17385d465952Smartinh 		assert(IS_LEAF(mp));
17395d465952Smartinh 		leaf = NODEPTR(mp, 0);
17405d465952Smartinh 	}
17415d465952Smartinh 
17425d465952Smartinh 	cursor->initialized = 1;
17435d465952Smartinh 	cursor->eof = 0;
17445d465952Smartinh 
17455d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17465d465952Smartinh 		return BT_FAIL;
17475d465952Smartinh 
17485d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17495d465952Smartinh 		return BT_FAIL;
17505d465952Smartinh 	DPRINTF("==> cursor placed on key %.*s",
17515d465952Smartinh 	    (int)key->size, (char *)key->data);
17525d465952Smartinh 
17535d465952Smartinh 	return BT_SUCCESS;
17545d465952Smartinh }
17555d465952Smartinh 
17565d465952Smartinh static int
17575d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
17585d465952Smartinh {
17595d465952Smartinh 	int		 rc;
17605d465952Smartinh 	struct mpage	*mp;
17615d465952Smartinh 	struct node	*leaf;
17625d465952Smartinh 
17635d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
17645d465952Smartinh 	if (rc != BT_SUCCESS)
17655d465952Smartinh 		return rc;
17665d465952Smartinh 	assert(IS_LEAF(mp));
17675d465952Smartinh 
17685d465952Smartinh 	leaf = NODEPTR(mp, 0);
17695d465952Smartinh 	cursor->initialized = 1;
17705d465952Smartinh 	cursor->eof = 0;
17715d465952Smartinh 
17725d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17735d465952Smartinh 		return BT_FAIL;
17745d465952Smartinh 
17755d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17765d465952Smartinh 		return BT_FAIL;
17775d465952Smartinh 
17785d465952Smartinh 	return BT_SUCCESS;
17795d465952Smartinh }
17805d465952Smartinh 
17815d465952Smartinh int
17825d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
17835d465952Smartinh     enum cursor_op op)
17845d465952Smartinh {
17855d465952Smartinh 	int		 rc;
17865d465952Smartinh 
17875d465952Smartinh 	assert(cursor);
17885d465952Smartinh 
17895d465952Smartinh 	switch (op) {
17905d465952Smartinh 	case BT_CURSOR:
17915d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
17925d465952Smartinh 			cursor_pop_page(cursor);
17935d465952Smartinh 		if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
17945d465952Smartinh 			errno = EINVAL;
17955d465952Smartinh 			rc = BT_FAIL;
17965d465952Smartinh 		} else
17975d465952Smartinh 			rc = btree_cursor_set(cursor, key, data);
17985d465952Smartinh 		break;
17995d465952Smartinh 	case BT_NEXT:
18005d465952Smartinh 		if (!cursor->initialized)
18015d465952Smartinh 			rc = btree_cursor_first(cursor, key, data);
18025d465952Smartinh 		else
18035d465952Smartinh 			rc = btree_cursor_next(cursor, key, data);
18045d465952Smartinh 		break;
18055d465952Smartinh 	case BT_FIRST:
18065d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
18075d465952Smartinh 			cursor_pop_page(cursor);
18085d465952Smartinh 		rc = btree_cursor_first(cursor, key, data);
18095d465952Smartinh 		break;
18105d465952Smartinh 	default:
18115d465952Smartinh 		DPRINTF("unhandled/unimplemented cursor operation %u", op);
18125d465952Smartinh 		rc = BT_FAIL;
18135d465952Smartinh 		break;
18145d465952Smartinh 	}
18155d465952Smartinh 
18165d465952Smartinh 	mpage_prune(cursor->bt);
18175d465952Smartinh 
18185d465952Smartinh 	return rc;
18195d465952Smartinh }
18205d465952Smartinh 
18215d465952Smartinh static struct mpage *
18225d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags)
18235d465952Smartinh {
18245d465952Smartinh 	struct mpage	*mp;
18255d465952Smartinh 
18265d465952Smartinh 	assert(bt != NULL);
18275d465952Smartinh 	assert(bt->txn != NULL);
18285d465952Smartinh 
182937ee45a1Smartinh 	DPRINTF("allocating new mpage %u, page size %u",
183037ee45a1Smartinh 	    bt->txn->next_pgno, bt->head.psize);
18315d465952Smartinh 	if ((mp = calloc(1, sizeof(*mp))) == NULL)
18325d465952Smartinh 		return NULL;
183337ee45a1Smartinh 	if ((mp->page = malloc(bt->head.psize)) == NULL) {
183437ee45a1Smartinh 		free(mp);
183537ee45a1Smartinh 		return NULL;
183637ee45a1Smartinh 	}
183737ee45a1Smartinh 	mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
183837ee45a1Smartinh 	mp->page->flags = flags;
183937ee45a1Smartinh 	mp->page->lower = PAGEHDRSZ;
184037ee45a1Smartinh 	mp->page->upper = bt->head.psize;
18415d465952Smartinh 
18425d465952Smartinh 	if (IS_BRANCH(mp))
184337ee45a1Smartinh 		bt->meta.branch_pages++;
18445d465952Smartinh 	else if (IS_LEAF(mp))
184537ee45a1Smartinh 		bt->meta.leaf_pages++;
18465d465952Smartinh 	else if (IS_OVERFLOW(mp))
184737ee45a1Smartinh 		bt->meta.overflow_pages++;
18485d465952Smartinh 
18495d465952Smartinh 	mpage_add(bt, mp);
18505d465952Smartinh 	mpage_dirty(bt, mp);
18515d465952Smartinh 
18525d465952Smartinh 	return mp;
18535d465952Smartinh }
18545d465952Smartinh 
18555d465952Smartinh static size_t
185637ee45a1Smartinh bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data)
18575d465952Smartinh {
18585d465952Smartinh 	size_t		 sz;
18595d465952Smartinh 
18605d465952Smartinh 	sz = LEAFSIZE(key, data);
186137ee45a1Smartinh 	if (data->size >= bt->head.psize / BT_MINKEYS) {
18625d465952Smartinh 		/* put on overflow page */
18635d465952Smartinh 		sz -= data->size - sizeof(pgno_t);
18645d465952Smartinh 	}
18655d465952Smartinh 
18665d465952Smartinh 	return sz + sizeof(indx_t);
18675d465952Smartinh }
18685d465952Smartinh 
18695d465952Smartinh static size_t
187037ee45a1Smartinh bt_branch_size(struct btree *bt, struct btval *key)
18715d465952Smartinh {
18725d465952Smartinh 	size_t		 sz;
18735d465952Smartinh 
18745d465952Smartinh 	sz = INDXSIZE(key);
187537ee45a1Smartinh 	if (sz >= bt->head.psize / BT_MINKEYS) {
18765d465952Smartinh 		/* put on overflow page */
18775d465952Smartinh 		/* not implemented */
18785d465952Smartinh 		/* sz -= key->size - sizeof(pgno_t); */
18795d465952Smartinh 	}
18805d465952Smartinh 
18815d465952Smartinh 	return sz + sizeof(indx_t);
18825d465952Smartinh }
18835d465952Smartinh 
18845d465952Smartinh static int
18855d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
18865d465952Smartinh {
18875d465952Smartinh 	size_t		 done = 0;
18885d465952Smartinh 	size_t		 sz;
18895d465952Smartinh 	size_t		 max;
18905d465952Smartinh 	pgno_t		*linkp;			/* linked page stored here */
18915d465952Smartinh 	struct mpage	*next = NULL;
18925d465952Smartinh 
189337ee45a1Smartinh 	max = bt->head.psize - PAGEHDRSZ;
18945d465952Smartinh 
18955d465952Smartinh 	while (done < data->size) {
18965d465952Smartinh 		linkp = &p->p_next_pgno;
18975d465952Smartinh 		if (data->size - done > max) {
18985d465952Smartinh 			/* need another overflow page */
18995d465952Smartinh 			if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)
19005d465952Smartinh 				return BT_FAIL;
19015d465952Smartinh 			*linkp = next->pgno;
19025d465952Smartinh 			DPRINTF("linking overflow page %u", next->pgno);
19035d465952Smartinh 		} else
19045d465952Smartinh 			*linkp = 0;		/* indicates end of list */
19055d465952Smartinh 		sz = data->size - done;
19065d465952Smartinh 		if (sz > max)
19075d465952Smartinh 			sz = max;
19085d465952Smartinh 		DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno);
19095d465952Smartinh 		bcopy((char *)data->data + done, p->ptrs, sz);
19105d465952Smartinh 		done += sz;
191137ee45a1Smartinh 		p = next->page;
19125d465952Smartinh 	}
19135d465952Smartinh 
19145d465952Smartinh 	return BT_SUCCESS;
19155d465952Smartinh }
19165d465952Smartinh 
19175d465952Smartinh /* Key prefix should already be stripped.
19185d465952Smartinh  */
19195d465952Smartinh static int
19205d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
19215d465952Smartinh     struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
19225d465952Smartinh {
19235d465952Smartinh 	unsigned int	 i;
19245d465952Smartinh 	size_t		 node_size = NODESIZE;
19255d465952Smartinh 	indx_t		 ofs;
19265d465952Smartinh 	struct node	*node;
19275d465952Smartinh 	struct page	*p;
19285d465952Smartinh 	struct mpage	*ofp = NULL;		/* overflow page */
19295d465952Smartinh 
193037ee45a1Smartinh 	p = mp->page;
19315d465952Smartinh 	assert(p->upper >= p->lower);
19325d465952Smartinh 
193337ee45a1Smartinh 	DPRINTF("add node [%.*s] to %s page %u at index %i, key size %zu",
19345d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL,
19355d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
193637ee45a1Smartinh 	    mp->pgno, indx, key ? key->size : 0);
19375d465952Smartinh 
19385d465952Smartinh 	if (key != NULL)
19395d465952Smartinh 		node_size += key->size;
19405d465952Smartinh 
19415d465952Smartinh 	if (IS_LEAF(mp)) {
19425d465952Smartinh 		assert(data);
19435d465952Smartinh 		node_size += data->size;
19445d465952Smartinh 		if (F_ISSET(flags, F_BIGDATA)) {
19455d465952Smartinh 			/* Data already on overflow page. */
19465d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
194737ee45a1Smartinh 		} else if (data->size >= bt->head.psize / BT_MINKEYS) {
19485d465952Smartinh 			/* Put data on overflow page. */
19495d465952Smartinh 			DPRINTF("data size is %zu, put on overflow page",
19505d465952Smartinh 			    data->size);
19515d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
19525d465952Smartinh 			if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
19535d465952Smartinh 				return BT_FAIL;
19545d465952Smartinh 			DPRINTF("allocated overflow page %u", ofp->pgno);
19555d465952Smartinh 			flags |= F_BIGDATA;
19565d465952Smartinh 		}
19575d465952Smartinh 	}
19585d465952Smartinh 
19595d465952Smartinh 	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
19605d465952Smartinh 		DPRINTF("not enough room in page %u, got %lu ptrs",
19615d465952Smartinh 		    mp->pgno, NUMKEYS(mp));
19625d465952Smartinh 		DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
19635d465952Smartinh 		    p->upper - p->lower);
196437ee45a1Smartinh 		DPRINTF("node size = %zu", node_size);
19655d465952Smartinh 		return BT_FAIL;
19665d465952Smartinh 	}
19675d465952Smartinh 
19685d465952Smartinh 	/* Move higher pointers up one slot. */
19695d465952Smartinh 	for (i = NUMKEYS(mp); i > indx; i--)
19705d465952Smartinh 		p->ptrs[i] = p->ptrs[i - 1];
19715d465952Smartinh 
19725d465952Smartinh 	/* Adjust free space offsets. */
19735d465952Smartinh 	ofs = p->upper - node_size;
19745d465952Smartinh 	assert(ofs >= p->lower + sizeof(indx_t));
19755d465952Smartinh 	p->ptrs[indx] = ofs;
19765d465952Smartinh 	p->upper = ofs;
19775d465952Smartinh 	p->lower += sizeof(indx_t);
19785d465952Smartinh 
19795d465952Smartinh 	/* Write the node data. */
19805d465952Smartinh 	node = NODEPTR(mp, indx);
19815d465952Smartinh 	node->ksize = (key == NULL) ? 0 : key->size;
19825d465952Smartinh 	node->flags = flags;
19835d465952Smartinh 	if (IS_LEAF(mp))
19845d465952Smartinh 		node->n_dsize = data->size;
19855d465952Smartinh 	else
19865d465952Smartinh 		node->n_pgno = pgno;
19875d465952Smartinh 
19885d465952Smartinh 	if (key)
19895d465952Smartinh 		bcopy(key->data, NODEKEY(node), key->size);
19905d465952Smartinh 
19915d465952Smartinh 	if (IS_LEAF(mp)) {
19925d465952Smartinh 		if (ofp == NULL) {
19935d465952Smartinh 			if (F_ISSET(flags, F_BIGDATA))
19945d465952Smartinh 				bcopy(data->data, node->data + key->size,
19955d465952Smartinh 				    sizeof(pgno_t));
19965d465952Smartinh 			else
19975d465952Smartinh 				bcopy(data->data, node->data + key->size,
19985d465952Smartinh 				    data->size);
19995d465952Smartinh 		} else {
20005d465952Smartinh 			bcopy(&ofp->pgno, node->data + key->size,
20015d465952Smartinh 			    sizeof(pgno_t));
200237ee45a1Smartinh 			if (btree_write_overflow_data(bt, ofp->page,
20035d465952Smartinh 			    data) == BT_FAIL)
20045d465952Smartinh 				return BT_FAIL;
20055d465952Smartinh 		}
20065d465952Smartinh 	}
20075d465952Smartinh 
20085d465952Smartinh 	return BT_SUCCESS;
20095d465952Smartinh }
20105d465952Smartinh 
20115d465952Smartinh static void
20125d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx)
20135d465952Smartinh {
20145d465952Smartinh 	unsigned int	 sz;
20155d465952Smartinh 	indx_t		 i, j, numkeys, ptr;
20165d465952Smartinh 	struct node	*node;
20175d465952Smartinh 	char		*base;
20185d465952Smartinh 
20195d465952Smartinh 	DPRINTF("delete node %u on %s page %u", indx,
20205d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);
20215d465952Smartinh 	assert(indx < NUMKEYS(mp));
20225d465952Smartinh 
20235d465952Smartinh 	node = NODEPTR(mp, indx);
20245d465952Smartinh 	sz = NODESIZE + node->ksize;
20255d465952Smartinh 	if (IS_LEAF(mp)) {
20265d465952Smartinh 		if (F_ISSET(node->flags, F_BIGDATA))
20275d465952Smartinh 			sz += sizeof(pgno_t);
20285d465952Smartinh 		else
20295d465952Smartinh 			sz += NODEDSZ(node);
20305d465952Smartinh 	}
20315d465952Smartinh 
203237ee45a1Smartinh 	ptr = mp->page->ptrs[indx];
20335d465952Smartinh 	numkeys = NUMKEYS(mp);
20345d465952Smartinh 	for (i = j = 0; i < numkeys; i++) {
20355d465952Smartinh 		if (i != indx) {
203637ee45a1Smartinh 			mp->page->ptrs[j] = mp->page->ptrs[i];
203737ee45a1Smartinh 			if (mp->page->ptrs[i] < ptr)
203837ee45a1Smartinh 				mp->page->ptrs[j] += sz;
20395d465952Smartinh 			j++;
20405d465952Smartinh 		}
20415d465952Smartinh 	}
20425d465952Smartinh 
204337ee45a1Smartinh 	base = (char *)mp->page + mp->page->upper;
204437ee45a1Smartinh 	bcopy(base, base + sz, ptr - mp->page->upper);
20455d465952Smartinh 
204637ee45a1Smartinh 	mp->page->lower -= sizeof(indx_t);
204737ee45a1Smartinh 	mp->page->upper += sz;
20485d465952Smartinh }
20495d465952Smartinh 
20505d465952Smartinh struct cursor *
20515d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
20525d465952Smartinh {
20535d465952Smartinh 	struct cursor	*cursor;
20545d465952Smartinh 
205559916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
205659916c99Smartinh 		errno = EINVAL;
205759916c99Smartinh 		return NULL;
205859916c99Smartinh 	}
205959916c99Smartinh 
206059916c99Smartinh 	if (bt == NULL) {
206159916c99Smartinh 		if (txn == NULL) {
206259916c99Smartinh 			errno = EINVAL;
206359916c99Smartinh 			return NULL;
206459916c99Smartinh 		}
206559916c99Smartinh 		bt = txn->bt;
206659916c99Smartinh 	}
206759916c99Smartinh 
20685d465952Smartinh 	if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
20695d465952Smartinh 		SLIST_INIT(&cursor->stack);
20705d465952Smartinh 		cursor->bt = bt;
20715d465952Smartinh 		cursor->txn = txn;
2072408be415Smartinh 		btree_ref(bt);
20735d465952Smartinh 	}
20745d465952Smartinh 
20755d465952Smartinh 	return cursor;
20765d465952Smartinh }
20775d465952Smartinh 
20785d465952Smartinh void
20795d465952Smartinh btree_cursor_close(struct cursor *cursor)
20805d465952Smartinh {
20815d465952Smartinh 	if (cursor != NULL) {
20825d465952Smartinh 		while (!CURSOR_EMPTY(cursor))
20835d465952Smartinh 			cursor_pop_page(cursor);
20845d465952Smartinh 
20855d465952Smartinh 		btree_close(cursor->bt);
20865d465952Smartinh 		free(cursor);
20875d465952Smartinh 	}
20885d465952Smartinh }
20895d465952Smartinh 
20905d465952Smartinh static int
20915d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,
20925d465952Smartinh     struct btval *key)
20935d465952Smartinh {
20945d465952Smartinh 	indx_t			 ptr, i, numkeys;
20955d465952Smartinh 	int			 delta;
20965d465952Smartinh 	size_t			 len;
20975d465952Smartinh 	struct node		*node;
20985d465952Smartinh 	char			*base;
20995d465952Smartinh 
21005d465952Smartinh 	node = NODEPTR(mp, indx);
210137ee45a1Smartinh 	ptr = mp->page->ptrs[indx];
21025d465952Smartinh 	DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
21035d465952Smartinh 	    indx, ptr,
21045d465952Smartinh 	    (int)node->ksize, (char *)NODEKEY(node),
21055d465952Smartinh 	    (int)key->size, (char *)key->data,
21065d465952Smartinh 	    mp->pgno);
21075d465952Smartinh 
21085d465952Smartinh 	if (key->size != node->ksize) {
21095d465952Smartinh 		delta = key->size - node->ksize;
21105d465952Smartinh 		if (delta > 0 && SIZELEFT(mp) < delta) {
21115d465952Smartinh 			DPRINTF("OUCH! Not enough room, delta = %d", delta);
21125d465952Smartinh 			return BT_FAIL;
21135d465952Smartinh 		}
21145d465952Smartinh 
21155d465952Smartinh 		numkeys = NUMKEYS(mp);
21165d465952Smartinh 		for (i = 0; i < numkeys; i++) {
211737ee45a1Smartinh 			if (mp->page->ptrs[i] <= ptr)
211837ee45a1Smartinh 				mp->page->ptrs[i] -= delta;
21195d465952Smartinh 		}
21205d465952Smartinh 
212137ee45a1Smartinh 		base = (char *)mp->page + mp->page->upper;
212237ee45a1Smartinh 		len = ptr - mp->page->upper + NODESIZE;
21235d465952Smartinh 		bcopy(base, base - delta, len);
212437ee45a1Smartinh 		mp->page->upper -= delta;
21255d465952Smartinh 
21265d465952Smartinh 		node = NODEPTR(mp, indx);
21275d465952Smartinh 		node->ksize = key->size;
21285d465952Smartinh 	}
21295d465952Smartinh 
21305d465952Smartinh 	bcopy(key->data, NODEKEY(node), key->size);
21315d465952Smartinh 
21325d465952Smartinh 	return BT_SUCCESS;
21335d465952Smartinh }
21345d465952Smartinh 
21355d465952Smartinh static int
21365d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)
21375d465952Smartinh {
21385d465952Smartinh 	indx_t		 i;
21395d465952Smartinh 	struct node	*node;
21405d465952Smartinh 	struct btkey	 tmpkey;
21415d465952Smartinh 	struct btval	 key;
21425d465952Smartinh 
21435d465952Smartinh 	DPRINTF("adjusting prefix lengths on page %u with delta %d",
21445d465952Smartinh 	    src->pgno, delta);
21455d465952Smartinh 	assert(delta != 0);
21465d465952Smartinh 
21475d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
21485d465952Smartinh 		node = NODEPTR(src, i);
21495d465952Smartinh 		tmpkey.len = node->ksize - delta;
21505d465952Smartinh 		if (delta > 0) {
21515d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY))
21525d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);
21535d465952Smartinh 			else
21545d465952Smartinh 				bcopy((char *)NODEKEY(node) + delta, tmpkey.str,
21555d465952Smartinh 				    tmpkey.len);
21565d465952Smartinh 		} else {
21575d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
21585d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, node->ksize);
21595d465952Smartinh 				bcopy(src->prefix.str, tmpkey.str + node->ksize,
21605d465952Smartinh 				    -delta);
21615d465952Smartinh 			} else {
21625d465952Smartinh 				bcopy(src->prefix.str + src->prefix.len + delta,
21635d465952Smartinh 				    tmpkey.str, -delta);
21645d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str - delta,
21655d465952Smartinh 				    node->ksize);
21665d465952Smartinh 			}
21675d465952Smartinh 		}
21685d465952Smartinh 		key.size = tmpkey.len;
21695d465952Smartinh 		key.data = tmpkey.str;
21705d465952Smartinh 		if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)
21715d465952Smartinh 			return BT_FAIL;
21725d465952Smartinh 	}
21735d465952Smartinh 
21745d465952Smartinh 	return BT_SUCCESS;
21755d465952Smartinh }
21765d465952Smartinh 
21775d465952Smartinh /* Move a node from src to dst.
21785d465952Smartinh  */
21795d465952Smartinh static int
21805d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,
21815d465952Smartinh     struct mpage *dst, indx_t dstindx)
21825d465952Smartinh {
21835d465952Smartinh 	int			 rc;
21845d465952Smartinh 	unsigned int		 pfxlen, mp_pfxlen = 0;
21855d465952Smartinh 	struct node		*node, *srcnode;
218637ee45a1Smartinh 	struct mpage		*mp = NULL;
21875d465952Smartinh 	struct btkey		 tmpkey, srckey;
21885d465952Smartinh 	struct btval		 key, data;
21895d465952Smartinh 
21905d465952Smartinh 	assert(src->parent);
21915d465952Smartinh 	assert(dst->parent);
21925d465952Smartinh 
21935d465952Smartinh 	srcnode = NODEPTR(src, srcindx);
21945d465952Smartinh 	DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u",
21955d465952Smartinh 	    IS_LEAF(src) ? "leaf" : "branch",
21965d465952Smartinh 	    srcindx,
21975d465952Smartinh 	    (int)srcnode->ksize, (char *)NODEKEY(srcnode),
21985d465952Smartinh 	    src->pgno,
21995d465952Smartinh 	    dstindx, dst->pgno);
22005d465952Smartinh 
22015d465952Smartinh 	if (IS_BRANCH(src)) {
22025d465952Smartinh 		/* Need to check if the page the moved node points to
22035d465952Smartinh 		 * changes prefix.
22045d465952Smartinh 		 */
220537ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL)
220637ee45a1Smartinh 			return BT_FAIL;
22075d465952Smartinh 		mp->parent = src;
22085d465952Smartinh 		mp->parent_index = srcindx;
22095d465952Smartinh 		find_common_prefix(bt, mp);
22105d465952Smartinh 		mp_pfxlen = mp->prefix.len;
22115d465952Smartinh 	}
22125d465952Smartinh 
22135d465952Smartinh 	/* Mark src and dst as dirty. */
22145d465952Smartinh 	mpage_touch(bt, src);
22155d465952Smartinh 	mpage_touch(bt, dst);
22165d465952Smartinh 
22175d465952Smartinh 	find_common_prefix(bt, dst);
22185d465952Smartinh 
22195d465952Smartinh 	/* Check if src node has destination page prefix. Otherwise the
22205d465952Smartinh 	 * destination page must expand its prefix on all its nodes.
22215d465952Smartinh 	 */
22225d465952Smartinh 	srckey.len = srcnode->ksize;
22235d465952Smartinh 	bcopy(NODEKEY(srcnode), srckey.str, srckey.len);
22245d465952Smartinh 	common_prefix(bt, &srckey, &dst->prefix, &tmpkey);
22255d465952Smartinh 	if (tmpkey.len != dst->prefix.len) {
22265d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
22275d465952Smartinh 		    tmpkey.len - dst->prefix.len) != BT_SUCCESS)
22285d465952Smartinh 			return BT_FAIL;
22295d465952Smartinh 		bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));
22305d465952Smartinh 	}
22315d465952Smartinh 
22325d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
22335d465952Smartinh 		struct mpage	*low;
22345d465952Smartinh 
22355d465952Smartinh 		/* must find the lowest key below src
22365d465952Smartinh 		 */
22375d465952Smartinh 		assert(btree_search_page_root(bt, src, NULL, NULL, 0,
22385d465952Smartinh 		    &low) == BT_SUCCESS);
22395d465952Smartinh 		expand_prefix(bt, low, 0, &srckey);
22405d465952Smartinh 		DPRINTF("found lowest key [%.*s] on leaf page %u",
22415d465952Smartinh 		    (int)srckey.len, srckey.str, low->pgno);
22425d465952Smartinh 	} else {
22435d465952Smartinh 		srckey.len = srcnode->ksize;
22445d465952Smartinh 		bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);
22455d465952Smartinh 	}
22465d465952Smartinh 	find_common_prefix(bt, src);
22475d465952Smartinh 
22485d465952Smartinh 	/* expand the prefix */
22495d465952Smartinh 	tmpkey.len = sizeof(tmpkey.str);
22505d465952Smartinh 	concat_prefix(bt, src->prefix.str, src->prefix.len,
22515d465952Smartinh 	    srckey.str, srckey.len, tmpkey.str, &tmpkey.len);
22525d465952Smartinh 
22535d465952Smartinh 	/* Add the node to the destination page. Adjust prefix for
22545d465952Smartinh 	 * destination page.
22555d465952Smartinh 	 */
22565d465952Smartinh 	key.size = tmpkey.len;
22575d465952Smartinh 	key.data = tmpkey.str;
22585d465952Smartinh 	remove_prefix(bt, &key, dst->prefix.len);
22595d465952Smartinh 	data.size = NODEDSZ(srcnode);
22605d465952Smartinh 	data.data = NODEDATA(srcnode);
22615d465952Smartinh 	rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),
22625d465952Smartinh 	    srcnode->flags);
22635d465952Smartinh 	if (rc != BT_SUCCESS)
22645d465952Smartinh 		return rc;
22655d465952Smartinh 
22665d465952Smartinh 	/* Delete the node from the source page.
22675d465952Smartinh 	 */
22685d465952Smartinh 	btree_del_node(bt, src, srcindx);
22695d465952Smartinh 
22705d465952Smartinh 	/* Update the parent separators.
22715d465952Smartinh 	 */
22725d465952Smartinh 	if (srcindx == 0 && src->parent_index != 0) {
22735d465952Smartinh 		node = NODEPTR(src->parent, src->parent_index);
22745d465952Smartinh 		DPRINTF("current parent separator for source page %u is [%.*s]",
22755d465952Smartinh 		    src->pgno,
22765d465952Smartinh 		    (int)node->ksize, (char *)NODEKEY(node));
22775d465952Smartinh 
22785d465952Smartinh 		expand_prefix(bt, src, 0, &tmpkey);
22795d465952Smartinh 		key.size = tmpkey.len;
22805d465952Smartinh 		key.data = tmpkey.str;
22815d465952Smartinh 		remove_prefix(bt, &key, src->parent->prefix.len);
22825d465952Smartinh 
22835d465952Smartinh 		DPRINTF("update separator for source page %u to [%.*s]",
22845d465952Smartinh 		    src->pgno, (int)key.size, (char *)key.data);
22855d465952Smartinh 		if (btree_update_key(bt, src->parent, src->parent_index,
22865d465952Smartinh 		    &key) != BT_SUCCESS)
22875d465952Smartinh 			return BT_FAIL;
22885d465952Smartinh 	}
22895d465952Smartinh 
22905d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
22915d465952Smartinh 		struct btval	 nullkey;
22925d465952Smartinh 		nullkey.size = 0;
22935d465952Smartinh 		assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);
22945d465952Smartinh 	}
22955d465952Smartinh 
22965d465952Smartinh 	if (dstindx == 0 && dst->parent_index != 0) {
22975d465952Smartinh 		node = NODEPTR(dst->parent, dst->parent_index);
22985d465952Smartinh 		DPRINTF("current parent separator for destination page %u is [%.*s]",
22995d465952Smartinh 		    dst->pgno,
23005d465952Smartinh 		    (int)node->ksize, (char *)NODEKEY(node));
23015d465952Smartinh 
23025d465952Smartinh 		expand_prefix(bt, dst, 0, &tmpkey);
23035d465952Smartinh 		key.size = tmpkey.len;
23045d465952Smartinh 		key.data = tmpkey.str;
23055d465952Smartinh 		remove_prefix(bt, &key, dst->parent->prefix.len);
23065d465952Smartinh 
23075d465952Smartinh 		DPRINTF("update separator for destination page %u to [%.*s]",
23085d465952Smartinh 		    dst->pgno, (int)key.size, (char *)key.data);
23095d465952Smartinh 		if (btree_update_key(bt, dst->parent, dst->parent_index,
23105d465952Smartinh 		    &key) != BT_SUCCESS)
23115d465952Smartinh 			return BT_FAIL;
23125d465952Smartinh 	}
23135d465952Smartinh 
23145d465952Smartinh 	if (dstindx == 0 && IS_BRANCH(dst)) {
23155d465952Smartinh 		struct btval	 nullkey;
23165d465952Smartinh 		nullkey.size = 0;
23175d465952Smartinh 		assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);
23185d465952Smartinh 	}
23195d465952Smartinh 
23205d465952Smartinh 	/* We can get a new page prefix here!
23215d465952Smartinh 	 * Must update keys in all nodes of this page!
23225d465952Smartinh 	 */
23235d465952Smartinh 	pfxlen = src->prefix.len;
23245d465952Smartinh 	find_common_prefix(bt, src);
23255d465952Smartinh 	if (src->prefix.len != pfxlen) {
23265d465952Smartinh 		if (btree_adjust_prefix(bt, src,
23275d465952Smartinh 		    src->prefix.len - pfxlen) != BT_SUCCESS)
23285d465952Smartinh 			return BT_FAIL;
23295d465952Smartinh 	}
23305d465952Smartinh 
23315d465952Smartinh 	pfxlen = dst->prefix.len;
23325d465952Smartinh 	find_common_prefix(bt, dst);
23335d465952Smartinh 	if (dst->prefix.len != pfxlen) {
23345d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
23355d465952Smartinh 		    dst->prefix.len - pfxlen) != BT_SUCCESS)
23365d465952Smartinh 			return BT_FAIL;
23375d465952Smartinh 	}
23385d465952Smartinh 
23395d465952Smartinh 	if (IS_BRANCH(dst)) {
23405d465952Smartinh 		mp->parent = dst;
23415d465952Smartinh 		mp->parent_index = dstindx;
23425d465952Smartinh 		find_common_prefix(bt, mp);
23435d465952Smartinh 		if (mp->prefix.len != mp_pfxlen) {
23445d465952Smartinh 			DPRINTF("moved branch node has changed prefix");
23455d465952Smartinh 			mpage_touch(bt, mp);
23465d465952Smartinh 			if (btree_adjust_prefix(bt, mp,
23475d465952Smartinh 			    mp->prefix.len - mp_pfxlen) != BT_SUCCESS)
23485d465952Smartinh 				return BT_FAIL;
23495d465952Smartinh 		}
23505d465952Smartinh 	}
23515d465952Smartinh 
23525d465952Smartinh 	return BT_SUCCESS;
23535d465952Smartinh }
23545d465952Smartinh 
23555d465952Smartinh static int
23565d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)
23575d465952Smartinh {
23585d465952Smartinh 	int			 rc;
23595d465952Smartinh 	indx_t			 i;
2360a33d639aSmartinh 	unsigned int		 pfxlen;
23615d465952Smartinh 	struct node		*srcnode;
23625d465952Smartinh 	struct btkey		 tmpkey, dstpfx;
23635d465952Smartinh 	struct btval		 key, data;
23645d465952Smartinh 
23655d465952Smartinh 	DPRINTF("merging page %u and %u", src->pgno, dst->pgno);
23665d465952Smartinh 
23675d465952Smartinh 	assert(src->parent);	/* can't merge root page */
23685d465952Smartinh 	assert(dst->parent);
23695d465952Smartinh 	assert(bt->txn != NULL);
23705d465952Smartinh 
23715d465952Smartinh 	/* Mark src and dst as dirty. */
23725d465952Smartinh 	mpage_touch(bt, src);
23735d465952Smartinh 	mpage_touch(bt, dst);
23745d465952Smartinh 
23755d465952Smartinh 	find_common_prefix(bt, src);
23765d465952Smartinh 	find_common_prefix(bt, dst);
23775d465952Smartinh 
23785d465952Smartinh 	/* Check if source nodes has destination page prefix. Otherwise
23795d465952Smartinh 	 * the destination page must expand its prefix on all its nodes.
23805d465952Smartinh 	 */
23815d465952Smartinh 	common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);
23825d465952Smartinh 	if (dstpfx.len != dst->prefix.len) {
23835d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
23845d465952Smartinh 		    dstpfx.len - dst->prefix.len) != BT_SUCCESS)
23855d465952Smartinh 			return BT_FAIL;
23865d465952Smartinh 		bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));
23875d465952Smartinh 	}
23885d465952Smartinh 
23895d465952Smartinh 	/* Move all nodes from src to dst.
23905d465952Smartinh 	 */
23915d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
23925d465952Smartinh 		srcnode = NODEPTR(src, i);
23935d465952Smartinh 
23945d465952Smartinh 		/* If branch node 0 (implicit key), find the real key.
23955d465952Smartinh 		 */
23965d465952Smartinh 		if (i == 0 && IS_BRANCH(src)) {
23975d465952Smartinh 			struct mpage	*low;
23985d465952Smartinh 
23995d465952Smartinh 			/* must find the lowest key below src
24005d465952Smartinh 			 */
24015d465952Smartinh 			assert(btree_search_page_root(bt, src, NULL, NULL, 0,
24025d465952Smartinh 			    &low) == BT_SUCCESS);
24035d465952Smartinh 			expand_prefix(bt, low, 0, &tmpkey);
24045d465952Smartinh 			DPRINTF("found lowest key [%.*s] on leaf page %u",
24055d465952Smartinh 			    (int)tmpkey.len, tmpkey.str, low->pgno);
24065d465952Smartinh 		} else {
24075d465952Smartinh 			expand_prefix(bt, src, i, &tmpkey);
24085d465952Smartinh 		}
24095d465952Smartinh 
24105d465952Smartinh 		key.size = tmpkey.len;
24115d465952Smartinh 		key.data = tmpkey.str;
24125d465952Smartinh 
24135d465952Smartinh 		remove_prefix(bt, &key, dst->prefix.len);
24145d465952Smartinh 		data.size = NODEDSZ(srcnode);
24155d465952Smartinh 		data.data = NODEDATA(srcnode);
24165d465952Smartinh 		rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,
24175d465952Smartinh 		    &data, NODEPGNO(srcnode), srcnode->flags);
24185d465952Smartinh 		if (rc != BT_SUCCESS)
24195d465952Smartinh 			return rc;
24205d465952Smartinh 	}
24215d465952Smartinh 
24225d465952Smartinh 	DPRINTF("dst page %u now has %lu keys (%.1f%% filled)",
242337ee45a1Smartinh 	    dst->pgno, NUMKEYS(dst), PAGEFILL(bt, dst) * 100);
24245d465952Smartinh 
24255d465952Smartinh 	/* Unlink the src page from parent.
24265d465952Smartinh 	 */
24275d465952Smartinh 	btree_del_node(bt, src->parent, src->parent_index);
24285d465952Smartinh 	if (src->parent_index == 0) {
24295d465952Smartinh 		key.size = 0;
24305d465952Smartinh 		if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)
24315d465952Smartinh 			return BT_FAIL;
24325d465952Smartinh 
2433a33d639aSmartinh 		pfxlen = src->prefix.len;
24345d465952Smartinh 		find_common_prefix(bt, src);
24355d465952Smartinh 		assert (src->prefix.len == pfxlen);
24365d465952Smartinh 	}
24375d465952Smartinh 
24385d465952Smartinh 	if (IS_LEAF(src))
243937ee45a1Smartinh 		bt->meta.leaf_pages--;
24405d465952Smartinh 	else
244137ee45a1Smartinh 		bt->meta.branch_pages--;
24425d465952Smartinh 
24435d465952Smartinh 	return btree_rebalance(bt, src->parent);
24445d465952Smartinh }
24455d465952Smartinh 
24465d465952Smartinh #define FILL_THRESHOLD	 0.25
24475d465952Smartinh 
24485d465952Smartinh static int
24495d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp)
24505d465952Smartinh {
245137ee45a1Smartinh 	struct node	*node;
24525d465952Smartinh 	struct mpage	*parent;
245337ee45a1Smartinh 	struct mpage	*root;
24545d465952Smartinh 	struct mpage	*neighbor = NULL;
245537ee45a1Smartinh 	indx_t		 si = 0, di = 0;
24565d465952Smartinh 
24575d465952Smartinh 	assert(bt != NULL);
24585d465952Smartinh 	assert(bt->txn != NULL);
24595d465952Smartinh 	assert(mp != NULL);
24605d465952Smartinh 
24615d465952Smartinh 	DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)",
24625d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
246337ee45a1Smartinh 	    mp->pgno, NUMKEYS(mp), PAGEFILL(bt, mp) * 100);
24645d465952Smartinh 
246537ee45a1Smartinh 	if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) {
24665d465952Smartinh 		DPRINTF("no need to rebalance page %u, above fill threshold",
24675d465952Smartinh 		    mp->pgno);
24685d465952Smartinh 		return BT_SUCCESS;
24695d465952Smartinh 	}
24705d465952Smartinh 
24715d465952Smartinh 	parent = mp->parent;
24725d465952Smartinh 
24735d465952Smartinh 	if (parent == NULL) {
24745d465952Smartinh 		if (NUMKEYS(mp) == 0) {
24755d465952Smartinh 			DPRINTF("tree is completely empty");
24765d465952Smartinh 			bt->txn->root = P_INVALID;
247737ee45a1Smartinh 			bt->meta.depth--;
247837ee45a1Smartinh 			bt->meta.leaf_pages--;
24795d465952Smartinh 		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
24805d465952Smartinh 			DPRINTF("collapsing root page!");
24815d465952Smartinh 			bt->txn->root = NODEPGNO(NODEPTR(mp, 0));
248237ee45a1Smartinh 			if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL)
248337ee45a1Smartinh 				return BT_FAIL;
248437ee45a1Smartinh 			root->parent = NULL;
248537ee45a1Smartinh 			bt->meta.depth--;
248637ee45a1Smartinh 			bt->meta.branch_pages--;
24875d465952Smartinh 		} else
24885d465952Smartinh 			DPRINTF("root page doesn't need rebalancing");
24895d465952Smartinh 		return BT_SUCCESS;
24905d465952Smartinh 	}
24915d465952Smartinh 
24925d465952Smartinh 	/* The parent (branch page) must have at least 2 pointers,
24935d465952Smartinh 	 * otherwise the tree is invalid.
24945d465952Smartinh 	 */
24955d465952Smartinh 	assert(NUMKEYS(parent) > 1);
24965d465952Smartinh 
24975d465952Smartinh 	/* Leaf page fill factor is below the threshold.
24985d465952Smartinh 	 * Try to move keys from left or right neighbor, or
24995d465952Smartinh 	 * merge with a neighbor page.
25005d465952Smartinh 	 */
25015d465952Smartinh 
25025d465952Smartinh 	/* Find neighbors.
25035d465952Smartinh 	 */
25045d465952Smartinh 	if (mp->parent_index == 0) {
25055d465952Smartinh 		/* We're the leftmost leaf in our parent.
25065d465952Smartinh 		 */
25075d465952Smartinh 		DPRINTF("reading right neighbor");
250837ee45a1Smartinh 		node = NODEPTR(parent, mp->parent_index + 1);
250937ee45a1Smartinh 		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25105d465952Smartinh 			return BT_FAIL;
25115d465952Smartinh 		neighbor->parent_index = mp->parent_index + 1;
25125d465952Smartinh 		si = 0;
25135d465952Smartinh 		di = NUMKEYS(mp);
25145d465952Smartinh 	} else {
25155d465952Smartinh 		/* There is at least one neighbor to the left.
25165d465952Smartinh 		 */
25175d465952Smartinh 		DPRINTF("reading left neighbor");
251837ee45a1Smartinh 		node = NODEPTR(parent, mp->parent_index - 1);
251937ee45a1Smartinh 		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25205d465952Smartinh 			return BT_FAIL;
25215d465952Smartinh 		neighbor->parent_index = mp->parent_index - 1;
25225d465952Smartinh 		si = NUMKEYS(neighbor) - 1;
25235d465952Smartinh 		di = 0;
25245d465952Smartinh 	}
25255d465952Smartinh 	neighbor->parent = parent;
25265d465952Smartinh 
25275d465952Smartinh 	DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
252837ee45a1Smartinh 	    neighbor->pgno, NUMKEYS(neighbor), PAGEFILL(bt, neighbor) * 100);
25295d465952Smartinh 
25305d465952Smartinh 	/* If the neighbor page is above threshold and has at least two
25315d465952Smartinh 	 * keys, move one key from it.
25325d465952Smartinh 	 *
25335d465952Smartinh 	 * Otherwise we should try to merge them, but that might not be
25345d465952Smartinh 	 * possible, even if both are below threshold, as prefix expansion
25355d465952Smartinh 	 * might make keys larger. FIXME: detect this
25365d465952Smartinh 	 */
253737ee45a1Smartinh 	if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD &&
25385d465952Smartinh 	    NUMKEYS(neighbor) >= NUMKEYS(mp) + 2)
25395d465952Smartinh 		return btree_move_node(bt, neighbor, si, mp, di);
25405d465952Smartinh 	else { /* FIXME: if (has_enough_room()) */
25415d465952Smartinh 		if (mp->parent_index == 0)
25425d465952Smartinh 			return btree_merge(bt, neighbor, mp);
25435d465952Smartinh 		else
25445d465952Smartinh 			return btree_merge(bt, mp, neighbor);
25455d465952Smartinh 	}
25465d465952Smartinh }
25475d465952Smartinh 
25485d465952Smartinh int
25495d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn,
25505d465952Smartinh     struct btval *key, struct btval *data)
25515d465952Smartinh {
25525d465952Smartinh 	int		 rc, exact, close_txn = 0;
25535d465952Smartinh 	unsigned int	 ki;
25545d465952Smartinh 	struct node	*leaf;
25555d465952Smartinh 	struct mpage	*mp;
25565d465952Smartinh 
25575d465952Smartinh 	DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);
25585d465952Smartinh 
25595d465952Smartinh 	assert(key != NULL);
25605d465952Smartinh 
256159916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
256259916c99Smartinh 		errno = EINVAL;
256359916c99Smartinh 		return BT_FAIL;
256459916c99Smartinh 	}
256559916c99Smartinh 
25667e04cf1eSmartinh 	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
25677e04cf1eSmartinh 		errno = EINVAL;
25687e04cf1eSmartinh 		return BT_FAIL;
25697e04cf1eSmartinh 	}
25707e04cf1eSmartinh 
257159916c99Smartinh 	if (bt == NULL) {
257259916c99Smartinh 		if (txn == NULL) {
257359916c99Smartinh 			errno = EINVAL;
257459916c99Smartinh 			return BT_FAIL;
257559916c99Smartinh 		}
257659916c99Smartinh 		bt = txn->bt;
257759916c99Smartinh 	}
257859916c99Smartinh 
25795d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
25805d465952Smartinh 		errno = EINVAL;
25815d465952Smartinh 		return BT_FAIL;
25825d465952Smartinh 	}
25835d465952Smartinh 
25845d465952Smartinh 	if (txn == NULL) {
25855d465952Smartinh 		close_txn = 1;
25865d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
25875d465952Smartinh 			return BT_FAIL;
25885d465952Smartinh 	}
25895d465952Smartinh 
25905d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)
25915d465952Smartinh 		goto done;
25925d465952Smartinh 
25935d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, &ki);
25945d465952Smartinh 	if (leaf == NULL || !exact) {
25950279e526Smartinh 		errno = ENOENT;
25960279e526Smartinh 		rc = BT_FAIL;
25975d465952Smartinh 		goto done;
25985d465952Smartinh 	}
25995d465952Smartinh 
26005d465952Smartinh 	if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
26015d465952Smartinh 		goto done;
26025d465952Smartinh 
26035d465952Smartinh 	btree_del_node(bt, mp, ki);
260437ee45a1Smartinh 	bt->meta.entries--;
26055d465952Smartinh 	rc = btree_rebalance(bt, mp);
26065d465952Smartinh 	if (rc != BT_SUCCESS)
26075d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
26085d465952Smartinh 
26095d465952Smartinh done:
26105d465952Smartinh 	if (close_txn) {
26115d465952Smartinh 		if (rc == BT_SUCCESS)
26125d465952Smartinh 			rc = btree_txn_commit(txn);
26135d465952Smartinh 		else
26145d465952Smartinh 			btree_txn_abort(txn);
26155d465952Smartinh 	}
26165d465952Smartinh 	mpage_prune(bt);
26175d465952Smartinh 	return rc;
26185d465952Smartinh }
26195d465952Smartinh 
26205d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that
26215d465952Smartinh  * still makes it uniquely distinguishable from <min>.
26225d465952Smartinh  *
26235d465952Smartinh  * <min> is guaranteed to be sorted less than <sep>
26245d465952Smartinh  *
26255d465952Smartinh  * On return, <sep> is modified to the minimum length.
26265d465952Smartinh  */
26275d465952Smartinh static void
26285d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
26295d465952Smartinh {
26305d465952Smartinh 	size_t		 n = 0;
26315d465952Smartinh 	char		*p1;
26325d465952Smartinh 	char		*p2;
26335d465952Smartinh 
26345d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
26355d465952Smartinh 
26365d465952Smartinh 		assert(sep->size > 0);
26375d465952Smartinh 
26385d465952Smartinh 		p1 = (char *)NODEKEY(min) + min->ksize - 1;
26395d465952Smartinh 		p2 = (char *)sep->data + sep->size - 1;
26405d465952Smartinh 
26415d465952Smartinh 		while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
26425d465952Smartinh 			assert(p2 > (char *)sep->data);
26435d465952Smartinh 			p1--;
26445d465952Smartinh 			p2--;
26455d465952Smartinh 			n++;
26465d465952Smartinh 		}
26475d465952Smartinh 
26485d465952Smartinh 		sep->data = p2;
26495d465952Smartinh 		sep->size = n + 1;
26505d465952Smartinh 	} else {
26515d465952Smartinh 
26525d465952Smartinh 		assert(min->ksize > 0);
26535d465952Smartinh 		assert(sep->size > 0);
26545d465952Smartinh 
26555d465952Smartinh 		p1 = (char *)NODEKEY(min);
26565d465952Smartinh 		p2 = (char *)sep->data;
26575d465952Smartinh 
26585d465952Smartinh 		while (*p1 == *p2) {
26595d465952Smartinh 			p1++;
26605d465952Smartinh 			p2++;
26615d465952Smartinh 			n++;
26625d465952Smartinh 			if (n == min->ksize || n == sep->size)
26635d465952Smartinh 				break;
26645d465952Smartinh 		}
26655d465952Smartinh 
26665d465952Smartinh 		sep->size = n + 1;
26675d465952Smartinh 	}
26685d465952Smartinh 
26695d465952Smartinh 	DPRINTF("reduced separator to [%.*s] > [%.*s]",
26705d465952Smartinh 	    (int)sep->size, (char *)sep->data,
26715d465952Smartinh 	    (int)min->ksize, (char *)NODEKEY(min));
26725d465952Smartinh }
26735d465952Smartinh 
26745d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
26755d465952Smartinh  * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
26765d465952Smartinh  * *newindxp with the actual values after split, ie if *mpp and *newindxp
26775d465952Smartinh  * refer to a node in the new right sibling page.
26785d465952Smartinh  */
26795d465952Smartinh static int
26805d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
26815d465952Smartinh     struct btval *newkey, struct btval *newdata, pgno_t newpgno)
26825d465952Smartinh {
26835d465952Smartinh 	uint8_t		 flags;
26845d465952Smartinh 	int		 rc = BT_SUCCESS, ins_new = 0;
26855d465952Smartinh 	indx_t		 newindx;
26865d465952Smartinh 	pgno_t		 pgno = 0;
26875d465952Smartinh 	size_t		 orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
26885d465952Smartinh 	unsigned int	 i, j, split_indx;
26895d465952Smartinh 	struct node	*node;
26905d465952Smartinh 	struct mpage	*pright, *p, *mp;
26915d465952Smartinh 	struct btval	 sepkey, rkey, rdata;
26925d465952Smartinh 	struct btkey	 tmpkey;
269337ee45a1Smartinh 	struct page	*copy;
26945d465952Smartinh 
26955d465952Smartinh 	assert(bt != NULL);
26965d465952Smartinh 	assert(bt->txn != NULL);
26975d465952Smartinh 
26985d465952Smartinh 	mp = *mpp;
26995d465952Smartinh 	newindx = *newindxp;
27005d465952Smartinh 
27015d465952Smartinh 	DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i",
27025d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
27035d465952Smartinh 	    (int)newkey->size, (char *)newkey->data, *newindxp);
27045d465952Smartinh 	DPRINTF("page %u has prefix [%.*s]", mp->pgno,
27055d465952Smartinh 	    (int)mp->prefix.len, (char *)mp->prefix.str);
27065d465952Smartinh 	orig_pfx_len = mp->prefix.len;
27075d465952Smartinh 
27085d465952Smartinh 	if (mp->parent == NULL) {
27095d465952Smartinh 		if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
27105d465952Smartinh 			return BT_FAIL;
27115d465952Smartinh 		mp->parent_index = 0;
27125d465952Smartinh 		bt->txn->root = mp->parent->pgno;
27135d465952Smartinh 		DPRINTF("root split! new root = %u", mp->parent->pgno);
271437ee45a1Smartinh 		bt->meta.depth++;
27155d465952Smartinh 
27165d465952Smartinh 		/* Add left (implicit) pointer. */
27175d465952Smartinh 		if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
27185d465952Smartinh 		    mp->pgno, 0) != BT_SUCCESS)
27195d465952Smartinh 			return BT_FAIL;
27205d465952Smartinh 	} else {
27215d465952Smartinh 		DPRINTF("parent branch page is %u", mp->parent->pgno);
27225d465952Smartinh 	}
27235d465952Smartinh 
27245d465952Smartinh 	/* Create a right sibling. */
272537ee45a1Smartinh 	if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)
27265d465952Smartinh 		return BT_FAIL;
27275d465952Smartinh 	pright->parent = mp->parent;
27285d465952Smartinh 	pright->parent_index = mp->parent_index + 1;
27295d465952Smartinh 	DPRINTF("new right sibling: page %u", pright->pgno);
27305d465952Smartinh 
27315d465952Smartinh 	/* Move half of the keys to the right sibling. */
273237ee45a1Smartinh 	if ((copy = malloc(bt->head.psize)) == NULL)
273337ee45a1Smartinh 		return BT_FAIL;
273437ee45a1Smartinh 	bcopy(mp->page, copy, bt->head.psize);
27355d465952Smartinh 	assert(mp->ref == 0);				/* XXX */
273637ee45a1Smartinh 	bzero(&mp->page->ptrs, bt->head.psize - PAGEHDRSZ);
273737ee45a1Smartinh 	mp->page->lower = PAGEHDRSZ;
273837ee45a1Smartinh 	mp->page->upper = bt->head.psize;
27395d465952Smartinh 
274037ee45a1Smartinh 	split_indx = NUMKEYSP(copy) / 2 + 1;
27415d465952Smartinh 
27425d465952Smartinh 	/* First find the separating key between the split pages.
27435d465952Smartinh 	 */
27445d465952Smartinh 	bzero(&sepkey, sizeof(sepkey));
27455d465952Smartinh 	if (newindx == split_indx) {
27465d465952Smartinh 		sepkey.size = newkey->size;
27475d465952Smartinh 		sepkey.data = newkey->data;
27485d465952Smartinh 		remove_prefix(bt, &sepkey, mp->prefix.len);
27495d465952Smartinh 	} else {
275037ee45a1Smartinh 		node = NODEPTRP(copy, split_indx);
27515d465952Smartinh 		sepkey.size = node->ksize;
27525d465952Smartinh 		sepkey.data = NODEKEY(node);
27535d465952Smartinh 	}
27545d465952Smartinh 
27555d465952Smartinh 	if (IS_LEAF(mp) && bt->cmp == NULL) {
27565d465952Smartinh 		/* Find the smallest separator. */
27575d465952Smartinh 		/* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
275837ee45a1Smartinh 		node = NODEPTRP(copy, split_indx - 1);
27595d465952Smartinh 		bt_reduce_separator(bt, node, &sepkey);
27605d465952Smartinh 	}
27615d465952Smartinh 
27625d465952Smartinh 	/* Fix separator wrt parent prefix. */
27635d465952Smartinh 	if (bt->cmp == NULL) {
27645d465952Smartinh 		tmpkey.len = sizeof(tmpkey.str);
27655d465952Smartinh 		concat_prefix(bt, mp->prefix.str, mp->prefix.len,
27665d465952Smartinh 		    sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);
27675d465952Smartinh 		sepkey.data = tmpkey.str;
27685d465952Smartinh 		sepkey.size = tmpkey.len;
27695d465952Smartinh 	}
27705d465952Smartinh 
27715d465952Smartinh 	DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);
27725d465952Smartinh 
27735d465952Smartinh 	/* Copy separator key to the parent.
27745d465952Smartinh 	 */
277537ee45a1Smartinh 	if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) {
27765d465952Smartinh 		rc = btree_split(bt, &pright->parent, &pright->parent_index,
27775d465952Smartinh 		    &sepkey, NULL, pright->pgno);
27785d465952Smartinh 
27795d465952Smartinh 		/* Right page might now have changed parent.
27805d465952Smartinh 		 * Check if left page also changed parent.
27815d465952Smartinh 		 */
27825d465952Smartinh 		if (pright->parent != mp->parent &&
27835d465952Smartinh 		    mp->parent_index >= NUMKEYS(mp->parent)) {
27845d465952Smartinh 			mp->parent = pright->parent;
27855d465952Smartinh 			mp->parent_index = pright->parent_index - 1;
27865d465952Smartinh 		}
27875d465952Smartinh 	} else {
27885d465952Smartinh 		remove_prefix(bt, &sepkey, pright->parent->prefix.len);
27895d465952Smartinh 		rc = btree_add_node(bt, pright->parent, pright->parent_index,
27905d465952Smartinh 		    &sepkey, NULL, pright->pgno, 0);
27915d465952Smartinh 	}
279237ee45a1Smartinh 	if (rc != BT_SUCCESS) {
279337ee45a1Smartinh 		free(copy);
27945d465952Smartinh 		return BT_FAIL;
279537ee45a1Smartinh 	}
27965d465952Smartinh 
27975d465952Smartinh 	/* Update prefix for right and left page, if the parent was split.
27985d465952Smartinh 	 */
27995d465952Smartinh 	find_common_prefix(bt, pright);
28005d465952Smartinh 	assert(orig_pfx_len <= pright->prefix.len);
28015d465952Smartinh 	right_pfx_diff = pright->prefix.len - orig_pfx_len;
28025d465952Smartinh 
28035d465952Smartinh 	find_common_prefix(bt, mp);
28045d465952Smartinh 	assert(orig_pfx_len <= mp->prefix.len);
28055d465952Smartinh 	left_pfx_diff = mp->prefix.len - orig_pfx_len;
28065d465952Smartinh 
280737ee45a1Smartinh 	for (i = j = 0; i <= NUMKEYSP(copy); j++) {
28085d465952Smartinh 		if (i < split_indx) {
28095d465952Smartinh 			/* Re-insert in left sibling. */
28105d465952Smartinh 			p = mp;
28115d465952Smartinh 			pfx_diff = left_pfx_diff;
28125d465952Smartinh 		} else {
28135d465952Smartinh 			/* Insert in right sibling. */
28145d465952Smartinh 			if (i == split_indx)
28155d465952Smartinh 				/* Reset insert index for right sibling. */
28165d465952Smartinh 				j = (i == newindx && ins_new);
28175d465952Smartinh 			p = pright;
28185d465952Smartinh 			pfx_diff = right_pfx_diff;
28195d465952Smartinh 		}
28205d465952Smartinh 
28215d465952Smartinh 		if (i == newindx && !ins_new) {
28225d465952Smartinh 			/* Insert the original entry that caused the split. */
28235d465952Smartinh 			rkey.data = newkey->data;
28245d465952Smartinh 			rkey.size = newkey->size;
28255d465952Smartinh 			if (IS_LEAF(mp)) {
28265d465952Smartinh 				rdata.data = newdata->data;
28275d465952Smartinh 				rdata.size = newdata->size;
28285d465952Smartinh 			} else
28295d465952Smartinh 				pgno = newpgno;
28305d465952Smartinh 			flags = 0;
28315d465952Smartinh 			pfx_diff = p->prefix.len;
28325d465952Smartinh 
28335d465952Smartinh 			ins_new = 1;
28345d465952Smartinh 
28355d465952Smartinh 			/* Update page and index for the new key. */
28365d465952Smartinh 			*newindxp = j;
28375d465952Smartinh 			*mpp = p;
283837ee45a1Smartinh 		} else if (i == NUMKEYSP(copy)) {
28395d465952Smartinh 			break;
28405d465952Smartinh 		} else {
284137ee45a1Smartinh 			node = NODEPTRP(copy, i);
28425d465952Smartinh 			rkey.data = NODEKEY(node);
28435d465952Smartinh 			rkey.size = node->ksize;
28445d465952Smartinh 			if (IS_LEAF(mp)) {
28455d465952Smartinh 				rdata.data = NODEDATA(node);
28465d465952Smartinh 				rdata.size = node->n_dsize;
28475d465952Smartinh 			} else
28485d465952Smartinh 				pgno = node->n_pgno;
28495d465952Smartinh 			flags = node->flags;
28505d465952Smartinh 
28515d465952Smartinh 			i++;
28525d465952Smartinh 		}
28535d465952Smartinh 
28545d465952Smartinh 		if (!IS_LEAF(mp) && j == 0) {
28555d465952Smartinh 			/* First branch index doesn't need key data. */
28565d465952Smartinh 			rkey.size = 0;
28575d465952Smartinh 		} else
28585d465952Smartinh 			remove_prefix(bt, &rkey, pfx_diff);
28595d465952Smartinh 
28605d465952Smartinh 		rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
28615d465952Smartinh 	}
28625d465952Smartinh 
286337ee45a1Smartinh 	free(copy);
28645d465952Smartinh 	return rc;
28655d465952Smartinh }
28665d465952Smartinh 
28675d465952Smartinh int
28685d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn,
28695d465952Smartinh     struct btval *key, struct btval *data, unsigned int flags)
28705d465952Smartinh {
28715d465952Smartinh 	int		 rc = BT_SUCCESS, exact, close_txn = 0;
28725d465952Smartinh 	unsigned int	 ki;
28735d465952Smartinh 	struct node	*leaf;
28745d465952Smartinh 	struct mpage	*mp;
28755d465952Smartinh 	struct btval	 xkey;
28765d465952Smartinh 
28775d465952Smartinh 	assert(key != NULL);
28785d465952Smartinh 	assert(data != NULL);
28795d465952Smartinh 
288059916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
288159916c99Smartinh 		errno = EINVAL;
288259916c99Smartinh 		return BT_FAIL;
288359916c99Smartinh 	}
288459916c99Smartinh 
28857e04cf1eSmartinh 	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
28867e04cf1eSmartinh 		errno = EINVAL;
28877e04cf1eSmartinh 		return BT_FAIL;
28887e04cf1eSmartinh 	}
28897e04cf1eSmartinh 
289059916c99Smartinh 	if (bt == NULL) {
289159916c99Smartinh 		if (txn == NULL) {
289259916c99Smartinh 			errno = EINVAL;
289359916c99Smartinh 			return BT_FAIL;
289459916c99Smartinh 		}
289559916c99Smartinh 		bt = txn->bt;
289659916c99Smartinh 	}
289759916c99Smartinh 
28985d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
28995d465952Smartinh 		errno = EINVAL;
29005d465952Smartinh 		return BT_FAIL;
29015d465952Smartinh 	}
29025d465952Smartinh 
29035d465952Smartinh 	DPRINTF("==> put key %.*s, size %zu, data size %zu",
29045d465952Smartinh 		(int)key->size, (char *)key->data, key->size, data->size);
29055d465952Smartinh 
29065d465952Smartinh 	if (txn == NULL) {
29075d465952Smartinh 		close_txn = 1;
29085d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
29095d465952Smartinh 			return BT_FAIL;
29105d465952Smartinh 	}
29115d465952Smartinh 
29125d465952Smartinh 	rc = btree_search_page(bt, txn, key, NULL, 1, &mp);
29135d465952Smartinh 	if (rc == BT_SUCCESS) {
29145d465952Smartinh 		leaf = btree_search_node(bt, mp, key, &exact, &ki);
29155d465952Smartinh 		if (leaf && exact) {
29165d465952Smartinh 			if (F_ISSET(flags, BT_NOOVERWRITE)) {
29175d465952Smartinh 				DPRINTF("duplicate key %.*s",
29185d465952Smartinh 				    (int)key->size, (char *)key->data);
29190279e526Smartinh 				errno = EEXIST;
29200279e526Smartinh 				rc = BT_FAIL;
29215d465952Smartinh 				goto done;
29225d465952Smartinh 			}
29235d465952Smartinh 			btree_del_node(bt, mp, ki);
29245d465952Smartinh 		}
29255d465952Smartinh 		if (leaf == NULL) {		/* append if not found */
29265d465952Smartinh 			ki = NUMKEYS(mp);
29275d465952Smartinh 			DPRINTF("appending key at index %i", ki);
29285d465952Smartinh 		}
29290279e526Smartinh 	} else if (errno == ENOENT) {
29305d465952Smartinh 		/* new file, just write a root leaf page */
29315d465952Smartinh 		DPRINTF("allocating new root leaf page");
29325d465952Smartinh 		if ((mp = btree_new_page(bt, P_LEAF)) == NULL) {
29335d465952Smartinh 			rc = BT_FAIL;
29345d465952Smartinh 			goto done;
29355d465952Smartinh 		}
29365d465952Smartinh 		txn->root = mp->pgno;
293737ee45a1Smartinh 		bt->meta.depth++;
29385d465952Smartinh 		ki = 0;
29395d465952Smartinh 	}
29405d465952Smartinh 	else
29415d465952Smartinh 		goto done;
29425d465952Smartinh 
29435d465952Smartinh 	assert(IS_LEAF(mp));
29445d465952Smartinh 	DPRINTF("there are %lu keys, should insert new key at index %i",
29455d465952Smartinh 		NUMKEYS(mp), ki);
29465d465952Smartinh 
29475d465952Smartinh 	/* Copy the key pointer as it is modified by the prefix code. The
29485d465952Smartinh 	 * caller might have malloc'ed the data.
29495d465952Smartinh 	 */
29505d465952Smartinh 	xkey.data = key->data;
29515d465952Smartinh 	xkey.size = key->size;
29525d465952Smartinh 
295337ee45a1Smartinh 	if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) {
29545d465952Smartinh 		rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);
29555d465952Smartinh 	} else {
29565d465952Smartinh 		/* There is room already in this leaf page. */
29575d465952Smartinh 		remove_prefix(bt, &xkey, mp->prefix.len);
29585d465952Smartinh 		rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);
29595d465952Smartinh 	}
29605d465952Smartinh 
29615d465952Smartinh 	if (rc != BT_SUCCESS)
29625d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
29635d465952Smartinh 	else
296437ee45a1Smartinh 		bt->meta.entries++;
29655d465952Smartinh 
29665d465952Smartinh done:
29675d465952Smartinh 	if (close_txn) {
29685d465952Smartinh 		if (rc == BT_SUCCESS)
29695d465952Smartinh 			rc = btree_txn_commit(txn);
29705d465952Smartinh 		else
29715d465952Smartinh 			btree_txn_abort(txn);
29725d465952Smartinh 	}
29735d465952Smartinh 	mpage_prune(bt);
29745d465952Smartinh 	return rc;
29755d465952Smartinh }
29765d465952Smartinh 
29775d465952Smartinh static pgno_t
297837ee45a1Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc)
29795d465952Smartinh {
298037ee45a1Smartinh 	ssize_t		 rc;
29815d465952Smartinh 	indx_t		 i;
29825d465952Smartinh 	pgno_t		*pnext;
29835d465952Smartinh 	struct node	*node;
29845d465952Smartinh 	struct page	*p;
29855d465952Smartinh 	struct mpage	*mp;
29865d465952Smartinh 
298737ee45a1Smartinh 	/* Get the page and make a copy of it.
298837ee45a1Smartinh 	 */
298937ee45a1Smartinh 	if ((mp = btree_get_mpage(bt, pgno)) == NULL)
29905d465952Smartinh 		return P_INVALID;
299137ee45a1Smartinh 	if ((p = malloc(bt->head.psize)) == NULL)
299237ee45a1Smartinh 		return P_INVALID;
299337ee45a1Smartinh 	bcopy(mp->page, p, bt->head.psize);
29945d465952Smartinh 
299537ee45a1Smartinh         /* Go through all nodes in the (copied) page and update the
299637ee45a1Smartinh          * page pointers.
299737ee45a1Smartinh 	 */
29985d465952Smartinh 	if (F_ISSET(p->flags, P_BRANCH)) {
29995d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
30005d465952Smartinh 			node = NODEPTRP(p, i);
300137ee45a1Smartinh 			node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc);
300237ee45a1Smartinh 			if (node->n_pgno == P_INVALID) {
300337ee45a1Smartinh 				free(p);
30045d465952Smartinh 				return P_INVALID;
30055d465952Smartinh 			}
300637ee45a1Smartinh 		}
30075d465952Smartinh 	} else if (F_ISSET(p->flags, P_LEAF)) {
30085d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
30095d465952Smartinh 			node = NODEPTRP(p, i);
30105d465952Smartinh 			if (F_ISSET(node->flags, F_BIGDATA)) {
30115d465952Smartinh 				pnext = NODEDATA(node);
301237ee45a1Smartinh 				*pnext = btree_compact_tree(bt, *pnext, btc);
301337ee45a1Smartinh 				if (*pnext == P_INVALID) {
301437ee45a1Smartinh 					free(p);
30155d465952Smartinh 					return P_INVALID;
30165d465952Smartinh 				}
30175d465952Smartinh 			}
301837ee45a1Smartinh 		}
30195d465952Smartinh 	} else if (F_ISSET(p->flags, P_OVERFLOW)) {
30205d465952Smartinh 		pnext = &p->p_next_pgno;
30215d465952Smartinh 		if (*pnext > 0) {
302237ee45a1Smartinh 			*pnext = btree_compact_tree(bt, *pnext, btc);
302337ee45a1Smartinh 			if (*pnext == P_INVALID) {
302437ee45a1Smartinh 				free(p);
30255d465952Smartinh 				return P_INVALID;
30265d465952Smartinh 			}
302737ee45a1Smartinh 		}
30285d465952Smartinh 	} else
30295d465952Smartinh 		assert(0);
30305d465952Smartinh 
303137ee45a1Smartinh 	pgno = p->pgno = btc->txn->next_pgno++;
303237ee45a1Smartinh 	rc = write(btc->fd, p, bt->head.psize);
303337ee45a1Smartinh 	free(p);
3034*36e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize)
30355d465952Smartinh 		return P_INVALID;
303637ee45a1Smartinh 	return pgno;
30375d465952Smartinh }
30385d465952Smartinh 
30395d465952Smartinh int
30405d465952Smartinh btree_compact(struct btree *bt)
30415d465952Smartinh {
30425d465952Smartinh 	char			*compact_path = NULL;
304337ee45a1Smartinh 	struct btree		*btc;
304437ee45a1Smartinh 	struct btree_txn	*txn, *txnc = NULL;
304537ee45a1Smartinh 	int			 fd;
304637ee45a1Smartinh 	pgno_t			 root;
30475d465952Smartinh 
30485d465952Smartinh 	assert(bt != NULL);
30495d465952Smartinh 
30500279e526Smartinh 	DPRINTF("compacting btree %p with path %s", bt, bt->path);
30510279e526Smartinh 
30520279e526Smartinh 	if (bt->path == NULL) {
30530279e526Smartinh 		errno = EINVAL;
30540279e526Smartinh 		return BT_FAIL;
30550279e526Smartinh 	}
30560279e526Smartinh 
30570279e526Smartinh 	if ((txn = btree_txn_begin(bt, 0)) == NULL)
30585d465952Smartinh 		return BT_FAIL;
30595d465952Smartinh 
30605d465952Smartinh 	asprintf(&compact_path, "%s.compact.XXXXXX", bt->path);
30615d465952Smartinh 	fd = mkstemp(compact_path);
30625d465952Smartinh 	if (fd == -1) {
30635d465952Smartinh 		free(compact_path);
30640279e526Smartinh 		btree_txn_abort(txn);
30655d465952Smartinh 		return BT_FAIL;
30665d465952Smartinh 	}
30675d465952Smartinh 
306837ee45a1Smartinh 	if ((btc = btree_open_fd(fd, 0)) == NULL)
306937ee45a1Smartinh 		goto failed;
30705d465952Smartinh 
307137ee45a1Smartinh 	if ((txnc = btree_txn_begin(btc, 0)) == NULL)
307237ee45a1Smartinh 		goto failed;
307337ee45a1Smartinh 
307437ee45a1Smartinh 	if (bt->meta.root != P_INVALID) {
307537ee45a1Smartinh 		root = btree_compact_tree(bt, bt->meta.root, btc);
30765d465952Smartinh 		if (root == P_INVALID)
30775d465952Smartinh 			goto failed;
307837ee45a1Smartinh 		if (btree_write_meta(btc, root, 0) != BT_SUCCESS)
30795d465952Smartinh 			goto failed;
308037ee45a1Smartinh 	}
30815d465952Smartinh 
30825d465952Smartinh 	fsync(fd);
30835d465952Smartinh 
30845d465952Smartinh 	DPRINTF("renaming %s to %s", compact_path, bt->path);
30855d465952Smartinh 	if (rename(compact_path, bt->path) != 0)
30865d465952Smartinh 		goto failed;
30875d465952Smartinh 
3088e14f8e24Smartinh 	/* Write a "tombstone" meta page so other processes can pick up
3089e14f8e24Smartinh 	 * the change and re-open the file.
3090e14f8e24Smartinh 	 */
3091e14f8e24Smartinh 	if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS)
3092e14f8e24Smartinh 		goto failed;
30935d465952Smartinh 
3094e14f8e24Smartinh 	btree_txn_abort(txn);
309537ee45a1Smartinh 	btree_txn_abort(txnc);
30965d465952Smartinh 	free(compact_path);
309737ee45a1Smartinh 	btree_close(btc);
309837ee45a1Smartinh 	return 0;
30995d465952Smartinh 
31005d465952Smartinh failed:
31015d465952Smartinh 	btree_txn_abort(txn);
310237ee45a1Smartinh 	btree_txn_abort(txnc);
31035d465952Smartinh 	unlink(compact_path);
31045d465952Smartinh 	free(compact_path);
310537ee45a1Smartinh 	btree_close(btc);
31065d465952Smartinh 	return BT_FAIL;
31075d465952Smartinh }
31085d465952Smartinh 
31095d465952Smartinh /* Reverts the last change. Truncates the file at the last root page.
31105d465952Smartinh  */
31115d465952Smartinh int
31125d465952Smartinh btree_revert(struct btree *bt)
31135d465952Smartinh {
311437ee45a1Smartinh 	if (btree_read_meta(bt, NULL) != 0)
311537ee45a1Smartinh 		return -1;
31165d465952Smartinh 
311737ee45a1Smartinh 	DPRINTF("truncating file at page %u", bt->meta.root);
311837ee45a1Smartinh 	return ftruncate(bt->fd, bt->head.psize * bt->meta.root);
31195d465952Smartinh }
31205d465952Smartinh 
31215d465952Smartinh void
31225d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size)
31235d465952Smartinh {
31245d465952Smartinh 	bt->stat.max_cache = cache_size;
31255d465952Smartinh }
31265d465952Smartinh 
31275d465952Smartinh unsigned int
31285d465952Smartinh btree_get_flags(struct btree *bt)
31295d465952Smartinh {
31305d465952Smartinh 	return (bt->flags & ~BT_FIXPADDING);
31315d465952Smartinh }
31325d465952Smartinh 
31335d465952Smartinh const char *
31345d465952Smartinh btree_get_path(struct btree *bt)
31355d465952Smartinh {
31365d465952Smartinh 	return bt->path;
31375d465952Smartinh }
31385d465952Smartinh 
31395d465952Smartinh const struct btree_stat *
31405d465952Smartinh btree_stat(struct btree *bt)
31415d465952Smartinh {
314237ee45a1Smartinh 	bt->stat.branch_pages = bt->meta.branch_pages;
314337ee45a1Smartinh 	bt->stat.leaf_pages = bt->meta.leaf_pages;
314437ee45a1Smartinh 	bt->stat.overflow_pages = bt->meta.overflow_pages;
314537ee45a1Smartinh 	bt->stat.revisions = bt->meta.revisions;
314637ee45a1Smartinh 	bt->stat.depth = bt->meta.depth;
314737ee45a1Smartinh 	bt->stat.entries = bt->meta.entries;
314837ee45a1Smartinh 	bt->stat.psize = bt->head.psize;
314937ee45a1Smartinh 	bt->stat.created_at = bt->meta.created_at;
31505d465952Smartinh 
31515d465952Smartinh 	return &bt->stat;
31525d465952Smartinh }
31535d465952Smartinh 
3154