xref: /openbsd/usr.sbin/ldapd/btree.c (revision 3572a0e3)
1*3572a0e3Ssthen /*	$OpenBSD: btree.c,v 1.38 2017/05/26 21:23:14 sthen Exp $ */
25d465952Smartinh 
35d465952Smartinh /*
45d465952Smartinh  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
55d465952Smartinh  *
65d465952Smartinh  * Permission to use, copy, modify, and distribute this software for any
75d465952Smartinh  * purpose with or without fee is hereby granted, provided that the above
85d465952Smartinh  * copyright notice and this permission notice appear in all copies.
95d465952Smartinh  *
105d465952Smartinh  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
115d465952Smartinh  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
125d465952Smartinh  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
135d465952Smartinh  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
145d465952Smartinh  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
155d465952Smartinh  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
165d465952Smartinh  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
175d465952Smartinh  */
185d465952Smartinh 
195d465952Smartinh #include <sys/types.h>
205d465952Smartinh #include <sys/tree.h>
2137ee45a1Smartinh #include <sys/stat.h>
225d465952Smartinh #include <sys/queue.h>
235d465952Smartinh #include <sys/uio.h>
245d465952Smartinh 
255d465952Smartinh #include <assert.h>
265d465952Smartinh #include <err.h>
275d465952Smartinh #include <errno.h>
285d465952Smartinh #include <fcntl.h>
295d465952Smartinh #include <stddef.h>
305d465952Smartinh #include <stdint.h>
315d465952Smartinh #include <stdio.h>
325d465952Smartinh #include <stdlib.h>
335d465952Smartinh #include <string.h>
345d465952Smartinh #include <time.h>
355d465952Smartinh #include <unistd.h>
365d465952Smartinh 
375d465952Smartinh #include "btree.h"
385d465952Smartinh 
395d465952Smartinh /* #define DEBUG */
405d465952Smartinh 
415d465952Smartinh #ifdef DEBUG
425d465952Smartinh # define DPRINTF(...)	do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
435d465952Smartinh 			     fprintf(stderr, __VA_ARGS__); \
445d465952Smartinh 			     fprintf(stderr, "\n"); } while(0)
455d465952Smartinh #else
465d465952Smartinh # define DPRINTF(...)
475d465952Smartinh #endif
485d465952Smartinh 
495d465952Smartinh #define PAGESIZE	 4096
505d465952Smartinh #define BT_MINKEYS	 4
515d465952Smartinh #define BT_MAGIC	 0xB3DBB3DB
5237ee45a1Smartinh #define BT_VERSION	 4
535d465952Smartinh #define MAXKEYSIZE	 255
545d465952Smartinh 
555d465952Smartinh #define P_INVALID	 0xFFFFFFFF
565d465952Smartinh 
575d465952Smartinh #define F_ISSET(w, f)	 (((w) & (f)) == (f))
58*3572a0e3Ssthen #define MINIMUM(a,b)	 ((a) < (b) ? (a) : (b))
595d465952Smartinh 
60176c8cbcSmartinh typedef uint32_t	 pgno_t;
61176c8cbcSmartinh typedef uint16_t	 indx_t;
62176c8cbcSmartinh 
635d465952Smartinh /* There are four page types: meta, index, leaf and overflow.
645d465952Smartinh  * They all share the same page header.
655d465952Smartinh  */
665d465952Smartinh struct page {				/* represents an on-disk page */
6737ee45a1Smartinh 	pgno_t		 pgno;		/* page number */
685d465952Smartinh #define	P_BRANCH	 0x01		/* branch page */
695d465952Smartinh #define	P_LEAF		 0x02		/* leaf page */
705d465952Smartinh #define	P_OVERFLOW	 0x04		/* overflow page */
715d465952Smartinh #define	P_META		 0x08		/* meta page */
7237ee45a1Smartinh #define	P_HEAD		 0x10		/* header page */
735d465952Smartinh 	uint32_t	 flags;
745d465952Smartinh #define lower		 b.fb.fb_lower
755d465952Smartinh #define upper		 b.fb.fb_upper
765d465952Smartinh #define p_next_pgno	 b.pb_next_pgno
775d465952Smartinh 	union page_bounds {
785d465952Smartinh 		struct {
795d465952Smartinh 			indx_t	 fb_lower;	/* lower bound of free space */
805d465952Smartinh 			indx_t	 fb_upper;	/* upper bound of free space */
815d465952Smartinh 		} fb;
825d465952Smartinh 		pgno_t		 pb_next_pgno;	/* overflow page linked list */
835d465952Smartinh 	} b;
8437ee45a1Smartinh 	indx_t		 ptrs[1];		/* dynamic size */
85bf245c3bSmartinh } __packed;
865d465952Smartinh 
8737ee45a1Smartinh #define PAGEHDRSZ	 offsetof(struct page, ptrs)
885d465952Smartinh 
8937ee45a1Smartinh #define NUMKEYSP(p)	 (((p)->lower - PAGEHDRSZ) >> 1)
9037ee45a1Smartinh #define NUMKEYS(mp)	 (((mp)->page->lower - PAGEHDRSZ) >> 1)
9137ee45a1Smartinh #define SIZELEFT(mp)	 (indx_t)((mp)->page->upper - (mp)->page->lower)
92944ad5d4Smartinh #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \
9337ee45a1Smartinh 				((bt)->head.psize - PAGEHDRSZ))
9437ee45a1Smartinh #define IS_LEAF(mp)	 F_ISSET((mp)->page->flags, P_LEAF)
9537ee45a1Smartinh #define IS_BRANCH(mp)	 F_ISSET((mp)->page->flags, P_BRANCH)
9637ee45a1Smartinh #define IS_OVERFLOW(mp)	 F_ISSET((mp)->page->flags, P_OVERFLOW)
9737ee45a1Smartinh 
9837ee45a1Smartinh struct bt_head {				/* header page content */
9937ee45a1Smartinh 	uint32_t	 magic;
1005d465952Smartinh 	uint32_t	 version;
1015d465952Smartinh 	uint32_t	 flags;
1025d465952Smartinh 	uint32_t	 psize;			/* page size */
10337ee45a1Smartinh } __packed;
10437ee45a1Smartinh 
10537ee45a1Smartinh struct bt_meta {				/* meta (footer) page content */
10637ee45a1Smartinh #define BT_TOMBSTONE	 0x01			/* file is replaced */
10737ee45a1Smartinh 	uint32_t	 flags;
1085d465952Smartinh 	pgno_t		 root;			/* page number of root page */
1095d465952Smartinh 	pgno_t		 prev_meta;		/* previous meta page number */
1105d465952Smartinh 	time_t		 created_at;
1115d465952Smartinh 	uint32_t	 branch_pages;
1125d465952Smartinh 	uint32_t	 leaf_pages;
1135d465952Smartinh 	uint32_t	 overflow_pages;
1145d465952Smartinh 	uint32_t	 revisions;
1155d465952Smartinh 	uint32_t	 depth;
1165d465952Smartinh 	uint64_t	 entries;
1175d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
118bf245c3bSmartinh } __packed;
1195d465952Smartinh 
1205d465952Smartinh struct btkey {
1215d465952Smartinh 	size_t			 len;
1225d465952Smartinh 	char			 str[MAXKEYSIZE];
1235d465952Smartinh };
1245d465952Smartinh 
1255d465952Smartinh struct mpage {					/* an in-memory cached page */
1265d465952Smartinh 	RB_ENTRY(mpage)		 entry;		/* page cache entry */
1275d465952Smartinh 	SIMPLEQ_ENTRY(mpage)	 next;		/* queue of dirty pages */
1285d465952Smartinh 	TAILQ_ENTRY(mpage)	 lru_next;	/* LRU queue */
1295d465952Smartinh 	struct mpage		*parent;	/* NULL if root */
1305d465952Smartinh 	unsigned int		 parent_index;	/* keep track of node index */
1315d465952Smartinh 	struct btkey		 prefix;
13237ee45a1Smartinh 	struct page		*page;
1335d465952Smartinh 	pgno_t			 pgno;		/* copy of page->pgno */
1345d465952Smartinh 	short			 ref;		/* increased by cursors */
1355d465952Smartinh 	short			 dirty;		/* 1 if on dirty queue */
1365d465952Smartinh };
1375d465952Smartinh RB_HEAD(page_cache, mpage);
1385d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage);
1395d465952Smartinh TAILQ_HEAD(lru_queue, mpage);
1405d465952Smartinh 
1415d465952Smartinh static int		 mpage_cmp(struct mpage *a, struct mpage *b);
1425d465952Smartinh static struct mpage	*mpage_lookup(struct btree *bt, pgno_t pgno);
1435d465952Smartinh static void		 mpage_add(struct btree *bt, struct mpage *mp);
14437ee45a1Smartinh static void		 mpage_free(struct mpage *mp);
1455d465952Smartinh static void		 mpage_del(struct btree *bt, struct mpage *mp);
146e14f8e24Smartinh static void		 mpage_flush(struct btree *bt);
14737ee45a1Smartinh static struct mpage	*mpage_copy(struct btree *bt, struct mpage *mp);
1485d465952Smartinh static void		 mpage_prune(struct btree *bt);
1495d465952Smartinh static void		 mpage_dirty(struct btree *bt, struct mpage *mp);
1507f8a466aSmartinh static struct mpage	*mpage_touch(struct btree *bt, struct mpage *mp);
1515d465952Smartinh 
1525d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);
1535d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp);
1545d465952Smartinh 
1555d465952Smartinh struct ppage {					/* ordered list of pages */
1565d465952Smartinh 	SLIST_ENTRY(ppage)	 entry;
1575d465952Smartinh 	struct mpage		*mpage;
1585d465952Smartinh 	unsigned int		 ki;		/* cursor index on page */
1595d465952Smartinh };
1605d465952Smartinh SLIST_HEAD(page_stack, ppage);
1615d465952Smartinh 
1625d465952Smartinh #define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->stack)
1635d465952Smartinh #define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->stack)
1645d465952Smartinh #define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->stack, entry)
1655d465952Smartinh #define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->stack, p, entry)
1665d465952Smartinh 
1675d465952Smartinh struct cursor {
1685d465952Smartinh 	struct btree		*bt;
1695d465952Smartinh 	struct btree_txn	*txn;
1705d465952Smartinh 	struct page_stack	 stack;		/* stack of parent pages */
1715d465952Smartinh 	short			 initialized;	/* 1 if initialized */
1725d465952Smartinh 	short			 eof;		/* 1 if end is reached */
1735d465952Smartinh };
1745d465952Smartinh 
1755d465952Smartinh #define METAHASHLEN	 offsetof(struct bt_meta, hash)
17637ee45a1Smartinh #define METADATA(p)	 ((void *)((char *)p + PAGEHDRSZ))
1775d465952Smartinh 
1785d465952Smartinh struct node {
1795d465952Smartinh #define n_pgno		 p.np_pgno
1805d465952Smartinh #define n_dsize		 p.np_dsize
1815d465952Smartinh 	union {
1825d465952Smartinh 		pgno_t		 np_pgno;	/* child page number */
1835d465952Smartinh 		uint32_t	 np_dsize;	/* leaf data size */
1845d465952Smartinh 	}		 p;
1855d465952Smartinh 	uint16_t	 ksize;			/* key size */
1865d465952Smartinh #define F_BIGDATA	 0x01			/* data put on overflow page */
1875d465952Smartinh 	uint8_t		 flags;
188a33d639aSmartinh 	char		 data[1];
189bf245c3bSmartinh } __packed;
1905d465952Smartinh 
1915d465952Smartinh struct btree_txn {
1925d465952Smartinh 	pgno_t			 root;		/* current / new root page */
1935d465952Smartinh 	pgno_t			 next_pgno;	/* next unallocated page */
1945d465952Smartinh 	struct btree		*bt;		/* btree is ref'd */
1955d465952Smartinh 	struct dirty_queue	*dirty_queue;	/* modified pages */
1965d465952Smartinh #define BT_TXN_RDONLY		 0x01		/* read-only transaction */
1975d465952Smartinh #define BT_TXN_ERROR		 0x02		/* an error has occurred */
1985d465952Smartinh 	unsigned int		 flags;
1995d465952Smartinh };
2005d465952Smartinh 
2015d465952Smartinh struct btree {
2025d465952Smartinh 	int			 fd;
2035d465952Smartinh 	char			*path;
2045d465952Smartinh #define BT_FIXPADDING		 0x01		/* internal */
2055d465952Smartinh 	unsigned int		 flags;
2065d465952Smartinh 	bt_cmp_func		 cmp;		/* user compare function */
20737ee45a1Smartinh 	struct bt_head		 head;
20837ee45a1Smartinh 	struct bt_meta		 meta;
2095d465952Smartinh 	struct page_cache	*page_cache;
2105d465952Smartinh 	struct lru_queue	*lru_queue;
2115d465952Smartinh 	struct btree_txn	*txn;		/* current write transaction */
212e14f8e24Smartinh 	int			 ref;		/* increased by cursors & txn */
2135d465952Smartinh 	struct btree_stat	 stat;
2145d465952Smartinh 	off_t			 size;		/* current file size */
2155d465952Smartinh };
2165d465952Smartinh 
2175d465952Smartinh #define NODESIZE	 offsetof(struct node, data)
2185d465952Smartinh 
2195d465952Smartinh #define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->size))
2205d465952Smartinh #define LEAFSIZE(k, d)	 (NODESIZE + (k)->size + (d)->size)
2215d465952Smartinh #define NODEPTRP(p, i)	 ((struct node *)((char *)(p) + (p)->ptrs[i]))
22237ee45a1Smartinh #define NODEPTR(mp, i)	 NODEPTRP((mp)->page, i)
2235d465952Smartinh #define NODEKEY(node)	 (void *)((node)->data)
2245d465952Smartinh #define NODEDATA(node)	 (void *)((char *)(node)->data + (node)->ksize)
2255d465952Smartinh #define NODEPGNO(node)	 ((node)->p.np_pgno)
2265d465952Smartinh #define NODEDSZ(node)	 ((node)->p.np_dsize)
2275d465952Smartinh 
2285d465952Smartinh #define BT_COMMIT_PAGES	 64	/* max number of pages to write in one commit */
2295d465952Smartinh #define BT_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */
2305d465952Smartinh 
2315d465952Smartinh static int		 btree_read_page(struct btree *bt, pgno_t pgno,
2325d465952Smartinh 			    struct page *page);
23337ee45a1Smartinh static struct mpage	*btree_get_mpage(struct btree *bt, pgno_t pgno);
2345d465952Smartinh static int		 btree_search_page_root(struct btree *bt,
2355d465952Smartinh 			    struct mpage *root, struct btval *key,
2365d465952Smartinh 			    struct cursor *cursor, int modify,
2375d465952Smartinh 			    struct mpage **mpp);
2385d465952Smartinh static int		 btree_search_page(struct btree *bt,
2395d465952Smartinh 			    struct btree_txn *txn, struct btval *key,
2405d465952Smartinh 			    struct cursor *cursor, int modify,
2415d465952Smartinh 			    struct mpage **mpp);
2425d465952Smartinh 
24337ee45a1Smartinh static int		 btree_write_header(struct btree *bt, int fd);
24437ee45a1Smartinh static int		 btree_read_header(struct btree *bt);
2455d465952Smartinh static int		 btree_is_meta_page(struct page *p);
2465d465952Smartinh static int		 btree_read_meta(struct btree *bt, pgno_t *p_next);
247e14f8e24Smartinh static int		 btree_write_meta(struct btree *bt, pgno_t root,
248e14f8e24Smartinh 			    unsigned int flags);
249408be415Smartinh static void		 btree_ref(struct btree *bt);
2505d465952Smartinh 
2515d465952Smartinh static struct node	*btree_search_node(struct btree *bt, struct mpage *mp,
2525d465952Smartinh 			    struct btval *key, int *exactp, unsigned int *kip);
2535d465952Smartinh static int		 btree_add_node(struct btree *bt, struct mpage *mp,
2545d465952Smartinh 			    indx_t indx, struct btval *key, struct btval *data,
2555d465952Smartinh 			    pgno_t pgno, uint8_t flags);
2565d465952Smartinh static void		 btree_del_node(struct btree *bt, struct mpage *mp,
2575d465952Smartinh 			    indx_t indx);
2585d465952Smartinh static int		 btree_read_data(struct btree *bt, struct mpage *mp,
2595d465952Smartinh 			    struct node *leaf, struct btval *data);
2605d465952Smartinh 
2615d465952Smartinh static int		 btree_rebalance(struct btree *bt, struct mpage *mp);
2625d465952Smartinh static int		 btree_update_key(struct btree *bt, struct mpage *mp,
2635d465952Smartinh 			    indx_t indx, struct btval *key);
2645d465952Smartinh static int		 btree_adjust_prefix(struct btree *bt,
2655d465952Smartinh 			    struct mpage *src, int delta);
2665d465952Smartinh static int		 btree_move_node(struct btree *bt, struct mpage *src,
2675d465952Smartinh 			    indx_t srcindx, struct mpage *dst, indx_t dstindx);
2685d465952Smartinh static int		 btree_merge(struct btree *bt, struct mpage *src,
2695d465952Smartinh 			    struct mpage *dst);
2705d465952Smartinh static int		 btree_split(struct btree *bt, struct mpage **mpp,
2715d465952Smartinh 			    unsigned int *newindxp, struct btval *newkey,
2725d465952Smartinh 			    struct btval *newdata, pgno_t newpgno);
2735d465952Smartinh static struct mpage	*btree_new_page(struct btree *bt, uint32_t flags);
2745d465952Smartinh static int		 btree_write_overflow_data(struct btree *bt,
2755d465952Smartinh 			    struct page *p, struct btval *data);
2765d465952Smartinh 
2775d465952Smartinh static void		 cursor_pop_page(struct cursor *cursor);
2785d465952Smartinh static struct ppage	 *cursor_push_page(struct cursor *cursor,
2795d465952Smartinh 			    struct mpage *mp);
2805d465952Smartinh 
2815d465952Smartinh static int		 bt_set_key(struct btree *bt, struct mpage *mp,
2825d465952Smartinh 			    struct node *node, struct btval *key);
2835d465952Smartinh static int		 btree_sibling(struct cursor *cursor, int move_right);
2845d465952Smartinh static int		 btree_cursor_next(struct cursor *cursor,
2855d465952Smartinh 			    struct btval *key, struct btval *data);
2865d465952Smartinh static int		 btree_cursor_set(struct cursor *cursor,
2875cea9f8fSmartinh 			    struct btval *key, struct btval *data, int *exactp);
2885d465952Smartinh static int		 btree_cursor_first(struct cursor *cursor,
2895d465952Smartinh 			    struct btval *key, struct btval *data);
2905d465952Smartinh 
2915d465952Smartinh static void		 bt_reduce_separator(struct btree *bt, struct node *min,
2925d465952Smartinh 			    struct btval *sep);
2935d465952Smartinh static void		 remove_prefix(struct btree *bt, struct btval *key,
2945d465952Smartinh 			    size_t pfxlen);
2955d465952Smartinh static void		 expand_prefix(struct btree *bt, struct mpage *mp,
2965d465952Smartinh 			    indx_t indx, struct btkey *expkey);
2975d465952Smartinh static void		 concat_prefix(struct btree *bt, char *s1, size_t n1,
2985d465952Smartinh 			    char *s2, size_t n2, char *cs, size_t *cn);
2995d465952Smartinh static void		 common_prefix(struct btree *bt, struct btkey *min,
3005d465952Smartinh 			    struct btkey *max, struct btkey *pfx);
3015d465952Smartinh static void		 find_common_prefix(struct btree *bt, struct mpage *mp);
3025d465952Smartinh 
30337ee45a1Smartinh static size_t		 bt_leaf_size(struct btree *bt, struct btval *key,
30437ee45a1Smartinh 			    struct btval *data);
30537ee45a1Smartinh static size_t		 bt_branch_size(struct btree *bt, struct btval *key);
3065d465952Smartinh 
3075d465952Smartinh static pgno_t		 btree_compact_tree(struct btree *bt, pgno_t pgno,
30837ee45a1Smartinh 			    struct btree *btc);
3095d465952Smartinh 
3105d465952Smartinh static int		 memncmp(const void *s1, size_t n1,
3115d465952Smartinh 				 const void *s2, size_t n2);
3125d465952Smartinh static int		 memnrcmp(const void *s1, size_t n1,
3135d465952Smartinh 				  const void *s2, size_t n2);
3145d465952Smartinh 
3155d465952Smartinh static int
memncmp(const void * s1,size_t n1,const void * s2,size_t n2)3165d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
3175d465952Smartinh {
3185d465952Smartinh 	if (n1 < n2) {
3195d465952Smartinh 		if (memcmp(s1, s2, n1) == 0)
3205d465952Smartinh 			return -1;
3215d465952Smartinh 	}
3225d465952Smartinh 	else if (n1 > n2) {
3235d465952Smartinh 		if (memcmp(s1, s2, n2) == 0)
3245d465952Smartinh 			return 1;
3255d465952Smartinh 	}
3265d465952Smartinh 	return memcmp(s1, s2, n1);
3275d465952Smartinh }
3285d465952Smartinh 
3295d465952Smartinh static int
memnrcmp(const void * s1,size_t n1,const void * s2,size_t n2)3305d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
3315d465952Smartinh {
3325d465952Smartinh 	const unsigned char	*p1;
3335d465952Smartinh 	const unsigned char	*p2;
3345d465952Smartinh 
3355d465952Smartinh 	if (n1 == 0)
3365d465952Smartinh 		return n2 == 0 ? 0 : -1;
3375d465952Smartinh 
3385d465952Smartinh 	if (n2 == 0)
3395d465952Smartinh 		return n1 == 0 ? 0 : 1;
3405d465952Smartinh 
3415d465952Smartinh 	p1 = (const unsigned char *)s1 + n1 - 1;
3425d465952Smartinh 	p2 = (const unsigned char *)s2 + n2 - 1;
3435d465952Smartinh 
3445d465952Smartinh 	while (*p1 == *p2) {
3455d465952Smartinh 		if (p1 == s1)
3465d465952Smartinh 			return (p2 == s2) ? 0 : -1;
3475d465952Smartinh 		if (p2 == s2)
3485d465952Smartinh 			return (p1 == p2) ? 0 : 1;
3495d465952Smartinh 		p1--;
3505d465952Smartinh 		p2--;
3515d465952Smartinh 	}
3525d465952Smartinh 	return *p1 - *p2;
3535d465952Smartinh }
3545d465952Smartinh 
3555d465952Smartinh int
btree_cmp(struct btree * bt,const struct btval * a,const struct btval * b)3565d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)
3575d465952Smartinh {
3585d465952Smartinh 	return bt->cmp(a, b);
3595d465952Smartinh }
3605d465952Smartinh 
3615d465952Smartinh static void
common_prefix(struct btree * bt,struct btkey * min,struct btkey * max,struct btkey * pfx)3625d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,
3635d465952Smartinh     struct btkey *pfx)
3645d465952Smartinh {
3655d465952Smartinh 	size_t		 n = 0;
3665d465952Smartinh 	char		*p1;
3675d465952Smartinh 	char		*p2;
3685d465952Smartinh 
3695d465952Smartinh 	if (min->len == 0 || max->len == 0) {
3705d465952Smartinh 		pfx->len = 0;
3715d465952Smartinh 		return;
3725d465952Smartinh 	}
3735d465952Smartinh 
3745d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
3755d465952Smartinh 		p1 = min->str + min->len - 1;
3765d465952Smartinh 		p2 = max->str + max->len - 1;
3775d465952Smartinh 
3785d465952Smartinh 		while (*p1 == *p2) {
3795d465952Smartinh 			if (p1 < min->str || p2 < max->str)
3805d465952Smartinh 				break;
3815d465952Smartinh 			p1--;
3825d465952Smartinh 			p2--;
3835d465952Smartinh 			n++;
3845d465952Smartinh 		}
3855d465952Smartinh 
3865d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
3875d465952Smartinh 		pfx->len = n;
3885d465952Smartinh 		bcopy(p2 + 1, pfx->str, n);
3895d465952Smartinh 	} else {
3905d465952Smartinh 		p1 = min->str;
3915d465952Smartinh 		p2 = max->str;
3925d465952Smartinh 
3935d465952Smartinh 		while (*p1 == *p2) {
3945d465952Smartinh 			if (n == min->len || n == max->len)
3955d465952Smartinh 				break;
3965d465952Smartinh 			p1++;
3975d465952Smartinh 			p2++;
3985d465952Smartinh 			n++;
3995d465952Smartinh 		}
4005d465952Smartinh 
4015d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
4025d465952Smartinh 		pfx->len = n;
4035d465952Smartinh 		bcopy(max->str, pfx->str, n);
4045d465952Smartinh 	}
4055d465952Smartinh }
4065d465952Smartinh 
4075d465952Smartinh static void
remove_prefix(struct btree * bt,struct btval * key,size_t pfxlen)4085d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)
4095d465952Smartinh {
4105d465952Smartinh 	if (pfxlen == 0 || bt->cmp != NULL)
4115d465952Smartinh 		return;
4125d465952Smartinh 
4135d465952Smartinh 	DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen,
4145d465952Smartinh 	    (int)key->size, (char *)key->data);
4155d465952Smartinh 	assert(pfxlen <= key->size);
4165d465952Smartinh 	key->size -= pfxlen;
4175d465952Smartinh 	if (!F_ISSET(bt->flags, BT_REVERSEKEY))
4185d465952Smartinh 		key->data = (char *)key->data + pfxlen;
4195d465952Smartinh }
4205d465952Smartinh 
4215d465952Smartinh static void
expand_prefix(struct btree * bt,struct mpage * mp,indx_t indx,struct btkey * expkey)4225d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,
4235d465952Smartinh     struct btkey *expkey)
4245d465952Smartinh {
4255d465952Smartinh 	struct node	*node;
4265d465952Smartinh 
4275d465952Smartinh 	node = NODEPTR(mp, indx);
4285d465952Smartinh 	expkey->len = sizeof(expkey->str);
4295d465952Smartinh 	concat_prefix(bt, mp->prefix.str, mp->prefix.len,
4305d465952Smartinh 	    NODEKEY(node), node->ksize, expkey->str, &expkey->len);
4315d465952Smartinh }
4325d465952Smartinh 
4335d465952Smartinh static int
bt_cmp(struct btree * bt,const struct btval * key1,const struct btval * key2,struct btkey * pfx)4345d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,
4355d465952Smartinh     struct btkey *pfx)
4365d465952Smartinh {
4375d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY))
4385d465952Smartinh 		return memnrcmp(key1->data, key1->size - pfx->len,
4395d465952Smartinh 		    key2->data, key2->size);
4405d465952Smartinh 	else
4415d465952Smartinh 		return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,
4425d465952Smartinh 		    key2->data, key2->size);
4435d465952Smartinh }
4445d465952Smartinh 
4455d465952Smartinh void
btval_reset(struct btval * btv)4465d465952Smartinh btval_reset(struct btval *btv)
4475d465952Smartinh {
4485d465952Smartinh 	if (btv) {
4495d465952Smartinh 		if (btv->mp)
4505d465952Smartinh 			btv->mp->ref--;
4515d465952Smartinh 		if (btv->free_data)
4525d465952Smartinh 			free(btv->data);
45337f4b933Smmcc 		memset(btv, 0, sizeof(*btv));
4545d465952Smartinh 	}
4555d465952Smartinh }
4565d465952Smartinh 
4575d465952Smartinh static int
mpage_cmp(struct mpage * a,struct mpage * b)4585d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b)
4595d465952Smartinh {
4605d465952Smartinh 	if (a->pgno > b->pgno)
4615d465952Smartinh 		return 1;
4625d465952Smartinh 	if (a->pgno < b->pgno)
4635d465952Smartinh 		return -1;
4645d465952Smartinh 	return 0;
4655d465952Smartinh }
4665d465952Smartinh 
4675d465952Smartinh static struct mpage *
mpage_lookup(struct btree * bt,pgno_t pgno)4685d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno)
4695d465952Smartinh {
4705d465952Smartinh 	struct mpage	 find, *mp;
4715d465952Smartinh 
4725d465952Smartinh 	find.pgno = pgno;
4735d465952Smartinh 	mp = RB_FIND(page_cache, bt->page_cache, &find);
4745d465952Smartinh 	if (mp) {
4755d465952Smartinh 		bt->stat.hits++;
4765d465952Smartinh 		/* Update LRU queue. Move page to the end. */
4775d465952Smartinh 		TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
4785d465952Smartinh 		TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
4795d465952Smartinh 	}
4805d465952Smartinh 	return mp;
4815d465952Smartinh }
4825d465952Smartinh 
4835d465952Smartinh static void
mpage_add(struct btree * bt,struct mpage * mp)4845d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp)
4855d465952Smartinh {
4865d465952Smartinh 	assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL);
4875d465952Smartinh 	bt->stat.cache_size++;
4885d465952Smartinh 	TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
48937ee45a1Smartinh }
49037ee45a1Smartinh 
49137ee45a1Smartinh static void
mpage_free(struct mpage * mp)49237ee45a1Smartinh mpage_free(struct mpage *mp)
49337ee45a1Smartinh {
49437ee45a1Smartinh 	if (mp != NULL) {
49537ee45a1Smartinh 		free(mp->page);
49637ee45a1Smartinh 		free(mp);
49737ee45a1Smartinh 	}
4985d465952Smartinh }
4995d465952Smartinh 
5005d465952Smartinh static void
mpage_del(struct btree * bt,struct mpage * mp)5015d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp)
5025d465952Smartinh {
5035d465952Smartinh 	assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp);
5045d465952Smartinh 	assert(bt->stat.cache_size > 0);
5055d465952Smartinh 	bt->stat.cache_size--;
5065d465952Smartinh 	TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
5075d465952Smartinh }
5085d465952Smartinh 
509e14f8e24Smartinh static void
mpage_flush(struct btree * bt)510e14f8e24Smartinh mpage_flush(struct btree *bt)
511e14f8e24Smartinh {
512e14f8e24Smartinh 	struct mpage	*mp;
513e14f8e24Smartinh 
514e14f8e24Smartinh 	while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) {
515e14f8e24Smartinh 		mpage_del(bt, mp);
51637ee45a1Smartinh 		mpage_free(mp);
517e14f8e24Smartinh 	}
518e14f8e24Smartinh }
519e14f8e24Smartinh 
5205d465952Smartinh static struct mpage *
mpage_copy(struct btree * bt,struct mpage * mp)52137ee45a1Smartinh mpage_copy(struct btree *bt, struct mpage *mp)
5225d465952Smartinh {
5235d465952Smartinh 	struct mpage	*copy;
5245d465952Smartinh 
5255d465952Smartinh 	if ((copy = calloc(1, sizeof(*copy))) == NULL)
5265d465952Smartinh 		return NULL;
52737ee45a1Smartinh 	if ((copy->page = malloc(bt->head.psize)) == NULL) {
52837ee45a1Smartinh 		free(copy);
52937ee45a1Smartinh 		return NULL;
53037ee45a1Smartinh 	}
53137ee45a1Smartinh 	bcopy(mp->page, copy->page, bt->head.psize);
5325d465952Smartinh 	bcopy(&mp->prefix, &copy->prefix, sizeof(mp->prefix));
5335d465952Smartinh 	copy->parent = mp->parent;
5345d465952Smartinh 	copy->parent_index = mp->parent_index;
5355d465952Smartinh 	copy->pgno = mp->pgno;
5365d465952Smartinh 
5375d465952Smartinh 	return copy;
5385d465952Smartinh }
5395d465952Smartinh 
5405d465952Smartinh /* Remove the least recently used memory pages until the cache size is
5415d465952Smartinh  * within the configured bounds. Pages referenced by cursors or returned
5425d465952Smartinh  * key/data are not pruned.
5435d465952Smartinh  */
5445d465952Smartinh static void
mpage_prune(struct btree * bt)5455d465952Smartinh mpage_prune(struct btree *bt)
5465d465952Smartinh {
5475d465952Smartinh 	struct mpage	*mp, *next;
5485d465952Smartinh 
5495d465952Smartinh 	for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) {
5505d465952Smartinh 		if (bt->stat.cache_size <= bt->stat.max_cache)
5515d465952Smartinh 			break;
5525d465952Smartinh 		next = TAILQ_NEXT(mp, lru_next);
5535d465952Smartinh 		if (!mp->dirty && mp->ref <= 0) {
5545d465952Smartinh 			mpage_del(bt, mp);
55537ee45a1Smartinh 			mpage_free(mp);
5565d465952Smartinh 		}
5575d465952Smartinh 	}
5585d465952Smartinh }
5595d465952Smartinh 
5605d465952Smartinh /* Mark a page as dirty and push it on the dirty queue.
5615d465952Smartinh  */
5625d465952Smartinh static void
mpage_dirty(struct btree * bt,struct mpage * mp)5635d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp)
5645d465952Smartinh {
5655d465952Smartinh 	assert(bt != NULL);
5665d465952Smartinh 	assert(bt->txn != NULL);
5675d465952Smartinh 
5685d465952Smartinh 	if (!mp->dirty) {
5695d465952Smartinh 		mp->dirty = 1;
5705d465952Smartinh 		SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
5715d465952Smartinh 	}
5725d465952Smartinh }
5735d465952Smartinh 
5745d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno.
5755d465952Smartinh  */
5767f8a466aSmartinh static struct mpage *
mpage_touch(struct btree * bt,struct mpage * mp)5775d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp)
5785d465952Smartinh {
5795d465952Smartinh 	assert(bt != NULL);
5805d465952Smartinh 	assert(bt->txn != NULL);
5815d465952Smartinh 	assert(mp != NULL);
5825d465952Smartinh 
5835d465952Smartinh 	if (!mp->dirty) {
5845d465952Smartinh 		DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
5855d465952Smartinh 		if (mp->ref == 0)
5865d465952Smartinh 			mpage_del(bt, mp);
5877f8a466aSmartinh 		else {
5887f8a466aSmartinh 			if ((mp = mpage_copy(bt, mp)) == NULL)
5897f8a466aSmartinh 				return NULL;
5907f8a466aSmartinh 		}
59137ee45a1Smartinh 		mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
5925d465952Smartinh 		mpage_dirty(bt, mp);
5935d465952Smartinh 		mpage_add(bt, mp);
5945d465952Smartinh 
5955d465952Smartinh 		/* Update the page number to new touched page. */
5965d465952Smartinh 		if (mp->parent != NULL)
5975d465952Smartinh 			NODEPGNO(NODEPTR(mp->parent,
5985d465952Smartinh 			    mp->parent_index)) = mp->pgno;
5995d465952Smartinh 	}
6007f8a466aSmartinh 
6017f8a466aSmartinh 	return mp;
6025d465952Smartinh }
6035d465952Smartinh 
6045d465952Smartinh static int
btree_read_page(struct btree * bt,pgno_t pgno,struct page * page)6055d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
6065d465952Smartinh {
6075d465952Smartinh 	ssize_t		 rc;
6085d465952Smartinh 
6095d465952Smartinh 	DPRINTF("reading page %u", pgno);
6105d465952Smartinh 	bt->stat.reads++;
61137ee45a1Smartinh 	if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) {
6125d465952Smartinh 		DPRINTF("page %u doesn't exist", pgno);
6130279e526Smartinh 		errno = ENOENT;
6140279e526Smartinh 		return BT_FAIL;
61536e226d9Smartinh 	} else if (rc != (ssize_t)bt->head.psize) {
61637ee45a1Smartinh 		if (rc > 0)
61737ee45a1Smartinh 			errno = EINVAL;
6185d465952Smartinh 		DPRINTF("read: %s", strerror(errno));
6195d465952Smartinh 		return BT_FAIL;
6205d465952Smartinh 	}
6215d465952Smartinh 
6225d465952Smartinh 	if (page->pgno != pgno) {
6235d465952Smartinh 		DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
6245d465952Smartinh 		errno = EINVAL;
6255d465952Smartinh 		return BT_FAIL;
6265d465952Smartinh 	}
6275d465952Smartinh 
62837ee45a1Smartinh 	DPRINTF("page %u has flags 0x%X", pgno, page->flags);
62937ee45a1Smartinh 
6305d465952Smartinh 	return BT_SUCCESS;
6315d465952Smartinh }
6325d465952Smartinh 
6335d465952Smartinh int
btree_sync(struct btree * bt)6345d465952Smartinh btree_sync(struct btree *bt)
6355d465952Smartinh {
6365d465952Smartinh 	if (!F_ISSET(bt->flags, BT_NOSYNC))
6375d465952Smartinh 		return fsync(bt->fd);
6385d465952Smartinh 	return 0;
6395d465952Smartinh }
6405d465952Smartinh 
6415d465952Smartinh struct btree_txn *
btree_txn_begin(struct btree * bt,int rdonly)6425d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly)
6435d465952Smartinh {
6445d465952Smartinh 	struct btree_txn	*txn;
6455d465952Smartinh 
6465d465952Smartinh 	if (!rdonly && bt->txn != NULL) {
6475d465952Smartinh 		DPRINTF("write transaction already begun");
6480279e526Smartinh 		errno = EBUSY;
6495d465952Smartinh 		return NULL;
6505d465952Smartinh 	}
6515d465952Smartinh 
6525d465952Smartinh 	if ((txn = calloc(1, sizeof(*txn))) == NULL) {
6535d465952Smartinh 		DPRINTF("calloc: %s", strerror(errno));
6545d465952Smartinh 		return NULL;
6555d465952Smartinh 	}
6565d465952Smartinh 
6575d465952Smartinh 	if (rdonly) {
6585d465952Smartinh 		txn->flags |= BT_TXN_RDONLY;
6595d465952Smartinh 	} else {
6605d465952Smartinh 		txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
6615d465952Smartinh 		if (txn->dirty_queue == NULL) {
6625d465952Smartinh 			free(txn);
6635d465952Smartinh 			return NULL;
6645d465952Smartinh 		}
6655d465952Smartinh 		SIMPLEQ_INIT(txn->dirty_queue);
6665d465952Smartinh 
6675d465952Smartinh 		DPRINTF("taking write lock on txn %p", txn);
6685d465952Smartinh 		if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
6690279e526Smartinh 			DPRINTF("flock: %s", strerror(errno));
6700279e526Smartinh 			errno = EBUSY;
6715d465952Smartinh 			free(txn->dirty_queue);
6725d465952Smartinh 			free(txn);
6735d465952Smartinh 			return NULL;
6745d465952Smartinh 		}
6755d465952Smartinh 		bt->txn = txn;
6765d465952Smartinh 	}
6775d465952Smartinh 
6785d465952Smartinh 	txn->bt = bt;
679408be415Smartinh 	btree_ref(bt);
6805d465952Smartinh 
6810279e526Smartinh 	if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
6825d465952Smartinh 		btree_txn_abort(txn);
6835d465952Smartinh 		return NULL;
6845d465952Smartinh 	}
6855d465952Smartinh 
68637ee45a1Smartinh 	txn->root = bt->meta.root;
6875d465952Smartinh 	DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);
6885d465952Smartinh 
6895d465952Smartinh 	return txn;
6905d465952Smartinh }
6915d465952Smartinh 
6925d465952Smartinh void
btree_txn_abort(struct btree_txn * txn)6935d465952Smartinh btree_txn_abort(struct btree_txn *txn)
6945d465952Smartinh {
6955d465952Smartinh 	struct mpage	*mp;
6965d465952Smartinh 	struct btree	*bt;
6975d465952Smartinh 
6985d465952Smartinh 	if (txn == NULL)
6995d465952Smartinh 		return;
7005d465952Smartinh 
7015d465952Smartinh 	bt = txn->bt;
7025d465952Smartinh 	DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);
7035d465952Smartinh 
704f189d82aSmartinh 	if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7055d465952Smartinh 		/* Discard all dirty pages.
7065d465952Smartinh 		 */
7075d465952Smartinh 		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
7085d465952Smartinh 			mp = SIMPLEQ_FIRST(txn->dirty_queue);
7095d465952Smartinh 			assert(mp->ref == 0);	/* cursors should be closed */
7105d465952Smartinh 			mpage_del(bt, mp);
7115d465952Smartinh 			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
71246fd5648Smartinh 			mpage_free(mp);
7135d465952Smartinh 		}
7145d465952Smartinh 
7155d465952Smartinh 		DPRINTF("releasing write lock on txn %p", txn);
7165d465952Smartinh 		txn->bt->txn = NULL;
7175d465952Smartinh 		if (flock(txn->bt->fd, LOCK_UN) != 0) {
7185d465952Smartinh 			DPRINTF("failed to unlock fd %d: %s",
7195d465952Smartinh 			    txn->bt->fd, strerror(errno));
7205d465952Smartinh 		}
7215d465952Smartinh 		free(txn->dirty_queue);
7225d465952Smartinh 	}
7235d465952Smartinh 
7245d465952Smartinh 	btree_close(txn->bt);
7255d465952Smartinh 	free(txn);
7265d465952Smartinh }
7275d465952Smartinh 
7285d465952Smartinh int
btree_txn_commit(struct btree_txn * txn)7295d465952Smartinh btree_txn_commit(struct btree_txn *txn)
7305d465952Smartinh {
7315d465952Smartinh 	int		 n, done;
7325d465952Smartinh 	ssize_t		 rc;
7335d465952Smartinh 	off_t		 size;
7345d465952Smartinh 	struct mpage	*mp;
7355d465952Smartinh 	struct btree	*bt;
7365d465952Smartinh 	struct iovec	 iov[BT_COMMIT_PAGES];
7375d465952Smartinh 
7385d465952Smartinh 	assert(txn != NULL);
7395d465952Smartinh 	assert(txn->bt != NULL);
7405d465952Smartinh 
7415d465952Smartinh 	bt = txn->bt;
7425d465952Smartinh 
7435d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7445d465952Smartinh 		DPRINTF("attempt to commit read-only transaction");
7455d465952Smartinh 		btree_txn_abort(txn);
7460279e526Smartinh 		errno = EPERM;
7475d465952Smartinh 		return BT_FAIL;
7485d465952Smartinh 	}
7495d465952Smartinh 
7505d465952Smartinh 	if (txn != bt->txn) {
7515d465952Smartinh 		DPRINTF("attempt to commit unknown transaction");
7525d465952Smartinh 		btree_txn_abort(txn);
7530279e526Smartinh 		errno = EINVAL;
7545d465952Smartinh 		return BT_FAIL;
7555d465952Smartinh 	}
7565d465952Smartinh 
7575d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
7585d465952Smartinh 		DPRINTF("error flag is set, can't commit");
7595d465952Smartinh 		btree_txn_abort(txn);
7600279e526Smartinh 		errno = EINVAL;
7615d465952Smartinh 		return BT_FAIL;
7625d465952Smartinh 	}
7635d465952Smartinh 
7645d465952Smartinh 	if (SIMPLEQ_EMPTY(txn->dirty_queue))
7655d465952Smartinh 		goto done;
7665d465952Smartinh 
7675d465952Smartinh 	if (F_ISSET(bt->flags, BT_FIXPADDING)) {
7685d465952Smartinh 		size = lseek(bt->fd, 0, SEEK_END);
76937ee45a1Smartinh 		size += bt->head.psize - (size % bt->head.psize);
7705d465952Smartinh 		DPRINTF("extending to multiple of page size: %llu", size);
7715d465952Smartinh 		if (ftruncate(bt->fd, size) != 0) {
7725d465952Smartinh 			DPRINTF("ftruncate: %s", strerror(errno));
7735d465952Smartinh 			btree_txn_abort(txn);
7745d465952Smartinh 			return BT_FAIL;
7755d465952Smartinh 		}
7765d465952Smartinh 		bt->flags &= ~BT_FIXPADDING;
7775d465952Smartinh 	}
7785d465952Smartinh 
7795d465952Smartinh 	DPRINTF("committing transaction on btree %p, root page %u",
7805d465952Smartinh 	    bt, txn->root);
7815d465952Smartinh 
7825d465952Smartinh 	/* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
7835d465952Smartinh 	 */
7845d465952Smartinh 	do {
7855d465952Smartinh 		n = 0;
7865d465952Smartinh 		done = 1;
7875d465952Smartinh 		SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
788ca70fba4Smmcc 			DPRINTF("committing page %u", mp->pgno);
78937ee45a1Smartinh 			iov[n].iov_len = bt->head.psize;
79037ee45a1Smartinh 			iov[n].iov_base = mp->page;
7915d465952Smartinh 			if (++n >= BT_COMMIT_PAGES) {
7925d465952Smartinh 				done = 0;
7935d465952Smartinh 				break;
7945d465952Smartinh 			}
7955d465952Smartinh 		}
7965d465952Smartinh 
7975d465952Smartinh 		if (n == 0)
7985d465952Smartinh 			break;
7995d465952Smartinh 
800ca70fba4Smmcc 		DPRINTF("committing %u dirty pages", n);
8015d465952Smartinh 		rc = writev(bt->fd, iov, n);
80236e226d9Smartinh 		if (rc != (ssize_t)bt->head.psize*n) {
8035d465952Smartinh 			if (rc > 0)
8045d465952Smartinh 				DPRINTF("short write, filesystem full?");
8055d465952Smartinh 			else
8065d465952Smartinh 				DPRINTF("writev: %s", strerror(errno));
8075d465952Smartinh 			btree_txn_abort(txn);
8085d465952Smartinh 			return BT_FAIL;
8095d465952Smartinh 		}
8105d465952Smartinh 
8115d465952Smartinh 		/* Remove the dirty flag from the written pages.
8125d465952Smartinh 		 */
8135d465952Smartinh 		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
8145d465952Smartinh 			mp = SIMPLEQ_FIRST(txn->dirty_queue);
8155d465952Smartinh 			mp->dirty = 0;
8165d465952Smartinh 			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
8175d465952Smartinh 			if (--n == 0)
8185d465952Smartinh 				break;
8195d465952Smartinh 		}
8205d465952Smartinh 	} while (!done);
8215d465952Smartinh 
8225d465952Smartinh 	if (btree_sync(bt) != 0 ||
823e14f8e24Smartinh 	    btree_write_meta(bt, txn->root, 0) != BT_SUCCESS ||
8245d465952Smartinh 	    btree_sync(bt) != 0) {
8255d465952Smartinh 		btree_txn_abort(txn);
8265d465952Smartinh 		return BT_FAIL;
8275d465952Smartinh 	}
8285d465952Smartinh 
8295d465952Smartinh done:
8305d465952Smartinh 	mpage_prune(bt);
8315d465952Smartinh 	btree_txn_abort(txn);
8325d465952Smartinh 
8335d465952Smartinh 	return BT_SUCCESS;
8345d465952Smartinh }
8355d465952Smartinh 
8365d465952Smartinh static int
btree_write_header(struct btree * bt,int fd)83737ee45a1Smartinh btree_write_header(struct btree *bt, int fd)
83837ee45a1Smartinh {
83937ee45a1Smartinh 	struct stat	 sb;
84037ee45a1Smartinh 	struct bt_head	*h;
84137ee45a1Smartinh 	struct page	*p;
84237ee45a1Smartinh 	ssize_t		 rc;
84337ee45a1Smartinh 	unsigned int	 psize;
84437ee45a1Smartinh 
84537ee45a1Smartinh 	DPRINTF("writing header page");
84637ee45a1Smartinh 	assert(bt != NULL);
84737ee45a1Smartinh 
848*3572a0e3Ssthen 	/*
849*3572a0e3Ssthen 	 * Ask stat for 'optimal blocksize for I/O', but cap to fit in indx_t.
85037ee45a1Smartinh 	 */
85137ee45a1Smartinh 	if (fstat(fd, &sb) == 0)
852*3572a0e3Ssthen 		psize = MINIMUM(32*1024, sb.st_blksize);
85337ee45a1Smartinh 	else
85437ee45a1Smartinh 		psize = PAGESIZE;
85537ee45a1Smartinh 
85637ee45a1Smartinh 	if ((p = calloc(1, psize)) == NULL)
85737ee45a1Smartinh 		return -1;
85837ee45a1Smartinh 	p->flags = P_HEAD;
85937ee45a1Smartinh 
86037ee45a1Smartinh 	h = METADATA(p);
86137ee45a1Smartinh 	h->magic = BT_MAGIC;
86237ee45a1Smartinh 	h->version = BT_VERSION;
86337ee45a1Smartinh 	h->psize = psize;
86437ee45a1Smartinh 	bcopy(h, &bt->head, sizeof(*h));
86537ee45a1Smartinh 
86637ee45a1Smartinh 	rc = write(fd, p, bt->head.psize);
86737ee45a1Smartinh 	free(p);
86836e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize) {
86937ee45a1Smartinh 		if (rc > 0)
87037ee45a1Smartinh 			DPRINTF("short write, filesystem full?");
87137ee45a1Smartinh 		return BT_FAIL;
87237ee45a1Smartinh 	}
87337ee45a1Smartinh 
87437ee45a1Smartinh 	return BT_SUCCESS;
87537ee45a1Smartinh }
87637ee45a1Smartinh 
87737ee45a1Smartinh static int
btree_read_header(struct btree * bt)87837ee45a1Smartinh btree_read_header(struct btree *bt)
87937ee45a1Smartinh {
88037ee45a1Smartinh 	char		 page[PAGESIZE];
88137ee45a1Smartinh 	struct page	*p;
88237ee45a1Smartinh 	struct bt_head	*h;
88337ee45a1Smartinh 	int		 rc;
88437ee45a1Smartinh 
88537ee45a1Smartinh 	assert(bt != NULL);
88637ee45a1Smartinh 
88737ee45a1Smartinh 	/* We don't know the page size yet, so use a minimum value.
88837ee45a1Smartinh 	 */
88937ee45a1Smartinh 
89037ee45a1Smartinh 	if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) {
89137ee45a1Smartinh 		errno = ENOENT;
89237ee45a1Smartinh 		return -1;
89337ee45a1Smartinh 	} else if (rc != PAGESIZE) {
89437ee45a1Smartinh 		if (rc > 0)
89537ee45a1Smartinh 			errno = EINVAL;
89637ee45a1Smartinh 		DPRINTF("read: %s", strerror(errno));
89737ee45a1Smartinh 		return -1;
89837ee45a1Smartinh 	}
89937ee45a1Smartinh 
90037ee45a1Smartinh 	p = (struct page *)page;
90137ee45a1Smartinh 
90237ee45a1Smartinh 	if (!F_ISSET(p->flags, P_HEAD)) {
90337ee45a1Smartinh 		DPRINTF("page %d not a header page", p->pgno);
90437ee45a1Smartinh 		errno = EINVAL;
90537ee45a1Smartinh 		return -1;
90637ee45a1Smartinh 	}
90737ee45a1Smartinh 
90837ee45a1Smartinh 	h = METADATA(p);
90937ee45a1Smartinh 	if (h->magic != BT_MAGIC) {
91037ee45a1Smartinh 		DPRINTF("header has invalid magic");
91137ee45a1Smartinh 		errno = EINVAL;
91237ee45a1Smartinh 		return -1;
91337ee45a1Smartinh 	}
91437ee45a1Smartinh 
91537ee45a1Smartinh 	if (h->version != BT_VERSION) {
91637ee45a1Smartinh 		DPRINTF("database is version %u, expected version %u",
91737ee45a1Smartinh 		    bt->head.version, BT_VERSION);
91837ee45a1Smartinh 		errno = EINVAL;
91937ee45a1Smartinh 		return -1;
92037ee45a1Smartinh 	}
92137ee45a1Smartinh 
92237ee45a1Smartinh 	bcopy(h, &bt->head, sizeof(*h));
92337ee45a1Smartinh 	return 0;
92437ee45a1Smartinh }
92537ee45a1Smartinh 
92637ee45a1Smartinh static int
btree_write_meta(struct btree * bt,pgno_t root,unsigned int flags)927e14f8e24Smartinh btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags)
9285d465952Smartinh {
92937ee45a1Smartinh 	struct mpage	*mp;
93037ee45a1Smartinh 	struct bt_meta	*meta;
9315d465952Smartinh 	ssize_t		 rc;
9325d465952Smartinh 
9335d465952Smartinh 	DPRINTF("writing meta page for root page %u", root);
9345d465952Smartinh 
9355d465952Smartinh 	assert(bt != NULL);
9365d465952Smartinh 	assert(bt->txn != NULL);
9375d465952Smartinh 
93837ee45a1Smartinh 	if ((mp = btree_new_page(bt, P_META)) == NULL)
93937ee45a1Smartinh 		return -1;
9405d465952Smartinh 
94137ee45a1Smartinh 	bt->meta.prev_meta = bt->meta.root;
94237ee45a1Smartinh 	bt->meta.root = root;
94337ee45a1Smartinh 	bt->meta.flags = flags;
94437ee45a1Smartinh 	bt->meta.created_at = time(0);
94537ee45a1Smartinh 	bt->meta.revisions++;
94637ee45a1Smartinh 	SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash);
9475d465952Smartinh 
94837ee45a1Smartinh 	/* Copy the meta data changes to the new meta page. */
94937ee45a1Smartinh 	meta = METADATA(mp->page);
95037ee45a1Smartinh 	bcopy(&bt->meta, meta, sizeof(*meta));
95137ee45a1Smartinh 
95236e226d9Smartinh 	rc = write(bt->fd, mp->page, bt->head.psize);
953b09f546cSmartinh 	mp->dirty = 0;
954b09f546cSmartinh 	SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);
95536e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize) {
9565d465952Smartinh 		if (rc > 0)
9575d465952Smartinh 			DPRINTF("short write, filesystem full?");
9585d465952Smartinh 		return BT_FAIL;
9595d465952Smartinh 	}
9605d465952Smartinh 
9615d465952Smartinh 	if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
9625d465952Smartinh 		DPRINTF("failed to update file size: %s", strerror(errno));
9635d465952Smartinh 		bt->size = 0;
9645d465952Smartinh 	}
9655d465952Smartinh 
9665d465952Smartinh 	return BT_SUCCESS;
9675d465952Smartinh }
9685d465952Smartinh 
9695d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise.
9705d465952Smartinh  */
9715d465952Smartinh static int
btree_is_meta_page(struct page * p)9725d465952Smartinh btree_is_meta_page(struct page *p)
9735d465952Smartinh {
9745d465952Smartinh 	struct bt_meta	*m;
9755d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
9765d465952Smartinh 
9775d465952Smartinh 	m = METADATA(p);
9785d465952Smartinh 	if (!F_ISSET(p->flags, P_META)) {
9795d465952Smartinh 		DPRINTF("page %d not a meta page", p->pgno);
98037ee45a1Smartinh 		errno = EINVAL;
9815d465952Smartinh 		return 0;
9825d465952Smartinh 	}
9835d465952Smartinh 
9845d465952Smartinh 	if (m->root >= p->pgno && m->root != P_INVALID) {
9855d465952Smartinh 		DPRINTF("page %d points to an invalid root page", p->pgno);
98637ee45a1Smartinh 		errno = EINVAL;
9875d465952Smartinh 		return 0;
9885d465952Smartinh 	}
9895d465952Smartinh 
9905d465952Smartinh 	SHA1((unsigned char *)m, METAHASHLEN, hash);
9915d465952Smartinh 	if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) {
9925d465952Smartinh 		DPRINTF("page %d has an invalid digest", p->pgno);
99337ee45a1Smartinh 		errno = EINVAL;
9945d465952Smartinh 		return 0;
9955d465952Smartinh 	}
9965d465952Smartinh 
9975d465952Smartinh 	return 1;
9985d465952Smartinh }
9995d465952Smartinh 
10005d465952Smartinh static int
btree_read_meta(struct btree * bt,pgno_t * p_next)10015d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next)
10025d465952Smartinh {
100337ee45a1Smartinh 	struct mpage	*mp;
100437ee45a1Smartinh 	struct bt_meta	*meta;
10055d465952Smartinh 	pgno_t		 meta_pgno, next_pgno;
10065d465952Smartinh 	off_t		 size;
10075d465952Smartinh 
10085d465952Smartinh 	assert(bt != NULL);
10095d465952Smartinh 
10105d465952Smartinh 	if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
10115d465952Smartinh 		goto fail;
10125d465952Smartinh 
10135d465952Smartinh 	DPRINTF("btree_read_meta: size = %llu", size);
10145d465952Smartinh 
10155d465952Smartinh 	if (size < bt->size) {
10165d465952Smartinh 		DPRINTF("file has shrunk!");
10175d465952Smartinh 		errno = EIO;
10185d465952Smartinh 		goto fail;
10195d465952Smartinh 	}
10205d465952Smartinh 
102137ee45a1Smartinh 	if (size == bt->head.psize) {		/* there is only the header */
10225d465952Smartinh 		if (p_next != NULL)
102337ee45a1Smartinh 			*p_next = 1;
10240279e526Smartinh 		return BT_SUCCESS;		/* new file */
10255d465952Smartinh 	}
10265d465952Smartinh 
102737ee45a1Smartinh 	next_pgno = size / bt->head.psize;
10285d465952Smartinh 	if (next_pgno == 0) {
10295d465952Smartinh 		DPRINTF("corrupt file");
10305d465952Smartinh 		errno = EIO;
10315d465952Smartinh 		goto fail;
10325d465952Smartinh 	}
10335d465952Smartinh 
10345d465952Smartinh 	meta_pgno = next_pgno - 1;
10355d465952Smartinh 
103637ee45a1Smartinh 	if (size % bt->head.psize != 0) {
10375d465952Smartinh 		DPRINTF("filesize not a multiple of the page size!");
10385d465952Smartinh 		bt->flags |= BT_FIXPADDING;
10395d465952Smartinh 		next_pgno++;
10405d465952Smartinh 	}
10415d465952Smartinh 
10425d465952Smartinh 	if (p_next != NULL)
10435d465952Smartinh 		*p_next = next_pgno;
10445d465952Smartinh 
10455d465952Smartinh 	if (size == bt->size) {
10465d465952Smartinh 		DPRINTF("size unchanged, keeping current meta page");
104737ee45a1Smartinh 		if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
104862436105Smartinh 			DPRINTF("file is dead");
10490279e526Smartinh 			errno = ESTALE;
10500279e526Smartinh 			return BT_FAIL;
105162436105Smartinh 		} else
10525d465952Smartinh 			return BT_SUCCESS;
10535d465952Smartinh 	}
10545d465952Smartinh 	bt->size = size;
10555d465952Smartinh 
10565d465952Smartinh 	while (meta_pgno > 0) {
105737ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)
105837ee45a1Smartinh 			break;
105937ee45a1Smartinh 		if (btree_is_meta_page(mp->page)) {
106037ee45a1Smartinh 			meta = METADATA(mp->page);
106137ee45a1Smartinh 			DPRINTF("flags = 0x%x", meta->flags);
106237ee45a1Smartinh 			if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
1063e14f8e24Smartinh 				DPRINTF("file is dead");
10640279e526Smartinh 				errno = ESTALE;
10650279e526Smartinh 				return BT_FAIL;
106637ee45a1Smartinh 			} else {
106737ee45a1Smartinh 				/* Make copy of last meta page. */
106837ee45a1Smartinh 				bcopy(meta, &bt->meta, sizeof(bt->meta));
10695d465952Smartinh 				return BT_SUCCESS;
10705d465952Smartinh 			}
107137ee45a1Smartinh 		}
10725d465952Smartinh 		--meta_pgno;	/* scan backwards to first valid meta page */
10735d465952Smartinh 	}
10745d465952Smartinh 
10755d465952Smartinh 	errno = EIO;
10765d465952Smartinh fail:
10775d465952Smartinh 	if (p_next != NULL)
10785d465952Smartinh 		*p_next = P_INVALID;
10795d465952Smartinh 	return BT_FAIL;
10805d465952Smartinh }
10815d465952Smartinh 
10825d465952Smartinh struct btree *
btree_open_fd(int fd,unsigned int flags)1083176c8cbcSmartinh btree_open_fd(int fd, unsigned int flags)
10845d465952Smartinh {
10855d465952Smartinh 	struct btree	*bt;
108637ee45a1Smartinh 	int		 fl;
10875d465952Smartinh 
108883094010Skrw 	fl = fcntl(fd, F_GETFL);
10895d465952Smartinh 	if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)
10905d465952Smartinh 		return NULL;
10915d465952Smartinh 
10925d465952Smartinh 	if ((bt = calloc(1, sizeof(*bt))) == NULL)
10935d465952Smartinh 		return NULL;
10945d465952Smartinh 	bt->fd = fd;
10955d465952Smartinh 	bt->flags = flags;
10965d465952Smartinh 	bt->flags &= ~BT_FIXPADDING;
1097408be415Smartinh 	bt->ref = 1;
109837ee45a1Smartinh 	bt->meta.root = P_INVALID;
10995d465952Smartinh 
11005d465952Smartinh 	if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)
11015d465952Smartinh 		goto fail;
11025d465952Smartinh 	bt->stat.max_cache = BT_MAXCACHE_DEF;
11035d465952Smartinh 	RB_INIT(bt->page_cache);
11045d465952Smartinh 
11055d465952Smartinh 	if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)
11065d465952Smartinh 		goto fail;
11075d465952Smartinh 	TAILQ_INIT(bt->lru_queue);
11085d465952Smartinh 
110937ee45a1Smartinh 	if (btree_read_header(bt) != 0) {
111037ee45a1Smartinh 		if (errno != ENOENT)
11115d465952Smartinh 			goto fail;
111237ee45a1Smartinh 		DPRINTF("new database");
111337ee45a1Smartinh 		btree_write_header(bt, bt->fd);
111437ee45a1Smartinh 	}
11155d465952Smartinh 
11160279e526Smartinh 	if (btree_read_meta(bt, NULL) != 0)
11175d465952Smartinh 		goto fail;
11185d465952Smartinh 
11195d465952Smartinh 	DPRINTF("opened database version %u, pagesize %u",
112037ee45a1Smartinh 	    bt->head.version, bt->head.psize);
112137ee45a1Smartinh 	DPRINTF("timestamp: %s", ctime(&bt->meta.created_at));
112237ee45a1Smartinh 	DPRINTF("depth: %u", bt->meta.depth);
112337ee45a1Smartinh 	DPRINTF("entries: %llu", bt->meta.entries);
112437ee45a1Smartinh 	DPRINTF("revisions: %u", bt->meta.revisions);
112537ee45a1Smartinh 	DPRINTF("branch pages: %u", bt->meta.branch_pages);
112637ee45a1Smartinh 	DPRINTF("leaf pages: %u", bt->meta.leaf_pages);
112737ee45a1Smartinh 	DPRINTF("overflow pages: %u", bt->meta.overflow_pages);
112837ee45a1Smartinh 	DPRINTF("root: %u", bt->meta.root);
112937ee45a1Smartinh 	DPRINTF("previous meta page: %u", bt->meta.prev_meta);
11305d465952Smartinh 
11315d465952Smartinh 	return bt;
11325d465952Smartinh 
11335d465952Smartinh fail:
11345d465952Smartinh 	free(bt->lru_queue);
11355d465952Smartinh 	free(bt->page_cache);
11365d465952Smartinh 	free(bt);
11375d465952Smartinh 	return NULL;
11385d465952Smartinh }
11395d465952Smartinh 
11405d465952Smartinh struct btree *
btree_open(const char * path,unsigned int flags,mode_t mode)1141176c8cbcSmartinh btree_open(const char *path, unsigned int flags, mode_t mode)
11425d465952Smartinh {
11435d465952Smartinh 	int		 fd, oflags;
11445d465952Smartinh 	struct btree	*bt;
11455d465952Smartinh 
11465d465952Smartinh 	if (F_ISSET(flags, BT_RDONLY))
11475d465952Smartinh 		oflags = O_RDONLY;
11485d465952Smartinh 	else
11495d465952Smartinh 		oflags = O_RDWR | O_CREAT | O_APPEND;
11505d465952Smartinh 
11515d465952Smartinh 	if ((fd = open(path, oflags, mode)) == -1)
11525d465952Smartinh 		return NULL;
11535d465952Smartinh 
11545d465952Smartinh 	if ((bt = btree_open_fd(fd, flags)) == NULL)
11555d465952Smartinh 		close(fd);
11565d465952Smartinh 	else {
11575d465952Smartinh 		bt->path = strdup(path);
11585d465952Smartinh 		DPRINTF("opened btree %p", bt);
11595d465952Smartinh 	}
11605d465952Smartinh 
11615d465952Smartinh 	return bt;
11625d465952Smartinh }
11635d465952Smartinh 
1164408be415Smartinh static void
btree_ref(struct btree * bt)1165408be415Smartinh btree_ref(struct btree *bt)
1166408be415Smartinh {
1167408be415Smartinh 	bt->ref++;
1168408be415Smartinh 	DPRINTF("ref is now %d on btree %p", bt->ref, bt);
1169408be415Smartinh }
1170408be415Smartinh 
11715d465952Smartinh void
btree_close(struct btree * bt)11725d465952Smartinh btree_close(struct btree *bt)
11735d465952Smartinh {
1174e14f8e24Smartinh 	if (bt == NULL)
1175e14f8e24Smartinh 		return;
11765d465952Smartinh 
1177e14f8e24Smartinh 	if (--bt->ref == 0) {
11785d465952Smartinh 		DPRINTF("ref is zero, closing btree %p", bt);
11795d465952Smartinh 		close(bt->fd);
1180e14f8e24Smartinh 		mpage_flush(bt);
1181e83131ceSjmatthew 		free(bt->lru_queue);
1182e83131ceSjmatthew 		free(bt->path);
11835d465952Smartinh 		free(bt->page_cache);
11845d465952Smartinh 		free(bt);
1185e14f8e24Smartinh 	} else
1186408be415Smartinh 		DPRINTF("ref is now %d on btree %p", bt->ref, bt);
11875d465952Smartinh }
11885d465952Smartinh 
11895d465952Smartinh /* Search for key within a leaf page, using binary search.
11905d465952Smartinh  * Returns the smallest entry larger or equal to the key.
11915d465952Smartinh  * If exactp is non-null, stores whether the found entry was an exact match
11925d465952Smartinh  * in *exactp (1 or 0).
11935d465952Smartinh  * If kip is non-null, stores the index of the found entry in *kip.
11945d465952Smartinh  * If no entry larger of equal to the key is found, returns NULL.
11955d465952Smartinh  */
11965d465952Smartinh static struct node *
btree_search_node(struct btree * bt,struct mpage * mp,struct btval * key,int * exactp,unsigned int * kip)11975d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
11985d465952Smartinh     int *exactp, unsigned int *kip)
11995d465952Smartinh {
12005d465952Smartinh 	unsigned int	 i = 0;
12015d465952Smartinh 	int		 low, high;
12025d465952Smartinh 	int		 rc = 0;
12035d465952Smartinh 	struct node	*node;
12045d465952Smartinh 	struct btval	 nodekey;
12055d465952Smartinh 
12065d465952Smartinh 	DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]",
12075d465952Smartinh 	    NUMKEYS(mp),
12085d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
12095d465952Smartinh 	    mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);
12105d465952Smartinh 
12115d465952Smartinh 	assert(NUMKEYS(mp) > 0);
12125d465952Smartinh 
121337f4b933Smmcc 	memset(&nodekey, 0, sizeof(nodekey));
12145d465952Smartinh 
12155d465952Smartinh 	low = IS_LEAF(mp) ? 0 : 1;
12165d465952Smartinh 	high = NUMKEYS(mp) - 1;
12175d465952Smartinh 	while (low <= high) {
12185d465952Smartinh 		i = (low + high) >> 1;
12195d465952Smartinh 		node = NODEPTR(mp, i);
12205d465952Smartinh 
12215d465952Smartinh 		nodekey.size = node->ksize;
12225d465952Smartinh 		nodekey.data = NODEKEY(node);
12235d465952Smartinh 
12245d465952Smartinh 		if (bt->cmp)
12255d465952Smartinh 			rc = bt->cmp(key, &nodekey);
12265d465952Smartinh 		else
12275d465952Smartinh 			rc = bt_cmp(bt, key, &nodekey, &mp->prefix);
12285d465952Smartinh 
12295d465952Smartinh 		if (IS_LEAF(mp))
123048318e0bSderaadt 			DPRINTF("found leaf index %u [%.*s], rc = %d",
12315d465952Smartinh 			    i, (int)nodekey.size, (char *)nodekey.data, rc);
12325d465952Smartinh 		else
123348318e0bSderaadt 			DPRINTF("found branch index %u [%.*s -> %u], rc = %d",
12345d465952Smartinh 			    i, (int)node->ksize, (char *)NODEKEY(node),
12355d465952Smartinh 			    node->n_pgno, rc);
12365d465952Smartinh 
12375d465952Smartinh 		if (rc == 0)
12385d465952Smartinh 			break;
12395d465952Smartinh 		if (rc > 0)
12405d465952Smartinh 			low = i + 1;
12415d465952Smartinh 		else
12425d465952Smartinh 			high = i - 1;
12435d465952Smartinh 	}
12445d465952Smartinh 
12455d465952Smartinh 	if (rc > 0) {	/* Found entry is less than the key. */
12465d465952Smartinh 		i++;	/* Skip to get the smallest entry larger than key. */
12475d465952Smartinh 		if (i >= NUMKEYS(mp))
12485d465952Smartinh 			/* There is no entry larger or equal to the key. */
12495d465952Smartinh 			return NULL;
12505d465952Smartinh 	}
12515d465952Smartinh 	if (exactp)
12525d465952Smartinh 		*exactp = (rc == 0);
12535d465952Smartinh 	if (kip)	/* Store the key index if requested. */
12545d465952Smartinh 		*kip = i;
12555d465952Smartinh 
12565d465952Smartinh 	return NODEPTR(mp, i);
12575d465952Smartinh }
12585d465952Smartinh 
12595d465952Smartinh static void
cursor_pop_page(struct cursor * cursor)12605d465952Smartinh cursor_pop_page(struct cursor *cursor)
12615d465952Smartinh {
12625d465952Smartinh 	struct ppage	*top;
12635d465952Smartinh 
12645d465952Smartinh 	top = CURSOR_TOP(cursor);
12655d465952Smartinh 	CURSOR_POP(cursor);
12665d465952Smartinh 	top->mpage->ref--;
12675d465952Smartinh 
12685d465952Smartinh 	DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);
12695d465952Smartinh 
12705d465952Smartinh 	free(top);
12715d465952Smartinh }
12725d465952Smartinh 
12735d465952Smartinh static struct ppage *
cursor_push_page(struct cursor * cursor,struct mpage * mp)12745d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp)
12755d465952Smartinh {
12765d465952Smartinh 	struct ppage	*ppage;
12775d465952Smartinh 
12785d465952Smartinh 	DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);
12795d465952Smartinh 
12805d465952Smartinh 	if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
12815d465952Smartinh 		return NULL;
12825d465952Smartinh 	ppage->mpage = mp;
12835d465952Smartinh 	mp->ref++;
12845d465952Smartinh 	CURSOR_PUSH(cursor, ppage);
12855d465952Smartinh 	return ppage;
12865d465952Smartinh }
12875d465952Smartinh 
128837ee45a1Smartinh static struct mpage *
btree_get_mpage(struct btree * bt,pgno_t pgno)128937ee45a1Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno)
12905d465952Smartinh {
12915d465952Smartinh 	struct mpage	*mp;
12925d465952Smartinh 
12935d465952Smartinh 	mp = mpage_lookup(bt, pgno);
12945d465952Smartinh 	if (mp == NULL) {
12955d465952Smartinh 		if ((mp = calloc(1, sizeof(*mp))) == NULL)
129637ee45a1Smartinh 			return NULL;
129737ee45a1Smartinh 		if ((mp->page = malloc(bt->head.psize)) == NULL) {
12985d465952Smartinh 			free(mp);
129937ee45a1Smartinh 			return NULL;
130037ee45a1Smartinh 		}
13011c05783eSmartinh 		if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) {
130237ee45a1Smartinh 			mpage_free(mp);
130337ee45a1Smartinh 			return NULL;
13045d465952Smartinh 		}
13055d465952Smartinh 		mp->pgno = pgno;
13065d465952Smartinh 		mpage_add(bt, mp);
13075d465952Smartinh 	} else
13085d465952Smartinh 		DPRINTF("returning page %u from cache", pgno);
13095d465952Smartinh 
131037ee45a1Smartinh 	return mp;
13115d465952Smartinh }
13125d465952Smartinh 
13135d465952Smartinh static void
concat_prefix(struct btree * bt,char * s1,size_t n1,char * s2,size_t n2,char * cs,size_t * cn)13145d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
13155d465952Smartinh     char *cs, size_t *cn)
13165d465952Smartinh {
13175d465952Smartinh 	assert(*cn >= n1 + n2);
13185d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
13195d465952Smartinh 		bcopy(s2, cs, n2);
13205d465952Smartinh 		bcopy(s1, cs + n2, n1);
13215d465952Smartinh 	} else {
13225d465952Smartinh 		bcopy(s1, cs, n1);
13235d465952Smartinh 		bcopy(s2, cs + n1, n2);
13245d465952Smartinh 	}
13255d465952Smartinh 	*cn = n1 + n2;
13265d465952Smartinh }
13275d465952Smartinh 
13285d465952Smartinh static void
find_common_prefix(struct btree * bt,struct mpage * mp)13295d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp)
13305d465952Smartinh {
13315d465952Smartinh 	indx_t			 lbound = 0, ubound = 0;
13325d465952Smartinh 	struct mpage		*lp, *up;
13335d465952Smartinh 	struct btkey		 lprefix, uprefix;
13345d465952Smartinh 
13355d465952Smartinh 	mp->prefix.len = 0;
13365d465952Smartinh 	if (bt->cmp != NULL)
13375d465952Smartinh 		return;
13385d465952Smartinh 
13395d465952Smartinh 	lp = mp;
13405d465952Smartinh 	while (lp->parent != NULL) {
13415d465952Smartinh 		if (lp->parent_index > 0) {
13425d465952Smartinh 			lbound = lp->parent_index;
13435d465952Smartinh 			break;
13445d465952Smartinh 		}
13455d465952Smartinh 		lp = lp->parent;
13465d465952Smartinh 	}
13475d465952Smartinh 
13485d465952Smartinh 	up = mp;
13495d465952Smartinh 	while (up->parent != NULL) {
13505d465952Smartinh 		if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
13515d465952Smartinh 			ubound = up->parent_index + 1;
13525d465952Smartinh 			break;
13535d465952Smartinh 		}
13545d465952Smartinh 		up = up->parent;
13555d465952Smartinh 	}
13565d465952Smartinh 
13575d465952Smartinh 	if (lp->parent != NULL && up->parent != NULL) {
13585d465952Smartinh 		expand_prefix(bt, lp->parent, lbound, &lprefix);
13595d465952Smartinh 		expand_prefix(bt, up->parent, ubound, &uprefix);
13605d465952Smartinh 		common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
13615d465952Smartinh 	}
13625d465952Smartinh 	else if (mp->parent)
13635d465952Smartinh 		bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));
13645d465952Smartinh 
13655d465952Smartinh 	DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
13665d465952Smartinh 	    (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
13675d465952Smartinh }
13685d465952Smartinh 
13695d465952Smartinh static int
btree_search_page_root(struct btree * bt,struct mpage * root,struct btval * key,struct cursor * cursor,int modify,struct mpage ** mpp)13705d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
13715d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
13725d465952Smartinh {
13735d465952Smartinh 	struct mpage	*mp, *parent;
13745d465952Smartinh 
13755d465952Smartinh 	if (cursor && cursor_push_page(cursor, root) == NULL)
13765d465952Smartinh 		return BT_FAIL;
13775d465952Smartinh 
13785d465952Smartinh 	mp = root;
13795d465952Smartinh 	while (IS_BRANCH(mp)) {
13805d465952Smartinh 		unsigned int	 i = 0;
13815d465952Smartinh 		struct node	*node;
13825d465952Smartinh 
13835d465952Smartinh 		DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
13845d465952Smartinh 		assert(NUMKEYS(mp) > 1);
13851c05783eSmartinh 		DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0)));
13865d465952Smartinh 
13875d465952Smartinh 		if (key == NULL)	/* Initialize cursor to first page. */
13885d465952Smartinh 			i = 0;
13895d465952Smartinh 		else {
13905d465952Smartinh 			int	 exact;
13915d465952Smartinh 			node = btree_search_node(bt, mp, key, &exact, &i);
13925d465952Smartinh 			if (node == NULL)
13935d465952Smartinh 				i = NUMKEYS(mp) - 1;
13945d465952Smartinh 			else if (!exact) {
13955d465952Smartinh 				assert(i > 0);
13965d465952Smartinh 				i--;
13975d465952Smartinh 			}
13985d465952Smartinh 		}
13995d465952Smartinh 
14005d465952Smartinh 		if (key)
14015d465952Smartinh 			DPRINTF("following index %u for key %.*s",
14025d465952Smartinh 			    i, (int)key->size, (char *)key->data);
14035d465952Smartinh 		assert(i >= 0 && i < NUMKEYS(mp));
14045d465952Smartinh 		node = NODEPTR(mp, i);
14055d465952Smartinh 
14065d465952Smartinh 		if (cursor)
14075d465952Smartinh 			CURSOR_TOP(cursor)->ki = i;
14085d465952Smartinh 
14095d465952Smartinh 		parent = mp;
141037ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
14115d465952Smartinh 			return BT_FAIL;
14125d465952Smartinh 		mp->parent = parent;
14135d465952Smartinh 		mp->parent_index = i;
14145d465952Smartinh 		find_common_prefix(bt, mp);
14155d465952Smartinh 
14165d465952Smartinh 		if (cursor && cursor_push_page(cursor, mp) == NULL)
14175d465952Smartinh 			return BT_FAIL;
14185d465952Smartinh 
14197f8a466aSmartinh 		if (modify && (mp = mpage_touch(bt, mp)) == NULL)
14207f8a466aSmartinh 			return BT_FAIL;
14215d465952Smartinh 	}
14225d465952Smartinh 
14235d465952Smartinh 	if (!IS_LEAF(mp)) {
14245d465952Smartinh 		DPRINTF("internal error, index points to a %02X page!?",
142537ee45a1Smartinh 		    mp->page->flags);
14265d465952Smartinh 		return BT_FAIL;
14275d465952Smartinh 	}
14285d465952Smartinh 
14295d465952Smartinh 	DPRINTF("found leaf page %u for key %.*s", mp->pgno,
14305d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL);
14315d465952Smartinh 
14325d465952Smartinh 	*mpp = mp;
14335d465952Smartinh 	return BT_SUCCESS;
14345d465952Smartinh }
14355d465952Smartinh 
14365d465952Smartinh /* Search for the page a given key should be in.
14375d465952Smartinh  * Stores a pointer to the found page in *mpp.
14385d465952Smartinh  * If key is NULL, search for the lowest page (used by btree_cursor_first).
14395d465952Smartinh  * If cursor is non-null, pushes parent pages on the cursor stack.
14405d465952Smartinh  * If modify is true, visited pages are updated with new page numbers.
14415d465952Smartinh  */
14425d465952Smartinh static int
btree_search_page(struct btree * bt,struct btree_txn * txn,struct btval * key,struct cursor * cursor,int modify,struct mpage ** mpp)14435d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
14445d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
14455d465952Smartinh {
14465d465952Smartinh 	int		 rc;
14475d465952Smartinh 	pgno_t		 root;
14485d465952Smartinh 	struct mpage	*mp;
14495d465952Smartinh 
14505d465952Smartinh 	/* Can't modify pages outside a transaction. */
14515d465952Smartinh 	if (txn == NULL && modify) {
14525d465952Smartinh 		errno = EINVAL;
14535d465952Smartinh 		return BT_FAIL;
14545d465952Smartinh 	}
14555d465952Smartinh 
14565d465952Smartinh 	/* Choose which root page to start with. If a transaction is given
14575d465952Smartinh          * use the root page from the transaction, otherwise read the last
14585d465952Smartinh          * committed root page.
14595d465952Smartinh 	 */
14605d465952Smartinh 	if (txn == NULL) {
14615d465952Smartinh 		if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)
14625d465952Smartinh 			return rc;
146337ee45a1Smartinh 		root = bt->meta.root;
14645d465952Smartinh 	} else if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
14655d465952Smartinh 		DPRINTF("transaction has failed, must abort");
14660279e526Smartinh 		errno = EINVAL;
14675d465952Smartinh 		return BT_FAIL;
14685d465952Smartinh 	} else
14695d465952Smartinh 		root = txn->root;
14705d465952Smartinh 
14710279e526Smartinh 	if (root == P_INVALID) {		/* Tree is empty. */
147237ee45a1Smartinh 		DPRINTF("tree is empty");
14730279e526Smartinh 		errno = ENOENT;
14740279e526Smartinh 		return BT_FAIL;
14750279e526Smartinh 	}
14765d465952Smartinh 
147737ee45a1Smartinh 	if ((mp = btree_get_mpage(bt, root)) == NULL)
147837ee45a1Smartinh 		return BT_FAIL;
147937ee45a1Smartinh 
148037ee45a1Smartinh 	DPRINTF("root page has flags 0x%X", mp->page->flags);
14815d465952Smartinh 
14825d465952Smartinh 	assert(mp->parent == NULL);
14835d465952Smartinh 	assert(mp->prefix.len == 0);
14845d465952Smartinh 
14855d465952Smartinh 	if (modify && !mp->dirty) {
14867f8a466aSmartinh 		if ((mp = mpage_touch(bt, mp)) == NULL)
14877f8a466aSmartinh 			return BT_FAIL;
14885d465952Smartinh 		txn->root = mp->pgno;
14895d465952Smartinh 	}
14905d465952Smartinh 
14915d465952Smartinh 	return btree_search_page_root(bt, mp, key, cursor, modify, mpp);
14925d465952Smartinh }
14935d465952Smartinh 
14945d465952Smartinh static int
btree_read_data(struct btree * bt,struct mpage * mp,struct node * leaf,struct btval * data)14955d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
14965d465952Smartinh     struct btval *data)
14975d465952Smartinh {
149837ee45a1Smartinh 	struct mpage	*omp;		/* overflow mpage */
14995d465952Smartinh 	size_t		 psz;
15005d465952Smartinh 	size_t		 max;
15015d465952Smartinh 	size_t		 sz = 0;
15025d465952Smartinh 	pgno_t		 pgno;
15035d465952Smartinh 
150437f4b933Smmcc 	memset(data, 0, sizeof(*data));
150537ee45a1Smartinh 	max = bt->head.psize - PAGEHDRSZ;
15065d465952Smartinh 
15075d465952Smartinh 	if (!F_ISSET(leaf->flags, F_BIGDATA)) {
15085d465952Smartinh 		data->size = leaf->n_dsize;
15095d465952Smartinh 		if (data->size > 0) {
15105d465952Smartinh 			if (mp == NULL) {
15115d465952Smartinh 				if ((data->data = malloc(data->size)) == NULL)
15125d465952Smartinh 					return BT_FAIL;
15135d465952Smartinh 				bcopy(NODEDATA(leaf), data->data, data->size);
15145d465952Smartinh 				data->free_data = 1;
15155d465952Smartinh 				data->mp = NULL;
15165d465952Smartinh 			} else {
15175d465952Smartinh 				data->data = NODEDATA(leaf);
15185d465952Smartinh 				data->free_data = 0;
15195d465952Smartinh 				data->mp = mp;
15205d465952Smartinh 				mp->ref++;
15215d465952Smartinh 			}
15225d465952Smartinh 		}
15235d465952Smartinh 		return BT_SUCCESS;
15245d465952Smartinh 	}
15255d465952Smartinh 
15265d465952Smartinh 	/* Read overflow data.
15275d465952Smartinh 	 */
15285d465952Smartinh 	DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
15295d465952Smartinh 	if ((data->data = malloc(leaf->n_dsize)) == NULL)
15305d465952Smartinh 		return BT_FAIL;
15315d465952Smartinh 	data->size = leaf->n_dsize;
15325d465952Smartinh 	data->free_data = 1;
15335d465952Smartinh 	data->mp = NULL;
153445ac9d85Smartinh 	bcopy(NODEDATA(leaf), &pgno, sizeof(pgno));
15355d465952Smartinh 	for (sz = 0; sz < data->size; ) {
153637ee45a1Smartinh 		if ((omp = btree_get_mpage(bt, pgno)) == NULL ||
153737ee45a1Smartinh 		    !F_ISSET(omp->page->flags, P_OVERFLOW)) {
15382c6c9175Smartinh 			DPRINTF("read overflow page %u failed", pgno);
15395d465952Smartinh 			free(data->data);
154037ee45a1Smartinh 			mpage_free(omp);
15415d465952Smartinh 			return BT_FAIL;
15425d465952Smartinh 		}
15435d465952Smartinh 		psz = data->size - sz;
15445d465952Smartinh 		if (psz > max)
15455d465952Smartinh 			psz = max;
154637ee45a1Smartinh 		bcopy(omp->page->ptrs, (char *)data->data + sz, psz);
15475d465952Smartinh 		sz += psz;
154837ee45a1Smartinh 		pgno = omp->page->p_next_pgno;
15495d465952Smartinh 	}
15505d465952Smartinh 
15515d465952Smartinh 	return BT_SUCCESS;
15525d465952Smartinh }
15535d465952Smartinh 
15545d465952Smartinh int
btree_txn_get(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data)15555d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn,
15565d465952Smartinh     struct btval *key, struct btval *data)
15575d465952Smartinh {
15585d465952Smartinh 	int		 rc, exact;
15595d465952Smartinh 	struct node	*leaf;
15605d465952Smartinh 	struct mpage	*mp;
15615d465952Smartinh 
15625d465952Smartinh 	assert(key);
15635d465952Smartinh 	assert(data);
15645d465952Smartinh 	DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);
15655d465952Smartinh 
156659916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
156759916c99Smartinh 		errno = EINVAL;
156859916c99Smartinh 		return BT_FAIL;
156959916c99Smartinh 	}
157059916c99Smartinh 
157159916c99Smartinh 	if (bt == NULL) {
157259916c99Smartinh 		if (txn == NULL) {
157359916c99Smartinh 			errno = EINVAL;
157459916c99Smartinh 			return BT_FAIL;
157559916c99Smartinh 		}
157659916c99Smartinh 		bt = txn->bt;
157759916c99Smartinh 	}
157859916c99Smartinh 
15795d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
15805d465952Smartinh 		errno = EINVAL;
15815d465952Smartinh 		return BT_FAIL;
15825d465952Smartinh 	}
15835d465952Smartinh 
15845d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)
15855d465952Smartinh 		return rc;
15865d465952Smartinh 
15875d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, NULL);
15885d465952Smartinh 	if (leaf && exact)
15895d465952Smartinh 		rc = btree_read_data(bt, mp, leaf, data);
15900279e526Smartinh 	else {
15910279e526Smartinh 		errno = ENOENT;
15920279e526Smartinh 		rc = BT_FAIL;
15930279e526Smartinh 	}
15945d465952Smartinh 
15955d465952Smartinh 	mpage_prune(bt);
15965d465952Smartinh 	return rc;
15975d465952Smartinh }
15985d465952Smartinh 
15995d465952Smartinh static int
btree_sibling(struct cursor * cursor,int move_right)16005d465952Smartinh btree_sibling(struct cursor *cursor, int move_right)
16015d465952Smartinh {
16025d465952Smartinh 	int		 rc;
16035d465952Smartinh 	struct node	*indx;
16045d465952Smartinh 	struct ppage	*parent, *top;
16055d465952Smartinh 	struct mpage	*mp;
16065d465952Smartinh 
16075d465952Smartinh 	top = CURSOR_TOP(cursor);
16080279e526Smartinh 	if ((parent = SLIST_NEXT(top, entry)) == NULL) {
16090279e526Smartinh 		errno = ENOENT;
16100279e526Smartinh 		return BT_FAIL;			/* root has no siblings */
16110279e526Smartinh 	}
16125d465952Smartinh 
16135d465952Smartinh 	DPRINTF("parent page is page %u, index %u",
16145d465952Smartinh 	    parent->mpage->pgno, parent->ki);
16155d465952Smartinh 
16165d465952Smartinh 	cursor_pop_page(cursor);
16175d465952Smartinh 	if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
16185d465952Smartinh 		       : (parent->ki == 0)) {
16195d465952Smartinh 		DPRINTF("no more keys left, moving to %s sibling",
16205d465952Smartinh 		    move_right ? "right" : "left");
16215d465952Smartinh 		if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)
16225d465952Smartinh 			return rc;
16235d465952Smartinh 		parent = CURSOR_TOP(cursor);
16245d465952Smartinh 	} else {
16255d465952Smartinh 		if (move_right)
16265d465952Smartinh 			parent->ki++;
16275d465952Smartinh 		else
16285d465952Smartinh 			parent->ki--;
16295d465952Smartinh 		DPRINTF("just moving to %s index key %u",
16305d465952Smartinh 		    move_right ? "right" : "left", parent->ki);
16315d465952Smartinh 	}
16325d465952Smartinh 	assert(IS_BRANCH(parent->mpage));
16335d465952Smartinh 
16345d465952Smartinh 	indx = NODEPTR(parent->mpage, parent->ki);
163537ee45a1Smartinh 	if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL)
16365d465952Smartinh 		return BT_FAIL;
16375d465952Smartinh 	mp->parent = parent->mpage;
16385d465952Smartinh 	mp->parent_index = parent->ki;
16395d465952Smartinh 
16405d465952Smartinh 	cursor_push_page(cursor, mp);
16415d465952Smartinh 	find_common_prefix(cursor->bt, mp);
16425d465952Smartinh 
16435d465952Smartinh 	return BT_SUCCESS;
16445d465952Smartinh }
16455d465952Smartinh 
16465d465952Smartinh static int
bt_set_key(struct btree * bt,struct mpage * mp,struct node * node,struct btval * key)16475d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,
16485d465952Smartinh     struct btval *key)
16495d465952Smartinh {
16505d465952Smartinh 	if (key == NULL)
16515d465952Smartinh 		return 0;
16525d465952Smartinh 
16535d465952Smartinh 	if (mp->prefix.len > 0) {
16545d465952Smartinh 		key->size = node->ksize + mp->prefix.len;
16555d465952Smartinh 		key->data = malloc(key->size);
16565d465952Smartinh 		if (key->data == NULL)
16575d465952Smartinh 			return -1;
16585d465952Smartinh 		concat_prefix(bt,
16595d465952Smartinh 		    mp->prefix.str, mp->prefix.len,
16605d465952Smartinh 		    NODEKEY(node), node->ksize,
16615d465952Smartinh 		    key->data, &key->size);
16625d465952Smartinh 		key->free_data = 1;
16635d465952Smartinh 	} else {
16645d465952Smartinh 		key->size = node->ksize;
16655d465952Smartinh 		key->data = NODEKEY(node);
16665d465952Smartinh 		key->free_data = 0;
16675d465952Smartinh 		key->mp = mp;
16685d465952Smartinh 		mp->ref++;
16695d465952Smartinh 	}
16705d465952Smartinh 
16715d465952Smartinh 	return 0;
16725d465952Smartinh }
16735d465952Smartinh 
16745d465952Smartinh static int
btree_cursor_next(struct cursor * cursor,struct btval * key,struct btval * data)16755d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)
16765d465952Smartinh {
16775d465952Smartinh 	struct ppage	*top;
16785d465952Smartinh 	struct mpage	*mp;
16795d465952Smartinh 	struct node	*leaf;
16805d465952Smartinh 
16810279e526Smartinh 	if (cursor->eof) {
16820279e526Smartinh 		errno = ENOENT;
16830279e526Smartinh 		return BT_FAIL;
16840279e526Smartinh 	}
16855d465952Smartinh 
16865d465952Smartinh 	assert(cursor->initialized);
16875d465952Smartinh 
16885d465952Smartinh 	top = CURSOR_TOP(cursor);
16895d465952Smartinh 	mp = top->mpage;
16905d465952Smartinh 
16915d465952Smartinh 	DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor);
16925d465952Smartinh 
16935d465952Smartinh 	if (top->ki + 1 >= NUMKEYS(mp)) {
16945d465952Smartinh 		DPRINTF("=====> move to next sibling page");
16955d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS) {
16965d465952Smartinh 			cursor->eof = 1;
16970279e526Smartinh 			return BT_FAIL;
16985d465952Smartinh 		}
16995d465952Smartinh 		top = CURSOR_TOP(cursor);
17005d465952Smartinh 		mp = top->mpage;
17015d465952Smartinh 		DPRINTF("next page is %u, key index %u", mp->pgno, top->ki);
17025d465952Smartinh 	} else
17035d465952Smartinh 		top->ki++;
17045d465952Smartinh 
17055d465952Smartinh 	DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
17065d465952Smartinh 	    mp->pgno, NUMKEYS(mp), top->ki);
17075d465952Smartinh 
17085d465952Smartinh 	assert(IS_LEAF(mp));
17095d465952Smartinh 	leaf = NODEPTR(mp, top->ki);
17105d465952Smartinh 
17115d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17125d465952Smartinh 		return BT_FAIL;
17135d465952Smartinh 
17145d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17155d465952Smartinh 		return BT_FAIL;
17165d465952Smartinh 
17175d465952Smartinh 	return BT_SUCCESS;
17185d465952Smartinh }
17195d465952Smartinh 
17205d465952Smartinh static int
btree_cursor_set(struct cursor * cursor,struct btval * key,struct btval * data,int * exactp)17215cea9f8fSmartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data,
17225cea9f8fSmartinh     int *exactp)
17235d465952Smartinh {
17245d465952Smartinh 	int		 rc;
17255d465952Smartinh 	struct node	*leaf;
17265d465952Smartinh 	struct mpage	*mp;
17275d465952Smartinh 	struct ppage	*top;
17285d465952Smartinh 
17295d465952Smartinh 	assert(cursor);
17305d465952Smartinh 	assert(key);
17315d465952Smartinh 	assert(key->size > 0);
17325d465952Smartinh 
17335d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);
17345d465952Smartinh 	if (rc != BT_SUCCESS)
17355d465952Smartinh 		return rc;
17365d465952Smartinh 	assert(IS_LEAF(mp));
17375d465952Smartinh 
17385d465952Smartinh 	top = CURSOR_TOP(cursor);
17395cea9f8fSmartinh 	leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki);
17405cea9f8fSmartinh 	if (exactp != NULL && !*exactp) {
17415cea9f8fSmartinh 		/* BT_CURSOR_EXACT specified and not an exact match. */
17425cea9f8fSmartinh 		errno = ENOENT;
17435cea9f8fSmartinh 		return BT_FAIL;
17445cea9f8fSmartinh 	}
17455cea9f8fSmartinh 
17465d465952Smartinh 	if (leaf == NULL) {
17475d465952Smartinh 		DPRINTF("===> inexact leaf not found, goto sibling");
17485d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS)
17490279e526Smartinh 			return BT_FAIL;		/* no entries matched */
17505d465952Smartinh 		top = CURSOR_TOP(cursor);
17515d465952Smartinh 		top->ki = 0;
17525d465952Smartinh 		mp = top->mpage;
17535d465952Smartinh 		assert(IS_LEAF(mp));
17545d465952Smartinh 		leaf = NODEPTR(mp, 0);
17555d465952Smartinh 	}
17565d465952Smartinh 
17575d465952Smartinh 	cursor->initialized = 1;
17585d465952Smartinh 	cursor->eof = 0;
17595d465952Smartinh 
17605d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17615d465952Smartinh 		return BT_FAIL;
17625d465952Smartinh 
17635d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17645d465952Smartinh 		return BT_FAIL;
17655d465952Smartinh 	DPRINTF("==> cursor placed on key %.*s",
17665d465952Smartinh 	    (int)key->size, (char *)key->data);
17675d465952Smartinh 
17685d465952Smartinh 	return BT_SUCCESS;
17695d465952Smartinh }
17705d465952Smartinh 
17715d465952Smartinh static int
btree_cursor_first(struct cursor * cursor,struct btval * key,struct btval * data)17725d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
17735d465952Smartinh {
17745d465952Smartinh 	int		 rc;
17755d465952Smartinh 	struct mpage	*mp;
17765d465952Smartinh 	struct node	*leaf;
17775d465952Smartinh 
17785d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
17795d465952Smartinh 	if (rc != BT_SUCCESS)
17805d465952Smartinh 		return rc;
17815d465952Smartinh 	assert(IS_LEAF(mp));
17825d465952Smartinh 
17835d465952Smartinh 	leaf = NODEPTR(mp, 0);
17845d465952Smartinh 	cursor->initialized = 1;
17855d465952Smartinh 	cursor->eof = 0;
17865d465952Smartinh 
17875d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17885d465952Smartinh 		return BT_FAIL;
17895d465952Smartinh 
17905d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17915d465952Smartinh 		return BT_FAIL;
17925d465952Smartinh 
17935d465952Smartinh 	return BT_SUCCESS;
17945d465952Smartinh }
17955d465952Smartinh 
17965d465952Smartinh int
btree_cursor_get(struct cursor * cursor,struct btval * key,struct btval * data,enum cursor_op op)17975d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
17985d465952Smartinh     enum cursor_op op)
17995d465952Smartinh {
18005d465952Smartinh 	int		 rc;
18015cea9f8fSmartinh 	int		 exact = 0;
18025d465952Smartinh 
18035d465952Smartinh 	assert(cursor);
18045d465952Smartinh 
18055d465952Smartinh 	switch (op) {
18065d465952Smartinh 	case BT_CURSOR:
18075cea9f8fSmartinh 	case BT_CURSOR_EXACT:
18085d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
18095d465952Smartinh 			cursor_pop_page(cursor);
18105d465952Smartinh 		if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
18115d465952Smartinh 			errno = EINVAL;
18125d465952Smartinh 			rc = BT_FAIL;
18135cea9f8fSmartinh 		} else if (op == BT_CURSOR_EXACT)
18145cea9f8fSmartinh 			rc = btree_cursor_set(cursor, key, data, &exact);
18155cea9f8fSmartinh 		else
18165cea9f8fSmartinh 			rc = btree_cursor_set(cursor, key, data, NULL);
18175d465952Smartinh 		break;
18185d465952Smartinh 	case BT_NEXT:
18195d465952Smartinh 		if (!cursor->initialized)
18205d465952Smartinh 			rc = btree_cursor_first(cursor, key, data);
18215d465952Smartinh 		else
18225d465952Smartinh 			rc = btree_cursor_next(cursor, key, data);
18235d465952Smartinh 		break;
18245d465952Smartinh 	case BT_FIRST:
18255d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
18265d465952Smartinh 			cursor_pop_page(cursor);
18275d465952Smartinh 		rc = btree_cursor_first(cursor, key, data);
18285d465952Smartinh 		break;
18295d465952Smartinh 	default:
18305d465952Smartinh 		DPRINTF("unhandled/unimplemented cursor operation %u", op);
18315d465952Smartinh 		rc = BT_FAIL;
18325d465952Smartinh 		break;
18335d465952Smartinh 	}
18345d465952Smartinh 
18355d465952Smartinh 	mpage_prune(cursor->bt);
18365d465952Smartinh 
18375d465952Smartinh 	return rc;
18385d465952Smartinh }
18395d465952Smartinh 
18405d465952Smartinh static struct mpage *
btree_new_page(struct btree * bt,uint32_t flags)18415d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags)
18425d465952Smartinh {
18435d465952Smartinh 	struct mpage	*mp;
18445d465952Smartinh 
18455d465952Smartinh 	assert(bt != NULL);
18465d465952Smartinh 	assert(bt->txn != NULL);
18475d465952Smartinh 
184837ee45a1Smartinh 	DPRINTF("allocating new mpage %u, page size %u",
184937ee45a1Smartinh 	    bt->txn->next_pgno, bt->head.psize);
18505d465952Smartinh 	if ((mp = calloc(1, sizeof(*mp))) == NULL)
18515d465952Smartinh 		return NULL;
185237ee45a1Smartinh 	if ((mp->page = malloc(bt->head.psize)) == NULL) {
185337ee45a1Smartinh 		free(mp);
185437ee45a1Smartinh 		return NULL;
185537ee45a1Smartinh 	}
185637ee45a1Smartinh 	mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
185737ee45a1Smartinh 	mp->page->flags = flags;
185837ee45a1Smartinh 	mp->page->lower = PAGEHDRSZ;
185937ee45a1Smartinh 	mp->page->upper = bt->head.psize;
18605d465952Smartinh 
18615d465952Smartinh 	if (IS_BRANCH(mp))
186237ee45a1Smartinh 		bt->meta.branch_pages++;
18635d465952Smartinh 	else if (IS_LEAF(mp))
186437ee45a1Smartinh 		bt->meta.leaf_pages++;
18655d465952Smartinh 	else if (IS_OVERFLOW(mp))
186637ee45a1Smartinh 		bt->meta.overflow_pages++;
18675d465952Smartinh 
18685d465952Smartinh 	mpage_add(bt, mp);
18695d465952Smartinh 	mpage_dirty(bt, mp);
18705d465952Smartinh 
18715d465952Smartinh 	return mp;
18725d465952Smartinh }
18735d465952Smartinh 
18745d465952Smartinh static size_t
bt_leaf_size(struct btree * bt,struct btval * key,struct btval * data)187537ee45a1Smartinh bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data)
18765d465952Smartinh {
18775d465952Smartinh 	size_t		 sz;
18785d465952Smartinh 
18795d465952Smartinh 	sz = LEAFSIZE(key, data);
188037ee45a1Smartinh 	if (data->size >= bt->head.psize / BT_MINKEYS) {
18815d465952Smartinh 		/* put on overflow page */
18825d465952Smartinh 		sz -= data->size - sizeof(pgno_t);
18835d465952Smartinh 	}
18845d465952Smartinh 
18855d465952Smartinh 	return sz + sizeof(indx_t);
18865d465952Smartinh }
18875d465952Smartinh 
18885d465952Smartinh static size_t
bt_branch_size(struct btree * bt,struct btval * key)188937ee45a1Smartinh bt_branch_size(struct btree *bt, struct btval *key)
18905d465952Smartinh {
18915d465952Smartinh 	size_t		 sz;
18925d465952Smartinh 
18935d465952Smartinh 	sz = INDXSIZE(key);
189437ee45a1Smartinh 	if (sz >= bt->head.psize / BT_MINKEYS) {
18955d465952Smartinh 		/* put on overflow page */
18965d465952Smartinh 		/* not implemented */
18975d465952Smartinh 		/* sz -= key->size - sizeof(pgno_t); */
18985d465952Smartinh 	}
18995d465952Smartinh 
19005d465952Smartinh 	return sz + sizeof(indx_t);
19015d465952Smartinh }
19025d465952Smartinh 
19035d465952Smartinh static int
btree_write_overflow_data(struct btree * bt,struct page * p,struct btval * data)19045d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
19055d465952Smartinh {
19065d465952Smartinh 	size_t		 done = 0;
19075d465952Smartinh 	size_t		 sz;
19085d465952Smartinh 	size_t		 max;
19095d465952Smartinh 	pgno_t		*linkp;			/* linked page stored here */
19105d465952Smartinh 	struct mpage	*next = NULL;
19115d465952Smartinh 
191237ee45a1Smartinh 	max = bt->head.psize - PAGEHDRSZ;
19135d465952Smartinh 
19145d465952Smartinh 	while (done < data->size) {
19152c6c9175Smartinh 		if (next != NULL)
19162c6c9175Smartinh 			p = next->page;
19175d465952Smartinh 		linkp = &p->p_next_pgno;
19185d465952Smartinh 		if (data->size - done > max) {
19195d465952Smartinh 			/* need another overflow page */
19205d465952Smartinh 			if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)
19215d465952Smartinh 				return BT_FAIL;
19225d465952Smartinh 			*linkp = next->pgno;
19235d465952Smartinh 			DPRINTF("linking overflow page %u", next->pgno);
19245d465952Smartinh 		} else
19255d465952Smartinh 			*linkp = 0;		/* indicates end of list */
19265d465952Smartinh 		sz = data->size - done;
19275d465952Smartinh 		if (sz > max)
19285d465952Smartinh 			sz = max;
19295d465952Smartinh 		DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno);
19305d465952Smartinh 		bcopy((char *)data->data + done, p->ptrs, sz);
19315d465952Smartinh 		done += sz;
19325d465952Smartinh 	}
19335d465952Smartinh 
19345d465952Smartinh 	return BT_SUCCESS;
19355d465952Smartinh }
19365d465952Smartinh 
19375d465952Smartinh /* Key prefix should already be stripped.
19385d465952Smartinh  */
19395d465952Smartinh static int
btree_add_node(struct btree * bt,struct mpage * mp,indx_t indx,struct btval * key,struct btval * data,pgno_t pgno,uint8_t flags)19405d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
19415d465952Smartinh     struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
19425d465952Smartinh {
19435d465952Smartinh 	unsigned int	 i;
19445d465952Smartinh 	size_t		 node_size = NODESIZE;
19455d465952Smartinh 	indx_t		 ofs;
19465d465952Smartinh 	struct node	*node;
19475d465952Smartinh 	struct page	*p;
19485d465952Smartinh 	struct mpage	*ofp = NULL;		/* overflow page */
19495d465952Smartinh 
195037ee45a1Smartinh 	p = mp->page;
19515d465952Smartinh 	assert(p->upper >= p->lower);
19525d465952Smartinh 
195348318e0bSderaadt 	DPRINTF("add node [%.*s] to %s page %u at index %d, key size %zu",
19545d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL,
19555d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
195637ee45a1Smartinh 	    mp->pgno, indx, key ? key->size : 0);
19575d465952Smartinh 
19585d465952Smartinh 	if (key != NULL)
19595d465952Smartinh 		node_size += key->size;
19605d465952Smartinh 
19615d465952Smartinh 	if (IS_LEAF(mp)) {
19625d465952Smartinh 		assert(data);
19635d465952Smartinh 		node_size += data->size;
19645d465952Smartinh 		if (F_ISSET(flags, F_BIGDATA)) {
19655d465952Smartinh 			/* Data already on overflow page. */
19665d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
196737ee45a1Smartinh 		} else if (data->size >= bt->head.psize / BT_MINKEYS) {
19685d465952Smartinh 			/* Put data on overflow page. */
19695d465952Smartinh 			DPRINTF("data size is %zu, put on overflow page",
19705d465952Smartinh 			    data->size);
19715d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
19725d465952Smartinh 			if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
19735d465952Smartinh 				return BT_FAIL;
19745d465952Smartinh 			DPRINTF("allocated overflow page %u", ofp->pgno);
19755d465952Smartinh 			flags |= F_BIGDATA;
19765d465952Smartinh 		}
19775d465952Smartinh 	}
19785d465952Smartinh 
19795d465952Smartinh 	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
19805d465952Smartinh 		DPRINTF("not enough room in page %u, got %lu ptrs",
19815d465952Smartinh 		    mp->pgno, NUMKEYS(mp));
19825d465952Smartinh 		DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
19835d465952Smartinh 		    p->upper - p->lower);
198437ee45a1Smartinh 		DPRINTF("node size = %zu", node_size);
19855d465952Smartinh 		return BT_FAIL;
19865d465952Smartinh 	}
19875d465952Smartinh 
19885d465952Smartinh 	/* Move higher pointers up one slot. */
19895d465952Smartinh 	for (i = NUMKEYS(mp); i > indx; i--)
19905d465952Smartinh 		p->ptrs[i] = p->ptrs[i - 1];
19915d465952Smartinh 
19925d465952Smartinh 	/* Adjust free space offsets. */
19935d465952Smartinh 	ofs = p->upper - node_size;
19945d465952Smartinh 	assert(ofs >= p->lower + sizeof(indx_t));
19955d465952Smartinh 	p->ptrs[indx] = ofs;
19965d465952Smartinh 	p->upper = ofs;
19975d465952Smartinh 	p->lower += sizeof(indx_t);
19985d465952Smartinh 
19995d465952Smartinh 	/* Write the node data. */
20005d465952Smartinh 	node = NODEPTR(mp, indx);
20015d465952Smartinh 	node->ksize = (key == NULL) ? 0 : key->size;
20025d465952Smartinh 	node->flags = flags;
20035d465952Smartinh 	if (IS_LEAF(mp))
20045d465952Smartinh 		node->n_dsize = data->size;
20055d465952Smartinh 	else
20065d465952Smartinh 		node->n_pgno = pgno;
20075d465952Smartinh 
20085d465952Smartinh 	if (key)
20095d465952Smartinh 		bcopy(key->data, NODEKEY(node), key->size);
20105d465952Smartinh 
20115d465952Smartinh 	if (IS_LEAF(mp)) {
20122c6c9175Smartinh 		assert(key);
20135d465952Smartinh 		if (ofp == NULL) {
20145d465952Smartinh 			if (F_ISSET(flags, F_BIGDATA))
20155d465952Smartinh 				bcopy(data->data, node->data + key->size,
20165d465952Smartinh 				    sizeof(pgno_t));
20175d465952Smartinh 			else
20185d465952Smartinh 				bcopy(data->data, node->data + key->size,
20195d465952Smartinh 				    data->size);
20205d465952Smartinh 		} else {
20215d465952Smartinh 			bcopy(&ofp->pgno, node->data + key->size,
20225d465952Smartinh 			    sizeof(pgno_t));
202337ee45a1Smartinh 			if (btree_write_overflow_data(bt, ofp->page,
20245d465952Smartinh 			    data) == BT_FAIL)
20255d465952Smartinh 				return BT_FAIL;
20265d465952Smartinh 		}
20275d465952Smartinh 	}
20285d465952Smartinh 
20295d465952Smartinh 	return BT_SUCCESS;
20305d465952Smartinh }
20315d465952Smartinh 
20325d465952Smartinh static void
btree_del_node(struct btree * bt,struct mpage * mp,indx_t indx)20335d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx)
20345d465952Smartinh {
20355d465952Smartinh 	unsigned int	 sz;
20365d465952Smartinh 	indx_t		 i, j, numkeys, ptr;
20375d465952Smartinh 	struct node	*node;
20385d465952Smartinh 	char		*base;
20395d465952Smartinh 
20405d465952Smartinh 	DPRINTF("delete node %u on %s page %u", indx,
20415d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);
20425d465952Smartinh 	assert(indx < NUMKEYS(mp));
20435d465952Smartinh 
20445d465952Smartinh 	node = NODEPTR(mp, indx);
20455d465952Smartinh 	sz = NODESIZE + node->ksize;
20465d465952Smartinh 	if (IS_LEAF(mp)) {
20475d465952Smartinh 		if (F_ISSET(node->flags, F_BIGDATA))
20485d465952Smartinh 			sz += sizeof(pgno_t);
20495d465952Smartinh 		else
20505d465952Smartinh 			sz += NODEDSZ(node);
20515d465952Smartinh 	}
20525d465952Smartinh 
205337ee45a1Smartinh 	ptr = mp->page->ptrs[indx];
20545d465952Smartinh 	numkeys = NUMKEYS(mp);
20555d465952Smartinh 	for (i = j = 0; i < numkeys; i++) {
20565d465952Smartinh 		if (i != indx) {
205737ee45a1Smartinh 			mp->page->ptrs[j] = mp->page->ptrs[i];
205837ee45a1Smartinh 			if (mp->page->ptrs[i] < ptr)
205937ee45a1Smartinh 				mp->page->ptrs[j] += sz;
20605d465952Smartinh 			j++;
20615d465952Smartinh 		}
20625d465952Smartinh 	}
20635d465952Smartinh 
206437ee45a1Smartinh 	base = (char *)mp->page + mp->page->upper;
206537ee45a1Smartinh 	bcopy(base, base + sz, ptr - mp->page->upper);
20665d465952Smartinh 
206737ee45a1Smartinh 	mp->page->lower -= sizeof(indx_t);
206837ee45a1Smartinh 	mp->page->upper += sz;
20695d465952Smartinh }
20705d465952Smartinh 
20715d465952Smartinh struct cursor *
btree_txn_cursor_open(struct btree * bt,struct btree_txn * txn)20725d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
20735d465952Smartinh {
20745d465952Smartinh 	struct cursor	*cursor;
20755d465952Smartinh 
207659916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
207759916c99Smartinh 		errno = EINVAL;
207859916c99Smartinh 		return NULL;
207959916c99Smartinh 	}
208059916c99Smartinh 
208159916c99Smartinh 	if (bt == NULL) {
208259916c99Smartinh 		if (txn == NULL) {
208359916c99Smartinh 			errno = EINVAL;
208459916c99Smartinh 			return NULL;
208559916c99Smartinh 		}
208659916c99Smartinh 		bt = txn->bt;
208759916c99Smartinh 	}
208859916c99Smartinh 
20895d465952Smartinh 	if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
20905d465952Smartinh 		SLIST_INIT(&cursor->stack);
20915d465952Smartinh 		cursor->bt = bt;
20925d465952Smartinh 		cursor->txn = txn;
2093408be415Smartinh 		btree_ref(bt);
20945d465952Smartinh 	}
20955d465952Smartinh 
20965d465952Smartinh 	return cursor;
20975d465952Smartinh }
20985d465952Smartinh 
20995d465952Smartinh void
btree_cursor_close(struct cursor * cursor)21005d465952Smartinh btree_cursor_close(struct cursor *cursor)
21015d465952Smartinh {
21025d465952Smartinh 	if (cursor != NULL) {
21035d465952Smartinh 		while (!CURSOR_EMPTY(cursor))
21045d465952Smartinh 			cursor_pop_page(cursor);
21055d465952Smartinh 
21065d465952Smartinh 		btree_close(cursor->bt);
21075d465952Smartinh 		free(cursor);
21085d465952Smartinh 	}
21095d465952Smartinh }
21105d465952Smartinh 
21115d465952Smartinh static int
btree_update_key(struct btree * bt,struct mpage * mp,indx_t indx,struct btval * key)21125d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,
21135d465952Smartinh     struct btval *key)
21145d465952Smartinh {
21155d465952Smartinh 	indx_t			 ptr, i, numkeys;
21165d465952Smartinh 	int			 delta;
21175d465952Smartinh 	size_t			 len;
21185d465952Smartinh 	struct node		*node;
21195d465952Smartinh 	char			*base;
21205d465952Smartinh 
21215d465952Smartinh 	node = NODEPTR(mp, indx);
212237ee45a1Smartinh 	ptr = mp->page->ptrs[indx];
21235d465952Smartinh 	DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
21245d465952Smartinh 	    indx, ptr,
21255d465952Smartinh 	    (int)node->ksize, (char *)NODEKEY(node),
21265d465952Smartinh 	    (int)key->size, (char *)key->data,
21275d465952Smartinh 	    mp->pgno);
21285d465952Smartinh 
21295d465952Smartinh 	if (key->size != node->ksize) {
21305d465952Smartinh 		delta = key->size - node->ksize;
21315d465952Smartinh 		if (delta > 0 && SIZELEFT(mp) < delta) {
21325d465952Smartinh 			DPRINTF("OUCH! Not enough room, delta = %d", delta);
21335d465952Smartinh 			return BT_FAIL;
21345d465952Smartinh 		}
21355d465952Smartinh 
21365d465952Smartinh 		numkeys = NUMKEYS(mp);
21375d465952Smartinh 		for (i = 0; i < numkeys; i++) {
213837ee45a1Smartinh 			if (mp->page->ptrs[i] <= ptr)
213937ee45a1Smartinh 				mp->page->ptrs[i] -= delta;
21405d465952Smartinh 		}
21415d465952Smartinh 
214237ee45a1Smartinh 		base = (char *)mp->page + mp->page->upper;
214337ee45a1Smartinh 		len = ptr - mp->page->upper + NODESIZE;
21445d465952Smartinh 		bcopy(base, base - delta, len);
214537ee45a1Smartinh 		mp->page->upper -= delta;
21465d465952Smartinh 
21475d465952Smartinh 		node = NODEPTR(mp, indx);
21485d465952Smartinh 		node->ksize = key->size;
21495d465952Smartinh 	}
21505d465952Smartinh 
21515d465952Smartinh 	bcopy(key->data, NODEKEY(node), key->size);
21525d465952Smartinh 
21535d465952Smartinh 	return BT_SUCCESS;
21545d465952Smartinh }
21555d465952Smartinh 
21565d465952Smartinh static int
btree_adjust_prefix(struct btree * bt,struct mpage * src,int delta)21575d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)
21585d465952Smartinh {
21595d465952Smartinh 	indx_t		 i;
21605d465952Smartinh 	struct node	*node;
21615d465952Smartinh 	struct btkey	 tmpkey;
21625d465952Smartinh 	struct btval	 key;
21635d465952Smartinh 
21645d465952Smartinh 	DPRINTF("adjusting prefix lengths on page %u with delta %d",
21655d465952Smartinh 	    src->pgno, delta);
21665d465952Smartinh 	assert(delta != 0);
21675d465952Smartinh 
21685d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
21695d465952Smartinh 		node = NODEPTR(src, i);
21705d465952Smartinh 		tmpkey.len = node->ksize - delta;
21715d465952Smartinh 		if (delta > 0) {
21725d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY))
21735d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);
21745d465952Smartinh 			else
21755d465952Smartinh 				bcopy((char *)NODEKEY(node) + delta, tmpkey.str,
21765d465952Smartinh 				    tmpkey.len);
21775d465952Smartinh 		} else {
21785d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
21795d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, node->ksize);
21805d465952Smartinh 				bcopy(src->prefix.str, tmpkey.str + node->ksize,
21815d465952Smartinh 				    -delta);
21825d465952Smartinh 			} else {
21835d465952Smartinh 				bcopy(src->prefix.str + src->prefix.len + delta,
21845d465952Smartinh 				    tmpkey.str, -delta);
21855d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str - delta,
21865d465952Smartinh 				    node->ksize);
21875d465952Smartinh 			}
21885d465952Smartinh 		}
21895d465952Smartinh 		key.size = tmpkey.len;
21905d465952Smartinh 		key.data = tmpkey.str;
21915d465952Smartinh 		if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)
21925d465952Smartinh 			return BT_FAIL;
21935d465952Smartinh 	}
21945d465952Smartinh 
21955d465952Smartinh 	return BT_SUCCESS;
21965d465952Smartinh }
21975d465952Smartinh 
21985d465952Smartinh /* Move a node from src to dst.
21995d465952Smartinh  */
22005d465952Smartinh static int
btree_move_node(struct btree * bt,struct mpage * src,indx_t srcindx,struct mpage * dst,indx_t dstindx)22015d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,
22025d465952Smartinh     struct mpage *dst, indx_t dstindx)
22035d465952Smartinh {
22045d465952Smartinh 	int			 rc;
22055d465952Smartinh 	unsigned int		 pfxlen, mp_pfxlen = 0;
22061c05783eSmartinh 	struct node		*srcnode;
220737ee45a1Smartinh 	struct mpage		*mp = NULL;
22085d465952Smartinh 	struct btkey		 tmpkey, srckey;
22095d465952Smartinh 	struct btval		 key, data;
22105d465952Smartinh 
22115d465952Smartinh 	assert(src->parent);
22125d465952Smartinh 	assert(dst->parent);
22135d465952Smartinh 
22145d465952Smartinh 	srcnode = NODEPTR(src, srcindx);
22155d465952Smartinh 	DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u",
22165d465952Smartinh 	    IS_LEAF(src) ? "leaf" : "branch",
22175d465952Smartinh 	    srcindx,
22185d465952Smartinh 	    (int)srcnode->ksize, (char *)NODEKEY(srcnode),
22195d465952Smartinh 	    src->pgno,
22205d465952Smartinh 	    dstindx, dst->pgno);
22215d465952Smartinh 
22229074208fSmartinh 	find_common_prefix(bt, src);
22239074208fSmartinh 
22245d465952Smartinh 	if (IS_BRANCH(src)) {
22255d465952Smartinh 		/* Need to check if the page the moved node points to
22265d465952Smartinh 		 * changes prefix.
22275d465952Smartinh 		 */
222837ee45a1Smartinh 		if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL)
222937ee45a1Smartinh 			return BT_FAIL;
22305d465952Smartinh 		mp->parent = src;
22315d465952Smartinh 		mp->parent_index = srcindx;
22325d465952Smartinh 		find_common_prefix(bt, mp);
22335d465952Smartinh 		mp_pfxlen = mp->prefix.len;
22345d465952Smartinh 	}
22355d465952Smartinh 
22365d465952Smartinh 	/* Mark src and dst as dirty. */
22377f8a466aSmartinh 	if ((src = mpage_touch(bt, src)) == NULL ||
22387f8a466aSmartinh 	    (dst = mpage_touch(bt, dst)) == NULL)
22397f8a466aSmartinh 		return BT_FAIL;
22405d465952Smartinh 
22415d465952Smartinh 	find_common_prefix(bt, dst);
22425d465952Smartinh 
22435d465952Smartinh 	/* Check if src node has destination page prefix. Otherwise the
22445d465952Smartinh 	 * destination page must expand its prefix on all its nodes.
22455d465952Smartinh 	 */
22465d465952Smartinh 	srckey.len = srcnode->ksize;
22475d465952Smartinh 	bcopy(NODEKEY(srcnode), srckey.str, srckey.len);
22485d465952Smartinh 	common_prefix(bt, &srckey, &dst->prefix, &tmpkey);
22495d465952Smartinh 	if (tmpkey.len != dst->prefix.len) {
22505d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
22515d465952Smartinh 		    tmpkey.len - dst->prefix.len) != BT_SUCCESS)
22525d465952Smartinh 			return BT_FAIL;
22535d465952Smartinh 		bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));
22545d465952Smartinh 	}
22555d465952Smartinh 
22565d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
22575d465952Smartinh 		struct mpage	*low;
22585d465952Smartinh 
22595d465952Smartinh 		/* must find the lowest key below src
22605d465952Smartinh 		 */
22615d465952Smartinh 		assert(btree_search_page_root(bt, src, NULL, NULL, 0,
22625d465952Smartinh 		    &low) == BT_SUCCESS);
22635d465952Smartinh 		expand_prefix(bt, low, 0, &srckey);
22645d465952Smartinh 		DPRINTF("found lowest key [%.*s] on leaf page %u",
22655d465952Smartinh 		    (int)srckey.len, srckey.str, low->pgno);
22665d465952Smartinh 	} else {
22675d465952Smartinh 		srckey.len = srcnode->ksize;
22685d465952Smartinh 		bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);
22695d465952Smartinh 	}
22705d465952Smartinh 	find_common_prefix(bt, src);
22715d465952Smartinh 
22725d465952Smartinh 	/* expand the prefix */
22735d465952Smartinh 	tmpkey.len = sizeof(tmpkey.str);
22745d465952Smartinh 	concat_prefix(bt, src->prefix.str, src->prefix.len,
22755d465952Smartinh 	    srckey.str, srckey.len, tmpkey.str, &tmpkey.len);
22765d465952Smartinh 
22775d465952Smartinh 	/* Add the node to the destination page. Adjust prefix for
22785d465952Smartinh 	 * destination page.
22795d465952Smartinh 	 */
22805d465952Smartinh 	key.size = tmpkey.len;
22815d465952Smartinh 	key.data = tmpkey.str;
22825d465952Smartinh 	remove_prefix(bt, &key, dst->prefix.len);
22835d465952Smartinh 	data.size = NODEDSZ(srcnode);
22845d465952Smartinh 	data.data = NODEDATA(srcnode);
22855d465952Smartinh 	rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),
22865d465952Smartinh 	    srcnode->flags);
22875d465952Smartinh 	if (rc != BT_SUCCESS)
22885d465952Smartinh 		return rc;
22895d465952Smartinh 
22905d465952Smartinh 	/* Delete the node from the source page.
22915d465952Smartinh 	 */
22925d465952Smartinh 	btree_del_node(bt, src, srcindx);
22935d465952Smartinh 
22945d465952Smartinh 	/* Update the parent separators.
22955d465952Smartinh 	 */
22965d465952Smartinh 	if (srcindx == 0 && src->parent_index != 0) {
22975d465952Smartinh 		expand_prefix(bt, src, 0, &tmpkey);
22985d465952Smartinh 		key.size = tmpkey.len;
22995d465952Smartinh 		key.data = tmpkey.str;
23005d465952Smartinh 		remove_prefix(bt, &key, src->parent->prefix.len);
23015d465952Smartinh 
23025d465952Smartinh 		DPRINTF("update separator for source page %u to [%.*s]",
23035d465952Smartinh 		    src->pgno, (int)key.size, (char *)key.data);
23045d465952Smartinh 		if (btree_update_key(bt, src->parent, src->parent_index,
23055d465952Smartinh 		    &key) != BT_SUCCESS)
23065d465952Smartinh 			return BT_FAIL;
23075d465952Smartinh 	}
23085d465952Smartinh 
23095d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
23105d465952Smartinh 		struct btval	 nullkey;
23115d465952Smartinh 		nullkey.size = 0;
23125d465952Smartinh 		assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);
23135d465952Smartinh 	}
23145d465952Smartinh 
23155d465952Smartinh 	if (dstindx == 0 && dst->parent_index != 0) {
23165d465952Smartinh 		expand_prefix(bt, dst, 0, &tmpkey);
23175d465952Smartinh 		key.size = tmpkey.len;
23185d465952Smartinh 		key.data = tmpkey.str;
23195d465952Smartinh 		remove_prefix(bt, &key, dst->parent->prefix.len);
23205d465952Smartinh 
23215d465952Smartinh 		DPRINTF("update separator for destination page %u to [%.*s]",
23225d465952Smartinh 		    dst->pgno, (int)key.size, (char *)key.data);
23235d465952Smartinh 		if (btree_update_key(bt, dst->parent, dst->parent_index,
23245d465952Smartinh 		    &key) != BT_SUCCESS)
23255d465952Smartinh 			return BT_FAIL;
23265d465952Smartinh 	}
23275d465952Smartinh 
23285d465952Smartinh 	if (dstindx == 0 && IS_BRANCH(dst)) {
23295d465952Smartinh 		struct btval	 nullkey;
23305d465952Smartinh 		nullkey.size = 0;
23315d465952Smartinh 		assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);
23325d465952Smartinh 	}
23335d465952Smartinh 
23345d465952Smartinh 	/* We can get a new page prefix here!
23355d465952Smartinh 	 * Must update keys in all nodes of this page!
23365d465952Smartinh 	 */
23375d465952Smartinh 	pfxlen = src->prefix.len;
23385d465952Smartinh 	find_common_prefix(bt, src);
23395d465952Smartinh 	if (src->prefix.len != pfxlen) {
23405d465952Smartinh 		if (btree_adjust_prefix(bt, src,
23415d465952Smartinh 		    src->prefix.len - pfxlen) != BT_SUCCESS)
23425d465952Smartinh 			return BT_FAIL;
23435d465952Smartinh 	}
23445d465952Smartinh 
23455d465952Smartinh 	pfxlen = dst->prefix.len;
23465d465952Smartinh 	find_common_prefix(bt, dst);
23475d465952Smartinh 	if (dst->prefix.len != pfxlen) {
23485d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
23495d465952Smartinh 		    dst->prefix.len - pfxlen) != BT_SUCCESS)
23505d465952Smartinh 			return BT_FAIL;
23515d465952Smartinh 	}
23525d465952Smartinh 
23535d465952Smartinh 	if (IS_BRANCH(dst)) {
23542c6c9175Smartinh 		assert(mp);
23555d465952Smartinh 		mp->parent = dst;
23565d465952Smartinh 		mp->parent_index = dstindx;
23575d465952Smartinh 		find_common_prefix(bt, mp);
23585d465952Smartinh 		if (mp->prefix.len != mp_pfxlen) {
23595d465952Smartinh 			DPRINTF("moved branch node has changed prefix");
23607f8a466aSmartinh 			if ((mp = mpage_touch(bt, mp)) == NULL)
23617f8a466aSmartinh 				return BT_FAIL;
23625d465952Smartinh 			if (btree_adjust_prefix(bt, mp,
23635d465952Smartinh 			    mp->prefix.len - mp_pfxlen) != BT_SUCCESS)
23645d465952Smartinh 				return BT_FAIL;
23655d465952Smartinh 		}
23665d465952Smartinh 	}
23675d465952Smartinh 
23685d465952Smartinh 	return BT_SUCCESS;
23695d465952Smartinh }
23705d465952Smartinh 
23715d465952Smartinh static int
btree_merge(struct btree * bt,struct mpage * src,struct mpage * dst)23725d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)
23735d465952Smartinh {
23745d465952Smartinh 	int			 rc;
23755d465952Smartinh 	indx_t			 i;
2376a33d639aSmartinh 	unsigned int		 pfxlen;
23775d465952Smartinh 	struct node		*srcnode;
23785d465952Smartinh 	struct btkey		 tmpkey, dstpfx;
23795d465952Smartinh 	struct btval		 key, data;
23805d465952Smartinh 
23815d465952Smartinh 	DPRINTF("merging page %u and %u", src->pgno, dst->pgno);
23825d465952Smartinh 
23835d465952Smartinh 	assert(src->parent);	/* can't merge root page */
23845d465952Smartinh 	assert(dst->parent);
23855d465952Smartinh 	assert(bt->txn != NULL);
23865d465952Smartinh 
23875d465952Smartinh 	/* Mark src and dst as dirty. */
23887f8a466aSmartinh 	if ((src = mpage_touch(bt, src)) == NULL ||
23897f8a466aSmartinh 	    (dst = mpage_touch(bt, dst)) == NULL)
23907f8a466aSmartinh 		return BT_FAIL;
23915d465952Smartinh 
23925d465952Smartinh 	find_common_prefix(bt, src);
23935d465952Smartinh 	find_common_prefix(bt, dst);
23945d465952Smartinh 
23955d465952Smartinh 	/* Check if source nodes has destination page prefix. Otherwise
23965d465952Smartinh 	 * the destination page must expand its prefix on all its nodes.
23975d465952Smartinh 	 */
23985d465952Smartinh 	common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);
23995d465952Smartinh 	if (dstpfx.len != dst->prefix.len) {
24005d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
24015d465952Smartinh 		    dstpfx.len - dst->prefix.len) != BT_SUCCESS)
24025d465952Smartinh 			return BT_FAIL;
24035d465952Smartinh 		bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));
24045d465952Smartinh 	}
24055d465952Smartinh 
24065d465952Smartinh 	/* Move all nodes from src to dst.
24075d465952Smartinh 	 */
24085d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
24095d465952Smartinh 		srcnode = NODEPTR(src, i);
24105d465952Smartinh 
24115d465952Smartinh 		/* If branch node 0 (implicit key), find the real key.
24125d465952Smartinh 		 */
24135d465952Smartinh 		if (i == 0 && IS_BRANCH(src)) {
24145d465952Smartinh 			struct mpage	*low;
24155d465952Smartinh 
24165d465952Smartinh 			/* must find the lowest key below src
24175d465952Smartinh 			 */
24185d465952Smartinh 			assert(btree_search_page_root(bt, src, NULL, NULL, 0,
24195d465952Smartinh 			    &low) == BT_SUCCESS);
24205d465952Smartinh 			expand_prefix(bt, low, 0, &tmpkey);
24215d465952Smartinh 			DPRINTF("found lowest key [%.*s] on leaf page %u",
24225d465952Smartinh 			    (int)tmpkey.len, tmpkey.str, low->pgno);
24235d465952Smartinh 		} else {
24245d465952Smartinh 			expand_prefix(bt, src, i, &tmpkey);
24255d465952Smartinh 		}
24265d465952Smartinh 
24275d465952Smartinh 		key.size = tmpkey.len;
24285d465952Smartinh 		key.data = tmpkey.str;
24295d465952Smartinh 
24305d465952Smartinh 		remove_prefix(bt, &key, dst->prefix.len);
24315d465952Smartinh 		data.size = NODEDSZ(srcnode);
24325d465952Smartinh 		data.data = NODEDATA(srcnode);
24335d465952Smartinh 		rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,
24345d465952Smartinh 		    &data, NODEPGNO(srcnode), srcnode->flags);
24355d465952Smartinh 		if (rc != BT_SUCCESS)
24365d465952Smartinh 			return rc;
24375d465952Smartinh 	}
24385d465952Smartinh 
24395d465952Smartinh 	DPRINTF("dst page %u now has %lu keys (%.1f%% filled)",
2440944ad5d4Smartinh 	    dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10);
24415d465952Smartinh 
24425d465952Smartinh 	/* Unlink the src page from parent.
24435d465952Smartinh 	 */
24445d465952Smartinh 	btree_del_node(bt, src->parent, src->parent_index);
24455d465952Smartinh 	if (src->parent_index == 0) {
24465d465952Smartinh 		key.size = 0;
24475d465952Smartinh 		if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)
24485d465952Smartinh 			return BT_FAIL;
24495d465952Smartinh 
2450a33d639aSmartinh 		pfxlen = src->prefix.len;
24515d465952Smartinh 		find_common_prefix(bt, src);
24525d465952Smartinh 		assert (src->prefix.len == pfxlen);
24535d465952Smartinh 	}
24545d465952Smartinh 
24555d465952Smartinh 	if (IS_LEAF(src))
245637ee45a1Smartinh 		bt->meta.leaf_pages--;
24575d465952Smartinh 	else
245837ee45a1Smartinh 		bt->meta.branch_pages--;
24595d465952Smartinh 
24605d465952Smartinh 	return btree_rebalance(bt, src->parent);
24615d465952Smartinh }
24625d465952Smartinh 
2463944ad5d4Smartinh #define FILL_THRESHOLD	 250
24645d465952Smartinh 
24655d465952Smartinh static int
btree_rebalance(struct btree * bt,struct mpage * mp)24665d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp)
24675d465952Smartinh {
246837ee45a1Smartinh 	struct node	*node;
24695d465952Smartinh 	struct mpage	*parent;
247037ee45a1Smartinh 	struct mpage	*root;
24715d465952Smartinh 	struct mpage	*neighbor = NULL;
247237ee45a1Smartinh 	indx_t		 si = 0, di = 0;
24735d465952Smartinh 
24745d465952Smartinh 	assert(bt != NULL);
24755d465952Smartinh 	assert(bt->txn != NULL);
24765d465952Smartinh 	assert(mp != NULL);
24775d465952Smartinh 
24785d465952Smartinh 	DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)",
24795d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
2480944ad5d4Smartinh 	    mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10);
24815d465952Smartinh 
248237ee45a1Smartinh 	if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) {
24835d465952Smartinh 		DPRINTF("no need to rebalance page %u, above fill threshold",
24845d465952Smartinh 		    mp->pgno);
24855d465952Smartinh 		return BT_SUCCESS;
24865d465952Smartinh 	}
24875d465952Smartinh 
24885d465952Smartinh 	parent = mp->parent;
24895d465952Smartinh 
24905d465952Smartinh 	if (parent == NULL) {
24915d465952Smartinh 		if (NUMKEYS(mp) == 0) {
24925d465952Smartinh 			DPRINTF("tree is completely empty");
24935d465952Smartinh 			bt->txn->root = P_INVALID;
249437ee45a1Smartinh 			bt->meta.depth--;
249537ee45a1Smartinh 			bt->meta.leaf_pages--;
24965d465952Smartinh 		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
24975d465952Smartinh 			DPRINTF("collapsing root page!");
24985d465952Smartinh 			bt->txn->root = NODEPGNO(NODEPTR(mp, 0));
249937ee45a1Smartinh 			if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL)
250037ee45a1Smartinh 				return BT_FAIL;
250137ee45a1Smartinh 			root->parent = NULL;
250237ee45a1Smartinh 			bt->meta.depth--;
250337ee45a1Smartinh 			bt->meta.branch_pages--;
25045d465952Smartinh 		} else
25055d465952Smartinh 			DPRINTF("root page doesn't need rebalancing");
25065d465952Smartinh 		return BT_SUCCESS;
25075d465952Smartinh 	}
25085d465952Smartinh 
25095d465952Smartinh 	/* The parent (branch page) must have at least 2 pointers,
25105d465952Smartinh 	 * otherwise the tree is invalid.
25115d465952Smartinh 	 */
25125d465952Smartinh 	assert(NUMKEYS(parent) > 1);
25135d465952Smartinh 
25145d465952Smartinh 	/* Leaf page fill factor is below the threshold.
25155d465952Smartinh 	 * Try to move keys from left or right neighbor, or
25165d465952Smartinh 	 * merge with a neighbor page.
25175d465952Smartinh 	 */
25185d465952Smartinh 
25195d465952Smartinh 	/* Find neighbors.
25205d465952Smartinh 	 */
25215d465952Smartinh 	if (mp->parent_index == 0) {
25225d465952Smartinh 		/* We're the leftmost leaf in our parent.
25235d465952Smartinh 		 */
25245d465952Smartinh 		DPRINTF("reading right neighbor");
252537ee45a1Smartinh 		node = NODEPTR(parent, mp->parent_index + 1);
252637ee45a1Smartinh 		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25275d465952Smartinh 			return BT_FAIL;
25285d465952Smartinh 		neighbor->parent_index = mp->parent_index + 1;
25295d465952Smartinh 		si = 0;
25305d465952Smartinh 		di = NUMKEYS(mp);
25315d465952Smartinh 	} else {
25325d465952Smartinh 		/* There is at least one neighbor to the left.
25335d465952Smartinh 		 */
25345d465952Smartinh 		DPRINTF("reading left neighbor");
253537ee45a1Smartinh 		node = NODEPTR(parent, mp->parent_index - 1);
253637ee45a1Smartinh 		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25375d465952Smartinh 			return BT_FAIL;
25385d465952Smartinh 		neighbor->parent_index = mp->parent_index - 1;
25395d465952Smartinh 		si = NUMKEYS(neighbor) - 1;
25405d465952Smartinh 		di = 0;
25415d465952Smartinh 	}
25425d465952Smartinh 	neighbor->parent = parent;
25435d465952Smartinh 
25445d465952Smartinh 	DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
2545944ad5d4Smartinh 	    neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10);
25465d465952Smartinh 
25475d465952Smartinh 	/* If the neighbor page is above threshold and has at least two
25485d465952Smartinh 	 * keys, move one key from it.
25495d465952Smartinh 	 *
25505d465952Smartinh 	 * Otherwise we should try to merge them, but that might not be
25515d465952Smartinh 	 * possible, even if both are below threshold, as prefix expansion
25525d465952Smartinh 	 * might make keys larger. FIXME: detect this
25535d465952Smartinh 	 */
2554a73962c3Smartinh 	if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2)
25555d465952Smartinh 		return btree_move_node(bt, neighbor, si, mp, di);
25565d465952Smartinh 	else { /* FIXME: if (has_enough_room()) */
25575d465952Smartinh 		if (mp->parent_index == 0)
25585d465952Smartinh 			return btree_merge(bt, neighbor, mp);
25595d465952Smartinh 		else
25605d465952Smartinh 			return btree_merge(bt, mp, neighbor);
25615d465952Smartinh 	}
25625d465952Smartinh }
25635d465952Smartinh 
25645d465952Smartinh int
btree_txn_del(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data)25655d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn,
25665d465952Smartinh     struct btval *key, struct btval *data)
25675d465952Smartinh {
25685d465952Smartinh 	int		 rc, exact, close_txn = 0;
25695d465952Smartinh 	unsigned int	 ki;
25705d465952Smartinh 	struct node	*leaf;
25715d465952Smartinh 	struct mpage	*mp;
25725d465952Smartinh 
25735d465952Smartinh 	DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);
25745d465952Smartinh 
25755d465952Smartinh 	assert(key != NULL);
25765d465952Smartinh 
257759916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
257859916c99Smartinh 		errno = EINVAL;
257959916c99Smartinh 		return BT_FAIL;
258059916c99Smartinh 	}
258159916c99Smartinh 
25827e04cf1eSmartinh 	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
25837e04cf1eSmartinh 		errno = EINVAL;
25847e04cf1eSmartinh 		return BT_FAIL;
25857e04cf1eSmartinh 	}
25867e04cf1eSmartinh 
258759916c99Smartinh 	if (bt == NULL) {
258859916c99Smartinh 		if (txn == NULL) {
258959916c99Smartinh 			errno = EINVAL;
259059916c99Smartinh 			return BT_FAIL;
259159916c99Smartinh 		}
259259916c99Smartinh 		bt = txn->bt;
259359916c99Smartinh 	}
259459916c99Smartinh 
25955d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
25965d465952Smartinh 		errno = EINVAL;
25975d465952Smartinh 		return BT_FAIL;
25985d465952Smartinh 	}
25995d465952Smartinh 
26005d465952Smartinh 	if (txn == NULL) {
26015d465952Smartinh 		close_txn = 1;
26025d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
26035d465952Smartinh 			return BT_FAIL;
26045d465952Smartinh 	}
26055d465952Smartinh 
26065d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)
26075d465952Smartinh 		goto done;
26085d465952Smartinh 
26095d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, &ki);
26105d465952Smartinh 	if (leaf == NULL || !exact) {
26110279e526Smartinh 		errno = ENOENT;
26120279e526Smartinh 		rc = BT_FAIL;
26135d465952Smartinh 		goto done;
26145d465952Smartinh 	}
26155d465952Smartinh 
26165d465952Smartinh 	if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
26175d465952Smartinh 		goto done;
26185d465952Smartinh 
26195d465952Smartinh 	btree_del_node(bt, mp, ki);
262037ee45a1Smartinh 	bt->meta.entries--;
26215d465952Smartinh 	rc = btree_rebalance(bt, mp);
26225d465952Smartinh 	if (rc != BT_SUCCESS)
26235d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
26245d465952Smartinh 
26255d465952Smartinh done:
26265d465952Smartinh 	if (close_txn) {
26275d465952Smartinh 		if (rc == BT_SUCCESS)
26285d465952Smartinh 			rc = btree_txn_commit(txn);
26295d465952Smartinh 		else
26305d465952Smartinh 			btree_txn_abort(txn);
26315d465952Smartinh 	}
26325d465952Smartinh 	mpage_prune(bt);
26335d465952Smartinh 	return rc;
26345d465952Smartinh }
26355d465952Smartinh 
26365d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that
26375d465952Smartinh  * still makes it uniquely distinguishable from <min>.
26385d465952Smartinh  *
26395d465952Smartinh  * <min> is guaranteed to be sorted less than <sep>
26405d465952Smartinh  *
26415d465952Smartinh  * On return, <sep> is modified to the minimum length.
26425d465952Smartinh  */
26435d465952Smartinh static void
bt_reduce_separator(struct btree * bt,struct node * min,struct btval * sep)26445d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
26455d465952Smartinh {
26465d465952Smartinh 	size_t		 n = 0;
26475d465952Smartinh 	char		*p1;
26485d465952Smartinh 	char		*p2;
26495d465952Smartinh 
26505d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
26515d465952Smartinh 
26525d465952Smartinh 		assert(sep->size > 0);
26535d465952Smartinh 
26545d465952Smartinh 		p1 = (char *)NODEKEY(min) + min->ksize - 1;
26555d465952Smartinh 		p2 = (char *)sep->data + sep->size - 1;
26565d465952Smartinh 
26575d465952Smartinh 		while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
26585d465952Smartinh 			assert(p2 > (char *)sep->data);
26595d465952Smartinh 			p1--;
26605d465952Smartinh 			p2--;
26615d465952Smartinh 			n++;
26625d465952Smartinh 		}
26635d465952Smartinh 
26645d465952Smartinh 		sep->data = p2;
26655d465952Smartinh 		sep->size = n + 1;
26665d465952Smartinh 	} else {
26675d465952Smartinh 
26685d465952Smartinh 		assert(min->ksize > 0);
26695d465952Smartinh 		assert(sep->size > 0);
26705d465952Smartinh 
26715d465952Smartinh 		p1 = (char *)NODEKEY(min);
26725d465952Smartinh 		p2 = (char *)sep->data;
26735d465952Smartinh 
26745d465952Smartinh 		while (*p1 == *p2) {
26755d465952Smartinh 			p1++;
26765d465952Smartinh 			p2++;
26775d465952Smartinh 			n++;
26785d465952Smartinh 			if (n == min->ksize || n == sep->size)
26795d465952Smartinh 				break;
26805d465952Smartinh 		}
26815d465952Smartinh 
26825d465952Smartinh 		sep->size = n + 1;
26835d465952Smartinh 	}
26845d465952Smartinh 
26855d465952Smartinh 	DPRINTF("reduced separator to [%.*s] > [%.*s]",
26865d465952Smartinh 	    (int)sep->size, (char *)sep->data,
26875d465952Smartinh 	    (int)min->ksize, (char *)NODEKEY(min));
26885d465952Smartinh }
26895d465952Smartinh 
26905d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
26915d465952Smartinh  * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
26925d465952Smartinh  * *newindxp with the actual values after split, ie if *mpp and *newindxp
26935d465952Smartinh  * refer to a node in the new right sibling page.
26945d465952Smartinh  */
26955d465952Smartinh static int
btree_split(struct btree * bt,struct mpage ** mpp,unsigned int * newindxp,struct btval * newkey,struct btval * newdata,pgno_t newpgno)26965d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
26975d465952Smartinh     struct btval *newkey, struct btval *newdata, pgno_t newpgno)
26985d465952Smartinh {
26995d465952Smartinh 	uint8_t		 flags;
27005d465952Smartinh 	int		 rc = BT_SUCCESS, ins_new = 0;
27015d465952Smartinh 	indx_t		 newindx;
27025d465952Smartinh 	pgno_t		 pgno = 0;
27035d465952Smartinh 	size_t		 orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
27045d465952Smartinh 	unsigned int	 i, j, split_indx;
27055d465952Smartinh 	struct node	*node;
27065d465952Smartinh 	struct mpage	*pright, *p, *mp;
27075d465952Smartinh 	struct btval	 sepkey, rkey, rdata;
27085d465952Smartinh 	struct btkey	 tmpkey;
270937ee45a1Smartinh 	struct page	*copy;
27105d465952Smartinh 
27115d465952Smartinh 	assert(bt != NULL);
27125d465952Smartinh 	assert(bt->txn != NULL);
27135d465952Smartinh 
27145d465952Smartinh 	mp = *mpp;
27155d465952Smartinh 	newindx = *newindxp;
27165d465952Smartinh 
271748318e0bSderaadt 	DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %d",
27185d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
27195d465952Smartinh 	    (int)newkey->size, (char *)newkey->data, *newindxp);
27205d465952Smartinh 	DPRINTF("page %u has prefix [%.*s]", mp->pgno,
27215d465952Smartinh 	    (int)mp->prefix.len, (char *)mp->prefix.str);
27225d465952Smartinh 	orig_pfx_len = mp->prefix.len;
27235d465952Smartinh 
27245d465952Smartinh 	if (mp->parent == NULL) {
27255d465952Smartinh 		if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
27265d465952Smartinh 			return BT_FAIL;
27275d465952Smartinh 		mp->parent_index = 0;
27285d465952Smartinh 		bt->txn->root = mp->parent->pgno;
27295d465952Smartinh 		DPRINTF("root split! new root = %u", mp->parent->pgno);
273037ee45a1Smartinh 		bt->meta.depth++;
27315d465952Smartinh 
27325d465952Smartinh 		/* Add left (implicit) pointer. */
27335d465952Smartinh 		if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
27345d465952Smartinh 		    mp->pgno, 0) != BT_SUCCESS)
27355d465952Smartinh 			return BT_FAIL;
27365d465952Smartinh 	} else {
27375d465952Smartinh 		DPRINTF("parent branch page is %u", mp->parent->pgno);
27385d465952Smartinh 	}
27395d465952Smartinh 
27405d465952Smartinh 	/* Create a right sibling. */
274137ee45a1Smartinh 	if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)
27425d465952Smartinh 		return BT_FAIL;
27435d465952Smartinh 	pright->parent = mp->parent;
27445d465952Smartinh 	pright->parent_index = mp->parent_index + 1;
27455d465952Smartinh 	DPRINTF("new right sibling: page %u", pright->pgno);
27465d465952Smartinh 
27475d465952Smartinh 	/* Move half of the keys to the right sibling. */
274837ee45a1Smartinh 	if ((copy = malloc(bt->head.psize)) == NULL)
274937ee45a1Smartinh 		return BT_FAIL;
275037ee45a1Smartinh 	bcopy(mp->page, copy, bt->head.psize);
27515d465952Smartinh 	assert(mp->ref == 0);				/* XXX */
275237f4b933Smmcc 	memset(&mp->page->ptrs, 0, bt->head.psize - PAGEHDRSZ);
275337ee45a1Smartinh 	mp->page->lower = PAGEHDRSZ;
275437ee45a1Smartinh 	mp->page->upper = bt->head.psize;
27555d465952Smartinh 
275637ee45a1Smartinh 	split_indx = NUMKEYSP(copy) / 2 + 1;
27575d465952Smartinh 
27585d465952Smartinh 	/* First find the separating key between the split pages.
27595d465952Smartinh 	 */
276037f4b933Smmcc 	memset(&sepkey, 0, sizeof(sepkey));
27615d465952Smartinh 	if (newindx == split_indx) {
27625d465952Smartinh 		sepkey.size = newkey->size;
27635d465952Smartinh 		sepkey.data = newkey->data;
27645d465952Smartinh 		remove_prefix(bt, &sepkey, mp->prefix.len);
27655d465952Smartinh 	} else {
276637ee45a1Smartinh 		node = NODEPTRP(copy, split_indx);
27675d465952Smartinh 		sepkey.size = node->ksize;
27685d465952Smartinh 		sepkey.data = NODEKEY(node);
27695d465952Smartinh 	}
27705d465952Smartinh 
27715d465952Smartinh 	if (IS_LEAF(mp) && bt->cmp == NULL) {
27725d465952Smartinh 		/* Find the smallest separator. */
27735d465952Smartinh 		/* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
277437ee45a1Smartinh 		node = NODEPTRP(copy, split_indx - 1);
27755d465952Smartinh 		bt_reduce_separator(bt, node, &sepkey);
27765d465952Smartinh 	}
27775d465952Smartinh 
27785d465952Smartinh 	/* Fix separator wrt parent prefix. */
27795d465952Smartinh 	if (bt->cmp == NULL) {
27805d465952Smartinh 		tmpkey.len = sizeof(tmpkey.str);
27815d465952Smartinh 		concat_prefix(bt, mp->prefix.str, mp->prefix.len,
27825d465952Smartinh 		    sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);
27835d465952Smartinh 		sepkey.data = tmpkey.str;
27845d465952Smartinh 		sepkey.size = tmpkey.len;
27855d465952Smartinh 	}
27865d465952Smartinh 
27875d465952Smartinh 	DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);
27885d465952Smartinh 
27895d465952Smartinh 	/* Copy separator key to the parent.
27905d465952Smartinh 	 */
279137ee45a1Smartinh 	if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) {
27925d465952Smartinh 		rc = btree_split(bt, &pright->parent, &pright->parent_index,
27935d465952Smartinh 		    &sepkey, NULL, pright->pgno);
27945d465952Smartinh 
27955d465952Smartinh 		/* Right page might now have changed parent.
27965d465952Smartinh 		 * Check if left page also changed parent.
27975d465952Smartinh 		 */
27985d465952Smartinh 		if (pright->parent != mp->parent &&
27995d465952Smartinh 		    mp->parent_index >= NUMKEYS(mp->parent)) {
28005d465952Smartinh 			mp->parent = pright->parent;
28015d465952Smartinh 			mp->parent_index = pright->parent_index - 1;
28025d465952Smartinh 		}
28035d465952Smartinh 	} else {
28045d465952Smartinh 		remove_prefix(bt, &sepkey, pright->parent->prefix.len);
28055d465952Smartinh 		rc = btree_add_node(bt, pright->parent, pright->parent_index,
28065d465952Smartinh 		    &sepkey, NULL, pright->pgno, 0);
28075d465952Smartinh 	}
280837ee45a1Smartinh 	if (rc != BT_SUCCESS) {
280937ee45a1Smartinh 		free(copy);
28105d465952Smartinh 		return BT_FAIL;
281137ee45a1Smartinh 	}
28125d465952Smartinh 
28135d465952Smartinh 	/* Update prefix for right and left page, if the parent was split.
28145d465952Smartinh 	 */
28155d465952Smartinh 	find_common_prefix(bt, pright);
28165d465952Smartinh 	assert(orig_pfx_len <= pright->prefix.len);
28175d465952Smartinh 	right_pfx_diff = pright->prefix.len - orig_pfx_len;
28185d465952Smartinh 
28195d465952Smartinh 	find_common_prefix(bt, mp);
28205d465952Smartinh 	assert(orig_pfx_len <= mp->prefix.len);
28215d465952Smartinh 	left_pfx_diff = mp->prefix.len - orig_pfx_len;
28225d465952Smartinh 
282337ee45a1Smartinh 	for (i = j = 0; i <= NUMKEYSP(copy); j++) {
28245d465952Smartinh 		if (i < split_indx) {
28255d465952Smartinh 			/* Re-insert in left sibling. */
28265d465952Smartinh 			p = mp;
28275d465952Smartinh 			pfx_diff = left_pfx_diff;
28285d465952Smartinh 		} else {
28295d465952Smartinh 			/* Insert in right sibling. */
28305d465952Smartinh 			if (i == split_indx)
28315d465952Smartinh 				/* Reset insert index for right sibling. */
28325d465952Smartinh 				j = (i == newindx && ins_new);
28335d465952Smartinh 			p = pright;
28345d465952Smartinh 			pfx_diff = right_pfx_diff;
28355d465952Smartinh 		}
28365d465952Smartinh 
28375d465952Smartinh 		if (i == newindx && !ins_new) {
28385d465952Smartinh 			/* Insert the original entry that caused the split. */
28395d465952Smartinh 			rkey.data = newkey->data;
28405d465952Smartinh 			rkey.size = newkey->size;
28415d465952Smartinh 			if (IS_LEAF(mp)) {
28425d465952Smartinh 				rdata.data = newdata->data;
28435d465952Smartinh 				rdata.size = newdata->size;
28445d465952Smartinh 			} else
28455d465952Smartinh 				pgno = newpgno;
28465d465952Smartinh 			flags = 0;
28475d465952Smartinh 			pfx_diff = p->prefix.len;
28485d465952Smartinh 
28495d465952Smartinh 			ins_new = 1;
28505d465952Smartinh 
28515d465952Smartinh 			/* Update page and index for the new key. */
28525d465952Smartinh 			*newindxp = j;
28535d465952Smartinh 			*mpp = p;
285437ee45a1Smartinh 		} else if (i == NUMKEYSP(copy)) {
28555d465952Smartinh 			break;
28565d465952Smartinh 		} else {
285737ee45a1Smartinh 			node = NODEPTRP(copy, i);
28585d465952Smartinh 			rkey.data = NODEKEY(node);
28595d465952Smartinh 			rkey.size = node->ksize;
28605d465952Smartinh 			if (IS_LEAF(mp)) {
28615d465952Smartinh 				rdata.data = NODEDATA(node);
28625d465952Smartinh 				rdata.size = node->n_dsize;
28635d465952Smartinh 			} else
28645d465952Smartinh 				pgno = node->n_pgno;
28655d465952Smartinh 			flags = node->flags;
28665d465952Smartinh 
28675d465952Smartinh 			i++;
28685d465952Smartinh 		}
28695d465952Smartinh 
28705d465952Smartinh 		if (!IS_LEAF(mp) && j == 0) {
28715d465952Smartinh 			/* First branch index doesn't need key data. */
28725d465952Smartinh 			rkey.size = 0;
28735d465952Smartinh 		} else
28745d465952Smartinh 			remove_prefix(bt, &rkey, pfx_diff);
28755d465952Smartinh 
28765d465952Smartinh 		rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
28775d465952Smartinh 	}
28785d465952Smartinh 
287937ee45a1Smartinh 	free(copy);
28805d465952Smartinh 	return rc;
28815d465952Smartinh }
28825d465952Smartinh 
28835d465952Smartinh int
btree_txn_put(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data,unsigned int flags)28845d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn,
28855d465952Smartinh     struct btval *key, struct btval *data, unsigned int flags)
28865d465952Smartinh {
28875d465952Smartinh 	int		 rc = BT_SUCCESS, exact, close_txn = 0;
28885d465952Smartinh 	unsigned int	 ki;
28895d465952Smartinh 	struct node	*leaf;
28905d465952Smartinh 	struct mpage	*mp;
28915d465952Smartinh 	struct btval	 xkey;
28925d465952Smartinh 
28935d465952Smartinh 	assert(key != NULL);
28945d465952Smartinh 	assert(data != NULL);
28955d465952Smartinh 
289659916c99Smartinh 	if (bt != NULL && txn != NULL && bt != txn->bt) {
289759916c99Smartinh 		errno = EINVAL;
289859916c99Smartinh 		return BT_FAIL;
289959916c99Smartinh 	}
290059916c99Smartinh 
29017e04cf1eSmartinh 	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
29027e04cf1eSmartinh 		errno = EINVAL;
29037e04cf1eSmartinh 		return BT_FAIL;
29047e04cf1eSmartinh 	}
29057e04cf1eSmartinh 
290659916c99Smartinh 	if (bt == NULL) {
290759916c99Smartinh 		if (txn == NULL) {
290859916c99Smartinh 			errno = EINVAL;
290959916c99Smartinh 			return BT_FAIL;
291059916c99Smartinh 		}
291159916c99Smartinh 		bt = txn->bt;
291259916c99Smartinh 	}
291359916c99Smartinh 
29145d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
29155d465952Smartinh 		errno = EINVAL;
29165d465952Smartinh 		return BT_FAIL;
29175d465952Smartinh 	}
29185d465952Smartinh 
29195d465952Smartinh 	DPRINTF("==> put key %.*s, size %zu, data size %zu",
29205d465952Smartinh 		(int)key->size, (char *)key->data, key->size, data->size);
29215d465952Smartinh 
29225d465952Smartinh 	if (txn == NULL) {
29235d465952Smartinh 		close_txn = 1;
29245d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
29255d465952Smartinh 			return BT_FAIL;
29265d465952Smartinh 	}
29275d465952Smartinh 
29285d465952Smartinh 	rc = btree_search_page(bt, txn, key, NULL, 1, &mp);
29295d465952Smartinh 	if (rc == BT_SUCCESS) {
29305d465952Smartinh 		leaf = btree_search_node(bt, mp, key, &exact, &ki);
29315d465952Smartinh 		if (leaf && exact) {
29325d465952Smartinh 			if (F_ISSET(flags, BT_NOOVERWRITE)) {
29335d465952Smartinh 				DPRINTF("duplicate key %.*s",
29345d465952Smartinh 				    (int)key->size, (char *)key->data);
29350279e526Smartinh 				errno = EEXIST;
29360279e526Smartinh 				rc = BT_FAIL;
29375d465952Smartinh 				goto done;
29385d465952Smartinh 			}
29395d465952Smartinh 			btree_del_node(bt, mp, ki);
29405d465952Smartinh 		}
29415d465952Smartinh 		if (leaf == NULL) {		/* append if not found */
29425d465952Smartinh 			ki = NUMKEYS(mp);
294348318e0bSderaadt 			DPRINTF("appending key at index %d", ki);
29445d465952Smartinh 		}
29450279e526Smartinh 	} else if (errno == ENOENT) {
29465d465952Smartinh 		/* new file, just write a root leaf page */
29475d465952Smartinh 		DPRINTF("allocating new root leaf page");
29485d465952Smartinh 		if ((mp = btree_new_page(bt, P_LEAF)) == NULL) {
29495d465952Smartinh 			rc = BT_FAIL;
29505d465952Smartinh 			goto done;
29515d465952Smartinh 		}
29525d465952Smartinh 		txn->root = mp->pgno;
295337ee45a1Smartinh 		bt->meta.depth++;
29545d465952Smartinh 		ki = 0;
29555d465952Smartinh 	}
29565d465952Smartinh 	else
29575d465952Smartinh 		goto done;
29585d465952Smartinh 
29595d465952Smartinh 	assert(IS_LEAF(mp));
296048318e0bSderaadt 	DPRINTF("there are %lu keys, should insert new key at index %u",
29615d465952Smartinh 		NUMKEYS(mp), ki);
29625d465952Smartinh 
29635d465952Smartinh 	/* Copy the key pointer as it is modified by the prefix code. The
29645d465952Smartinh 	 * caller might have malloc'ed the data.
29655d465952Smartinh 	 */
29665d465952Smartinh 	xkey.data = key->data;
29675d465952Smartinh 	xkey.size = key->size;
29685d465952Smartinh 
296937ee45a1Smartinh 	if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) {
29705d465952Smartinh 		rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);
29715d465952Smartinh 	} else {
29725d465952Smartinh 		/* There is room already in this leaf page. */
29735d465952Smartinh 		remove_prefix(bt, &xkey, mp->prefix.len);
29745d465952Smartinh 		rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);
29755d465952Smartinh 	}
29765d465952Smartinh 
29775d465952Smartinh 	if (rc != BT_SUCCESS)
29785d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
29795d465952Smartinh 	else
298037ee45a1Smartinh 		bt->meta.entries++;
29815d465952Smartinh 
29825d465952Smartinh done:
29835d465952Smartinh 	if (close_txn) {
29845d465952Smartinh 		if (rc == BT_SUCCESS)
29855d465952Smartinh 			rc = btree_txn_commit(txn);
29865d465952Smartinh 		else
29875d465952Smartinh 			btree_txn_abort(txn);
29885d465952Smartinh 	}
29895d465952Smartinh 	mpage_prune(bt);
29905d465952Smartinh 	return rc;
29915d465952Smartinh }
29925d465952Smartinh 
29935d465952Smartinh static pgno_t
btree_compact_tree(struct btree * bt,pgno_t pgno,struct btree * btc)299437ee45a1Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc)
29955d465952Smartinh {
299637ee45a1Smartinh 	ssize_t		 rc;
29975d465952Smartinh 	indx_t		 i;
2998023c5341Smartinh 	pgno_t		*pnext, next;
29995d465952Smartinh 	struct node	*node;
30005d465952Smartinh 	struct page	*p;
30015d465952Smartinh 	struct mpage	*mp;
30025d465952Smartinh 
300337ee45a1Smartinh 	/* Get the page and make a copy of it.
300437ee45a1Smartinh 	 */
300537ee45a1Smartinh 	if ((mp = btree_get_mpage(bt, pgno)) == NULL)
30065d465952Smartinh 		return P_INVALID;
300737ee45a1Smartinh 	if ((p = malloc(bt->head.psize)) == NULL)
300837ee45a1Smartinh 		return P_INVALID;
300937ee45a1Smartinh 	bcopy(mp->page, p, bt->head.psize);
30105d465952Smartinh 
301137ee45a1Smartinh         /* Go through all nodes in the (copied) page and update the
301237ee45a1Smartinh          * page pointers.
301337ee45a1Smartinh 	 */
30145d465952Smartinh 	if (F_ISSET(p->flags, P_BRANCH)) {
30155d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
30165d465952Smartinh 			node = NODEPTRP(p, i);
301737ee45a1Smartinh 			node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc);
301837ee45a1Smartinh 			if (node->n_pgno == P_INVALID) {
301937ee45a1Smartinh 				free(p);
30205d465952Smartinh 				return P_INVALID;
30215d465952Smartinh 			}
302237ee45a1Smartinh 		}
30235d465952Smartinh 	} else if (F_ISSET(p->flags, P_LEAF)) {
30245d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
30255d465952Smartinh 			node = NODEPTRP(p, i);
30265d465952Smartinh 			if (F_ISSET(node->flags, F_BIGDATA)) {
3027023c5341Smartinh 				bcopy(NODEDATA(node), &next, sizeof(next));
3028023c5341Smartinh 				next = btree_compact_tree(bt, next, btc);
3029023c5341Smartinh 				if (next == P_INVALID) {
303037ee45a1Smartinh 					free(p);
30315d465952Smartinh 					return P_INVALID;
30325d465952Smartinh 				}
3033023c5341Smartinh 				bcopy(&next, NODEDATA(node), sizeof(next));
30345d465952Smartinh 			}
303537ee45a1Smartinh 		}
30365d465952Smartinh 	} else if (F_ISSET(p->flags, P_OVERFLOW)) {
30375d465952Smartinh 		pnext = &p->p_next_pgno;
30385d465952Smartinh 		if (*pnext > 0) {
303937ee45a1Smartinh 			*pnext = btree_compact_tree(bt, *pnext, btc);
304037ee45a1Smartinh 			if (*pnext == P_INVALID) {
304137ee45a1Smartinh 				free(p);
30425d465952Smartinh 				return P_INVALID;
30435d465952Smartinh 			}
304437ee45a1Smartinh 		}
30455d465952Smartinh 	} else
30465d465952Smartinh 		assert(0);
30475d465952Smartinh 
304837ee45a1Smartinh 	pgno = p->pgno = btc->txn->next_pgno++;
304937ee45a1Smartinh 	rc = write(btc->fd, p, bt->head.psize);
305037ee45a1Smartinh 	free(p);
305136e226d9Smartinh 	if (rc != (ssize_t)bt->head.psize)
30525d465952Smartinh 		return P_INVALID;
30533357b322Smartinh 	mpage_prune(bt);
305437ee45a1Smartinh 	return pgno;
30555d465952Smartinh }
30565d465952Smartinh 
30575d465952Smartinh int
btree_compact(struct btree * bt)30585d465952Smartinh btree_compact(struct btree *bt)
30595d465952Smartinh {
30605d465952Smartinh 	char			*compact_path = NULL;
306137ee45a1Smartinh 	struct btree		*btc;
306237ee45a1Smartinh 	struct btree_txn	*txn, *txnc = NULL;
306337ee45a1Smartinh 	int			 fd;
306437ee45a1Smartinh 	pgno_t			 root;
30655d465952Smartinh 
30665d465952Smartinh 	assert(bt != NULL);
30675d465952Smartinh 
30680279e526Smartinh 	DPRINTF("compacting btree %p with path %s", bt, bt->path);
30690279e526Smartinh 
30700279e526Smartinh 	if (bt->path == NULL) {
30710279e526Smartinh 		errno = EINVAL;
30720279e526Smartinh 		return BT_FAIL;
30730279e526Smartinh 	}
30740279e526Smartinh 
30750279e526Smartinh 	if ((txn = btree_txn_begin(bt, 0)) == NULL)
30765d465952Smartinh 		return BT_FAIL;
30775d465952Smartinh 
3078aa48e8d1Smillert 	if (asprintf(&compact_path, "%s.compact.XXXXXX", bt->path) == -1) {
3079aa48e8d1Smillert 		btree_txn_abort(txn);
3080aa48e8d1Smillert 		return BT_FAIL;
3081aa48e8d1Smillert 	}
30825d465952Smartinh 	fd = mkstemp(compact_path);
30835d465952Smartinh 	if (fd == -1) {
30845d465952Smartinh 		free(compact_path);
30850279e526Smartinh 		btree_txn_abort(txn);
30865d465952Smartinh 		return BT_FAIL;
30875d465952Smartinh 	}
30885d465952Smartinh 
308937ee45a1Smartinh 	if ((btc = btree_open_fd(fd, 0)) == NULL)
309037ee45a1Smartinh 		goto failed;
309137f85b55Smartinh 	bcopy(&bt->meta, &btc->meta, sizeof(bt->meta));
30925bca2891Smartinh 	btc->meta.revisions = 0;
30935d465952Smartinh 
309437ee45a1Smartinh 	if ((txnc = btree_txn_begin(btc, 0)) == NULL)
309537ee45a1Smartinh 		goto failed;
309637ee45a1Smartinh 
309737ee45a1Smartinh 	if (bt->meta.root != P_INVALID) {
309837ee45a1Smartinh 		root = btree_compact_tree(bt, bt->meta.root, btc);
30995d465952Smartinh 		if (root == P_INVALID)
31005d465952Smartinh 			goto failed;
310137ee45a1Smartinh 		if (btree_write_meta(btc, root, 0) != BT_SUCCESS)
31025d465952Smartinh 			goto failed;
310337ee45a1Smartinh 	}
31045d465952Smartinh 
31055d465952Smartinh 	fsync(fd);
31065d465952Smartinh 
31075d465952Smartinh 	DPRINTF("renaming %s to %s", compact_path, bt->path);
31085d465952Smartinh 	if (rename(compact_path, bt->path) != 0)
31095d465952Smartinh 		goto failed;
31105d465952Smartinh 
3111e14f8e24Smartinh 	/* Write a "tombstone" meta page so other processes can pick up
3112e14f8e24Smartinh 	 * the change and re-open the file.
3113e14f8e24Smartinh 	 */
3114e14f8e24Smartinh 	if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS)
3115e14f8e24Smartinh 		goto failed;
31165d465952Smartinh 
3117e14f8e24Smartinh 	btree_txn_abort(txn);
311837ee45a1Smartinh 	btree_txn_abort(txnc);
31195d465952Smartinh 	free(compact_path);
312037ee45a1Smartinh 	btree_close(btc);
31213357b322Smartinh 	mpage_prune(bt);
312237ee45a1Smartinh 	return 0;
31235d465952Smartinh 
31245d465952Smartinh failed:
31255d465952Smartinh 	btree_txn_abort(txn);
312637ee45a1Smartinh 	btree_txn_abort(txnc);
31275d465952Smartinh 	unlink(compact_path);
31285d465952Smartinh 	free(compact_path);
312937ee45a1Smartinh 	btree_close(btc);
31303357b322Smartinh 	mpage_prune(bt);
31315d465952Smartinh 	return BT_FAIL;
31325d465952Smartinh }
31335d465952Smartinh 
31345d465952Smartinh /* Reverts the last change. Truncates the file at the last root page.
31355d465952Smartinh  */
31365d465952Smartinh int
btree_revert(struct btree * bt)31375d465952Smartinh btree_revert(struct btree *bt)
31385d465952Smartinh {
313937ee45a1Smartinh 	if (btree_read_meta(bt, NULL) != 0)
314037ee45a1Smartinh 		return -1;
31415d465952Smartinh 
314237ee45a1Smartinh 	DPRINTF("truncating file at page %u", bt->meta.root);
314337ee45a1Smartinh 	return ftruncate(bt->fd, bt->head.psize * bt->meta.root);
31445d465952Smartinh }
31455d465952Smartinh 
31465d465952Smartinh void
btree_set_cache_size(struct btree * bt,unsigned int cache_size)31475d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size)
31485d465952Smartinh {
31495d465952Smartinh 	bt->stat.max_cache = cache_size;
31505d465952Smartinh }
31515d465952Smartinh 
31525d465952Smartinh unsigned int
btree_get_flags(struct btree * bt)31535d465952Smartinh btree_get_flags(struct btree *bt)
31545d465952Smartinh {
31555d465952Smartinh 	return (bt->flags & ~BT_FIXPADDING);
31565d465952Smartinh }
31575d465952Smartinh 
31585d465952Smartinh const char *
btree_get_path(struct btree * bt)31595d465952Smartinh btree_get_path(struct btree *bt)
31605d465952Smartinh {
31615d465952Smartinh 	return bt->path;
31625d465952Smartinh }
31635d465952Smartinh 
31645d465952Smartinh const struct btree_stat *
btree_stat(struct btree * bt)31655d465952Smartinh btree_stat(struct btree *bt)
31665d465952Smartinh {
31674074e1b0Smartinh 	if (bt == NULL)
31684074e1b0Smartinh 		return NULL;
31694074e1b0Smartinh 
317037ee45a1Smartinh 	bt->stat.branch_pages = bt->meta.branch_pages;
317137ee45a1Smartinh 	bt->stat.leaf_pages = bt->meta.leaf_pages;
317237ee45a1Smartinh 	bt->stat.overflow_pages = bt->meta.overflow_pages;
317337ee45a1Smartinh 	bt->stat.revisions = bt->meta.revisions;
317437ee45a1Smartinh 	bt->stat.depth = bt->meta.depth;
317537ee45a1Smartinh 	bt->stat.entries = bt->meta.entries;
317637ee45a1Smartinh 	bt->stat.psize = bt->head.psize;
317737ee45a1Smartinh 	bt->stat.created_at = bt->meta.created_at;
31785d465952Smartinh 
31795d465952Smartinh 	return &bt->stat;
31805d465952Smartinh }
31815d465952Smartinh 
3182