xref: /openbsd/usr.sbin/ldapd/btree.c (revision 5d465952)
1*5d465952Smartinh /*	$OpenBSD: btree.c,v 1.1 2010/05/31 17:36:31 martinh Exp $ */
2*5d465952Smartinh 
3*5d465952Smartinh /*
4*5d465952Smartinh  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
5*5d465952Smartinh  *
6*5d465952Smartinh  * Permission to use, copy, modify, and distribute this software for any
7*5d465952Smartinh  * purpose with or without fee is hereby granted, provided that the above
8*5d465952Smartinh  * copyright notice and this permission notice appear in all copies.
9*5d465952Smartinh  *
10*5d465952Smartinh  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11*5d465952Smartinh  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12*5d465952Smartinh  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13*5d465952Smartinh  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14*5d465952Smartinh  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15*5d465952Smartinh  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16*5d465952Smartinh  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17*5d465952Smartinh  */
18*5d465952Smartinh 
19*5d465952Smartinh #include <sys/types.h>
20*5d465952Smartinh #include <sys/tree.h>
21*5d465952Smartinh #include <sys/queue.h>
22*5d465952Smartinh #include <sys/param.h>
23*5d465952Smartinh #include <sys/uio.h>
24*5d465952Smartinh 
25*5d465952Smartinh #include <assert.h>
26*5d465952Smartinh #include <err.h>
27*5d465952Smartinh #include <errno.h>
28*5d465952Smartinh #include <fcntl.h>
29*5d465952Smartinh #include <stddef.h>
30*5d465952Smartinh #include <stdint.h>
31*5d465952Smartinh #include <stdio.h>
32*5d465952Smartinh #include <stdlib.h>
33*5d465952Smartinh #include <string.h>
34*5d465952Smartinh #include <time.h>
35*5d465952Smartinh #include <unistd.h>
36*5d465952Smartinh 
37*5d465952Smartinh #include "btree.h"
38*5d465952Smartinh 
39*5d465952Smartinh /* #define DEBUG */
40*5d465952Smartinh 
41*5d465952Smartinh #ifdef DEBUG
42*5d465952Smartinh # define DPRINTF(...)	do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
43*5d465952Smartinh 			     fprintf(stderr, __VA_ARGS__); \
44*5d465952Smartinh 			     fprintf(stderr, "\n"); } while(0)
45*5d465952Smartinh #else
46*5d465952Smartinh # define DPRINTF(...)
47*5d465952Smartinh #endif
48*5d465952Smartinh 
49*5d465952Smartinh #define PAGESIZE	 4096
50*5d465952Smartinh #define BT_MINKEYS	 4
51*5d465952Smartinh #define BT_MAGIC	 0xB3DBB3DB
52*5d465952Smartinh #define BT_VERSION	 3
53*5d465952Smartinh #define MAXKEYSIZE	 255
54*5d465952Smartinh 
55*5d465952Smartinh #define P_INVALID	 0xFFFFFFFF
56*5d465952Smartinh 
57*5d465952Smartinh #define F_ISSET(w, f)	 (((w) & (f)) == (f))
58*5d465952Smartinh 
59*5d465952Smartinh /* There are four page types: meta, index, leaf and overflow.
60*5d465952Smartinh  * They all share the same page header.
61*5d465952Smartinh  */
62*5d465952Smartinh struct page {				/* represents an on-disk page */
63*5d465952Smartinh 	pgno_t		 pgno;
64*5d465952Smartinh #define	P_BRANCH	 0x01		/* branch page */
65*5d465952Smartinh #define	P_LEAF		 0x02		/* leaf page */
66*5d465952Smartinh #define	P_OVERFLOW	 0x04		/* overflow page */
67*5d465952Smartinh #define	P_META		 0x08		/* meta page */
68*5d465952Smartinh 	uint32_t	 flags;
69*5d465952Smartinh #define lower		 b.fb.fb_lower
70*5d465952Smartinh #define upper		 b.fb.fb_upper
71*5d465952Smartinh #define p_next_pgno	 b.pb_next_pgno
72*5d465952Smartinh 	union page_bounds {
73*5d465952Smartinh 		struct {
74*5d465952Smartinh 			indx_t	 fb_lower;	/* lower bound of free space */
75*5d465952Smartinh 			indx_t	 fb_upper;	/* upper bound of free space */
76*5d465952Smartinh 		} fb;
77*5d465952Smartinh 		pgno_t		 pb_next_pgno;	/* overflow page linked list */
78*5d465952Smartinh 	} b;
79*5d465952Smartinh #define PAGEHDRSZ	 (sizeof(pgno_t) + sizeof(uint32_t) + \
80*5d465952Smartinh 				sizeof(union page_bounds))
81*5d465952Smartinh 	indx_t		 ptrs[(PAGESIZE - PAGEHDRSZ) / sizeof(indx_t)];
82*5d465952Smartinh };
83*5d465952Smartinh 
84*5d465952Smartinh #define NUMKEYSP(p)	 (((p)->lower - PAGEHDRSZ) >> 1)
85*5d465952Smartinh #define NUMKEYS(mp)	 (((mp)->page.lower - PAGEHDRSZ) >> 1)
86*5d465952Smartinh #define SIZELEFT(mp)	 (indx_t)((mp)->page.upper - (mp)->page.lower)
87*5d465952Smartinh #define PAGEFILL(mp)	 ((float)(PAGESIZE - PAGEHDRSZ - SIZELEFT(mp)) / \
88*5d465952Smartinh 				(PAGESIZE - PAGEHDRSZ))
89*5d465952Smartinh #define IS_LEAF(mp)	 F_ISSET((mp)->page.flags, P_LEAF)
90*5d465952Smartinh #define IS_BRANCH(mp)	 F_ISSET((mp)->page.flags, P_BRANCH)
91*5d465952Smartinh #define IS_OVERFLOW(mp)	 F_ISSET((mp)->page.flags, P_OVERFLOW)
92*5d465952Smartinh 
93*5d465952Smartinh struct bt_meta {				/* meta (footer) page content */
94*5d465952Smartinh 	uint32_t	 magic;			/* really needed? */
95*5d465952Smartinh 	uint32_t	 version;
96*5d465952Smartinh 	uint32_t	 flags;
97*5d465952Smartinh 	uint32_t	 psize;			/* page size */
98*5d465952Smartinh 	pgno_t		 root;			/* page number of root page */
99*5d465952Smartinh 	pgno_t		 prev_meta;		/* previous meta page number */
100*5d465952Smartinh 	time_t		 created_at;
101*5d465952Smartinh 	uint32_t	 branch_pages;
102*5d465952Smartinh 	uint32_t	 leaf_pages;
103*5d465952Smartinh 	uint32_t	 overflow_pages;
104*5d465952Smartinh 	uint32_t	 revisions;
105*5d465952Smartinh 	uint32_t	 depth;
106*5d465952Smartinh 	uint64_t	 entries;
107*5d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
108*5d465952Smartinh };
109*5d465952Smartinh 
110*5d465952Smartinh struct btkey {
111*5d465952Smartinh 	size_t			 len;
112*5d465952Smartinh 	char			 str[MAXKEYSIZE];
113*5d465952Smartinh };
114*5d465952Smartinh 
115*5d465952Smartinh struct mpage {					/* an in-memory cached page */
116*5d465952Smartinh 	RB_ENTRY(mpage)		 entry;		/* page cache entry */
117*5d465952Smartinh 	SIMPLEQ_ENTRY(mpage)	 next;		/* queue of dirty pages */
118*5d465952Smartinh 	TAILQ_ENTRY(mpage)	 lru_next;	/* LRU queue */
119*5d465952Smartinh 	struct mpage		*parent;	/* NULL if root */
120*5d465952Smartinh 	unsigned int		 parent_index;	/* keep track of node index */
121*5d465952Smartinh 	struct btkey		 prefix;
122*5d465952Smartinh 	struct page		 page;
123*5d465952Smartinh 	pgno_t			 pgno;		/* copy of page->pgno */
124*5d465952Smartinh 	short			 ref;		/* increased by cursors */
125*5d465952Smartinh 	short			 dirty;		/* 1 if on dirty queue */
126*5d465952Smartinh };
127*5d465952Smartinh RB_HEAD(page_cache, mpage);
128*5d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage);
129*5d465952Smartinh TAILQ_HEAD(lru_queue, mpage);
130*5d465952Smartinh 
131*5d465952Smartinh static int		 mpage_cmp(struct mpage *a, struct mpage *b);
132*5d465952Smartinh static struct mpage	*mpage_lookup(struct btree *bt, pgno_t pgno);
133*5d465952Smartinh static void		 mpage_add(struct btree *bt, struct mpage *mp);
134*5d465952Smartinh static void		 mpage_del(struct btree *bt, struct mpage *mp);
135*5d465952Smartinh static struct mpage	*mpage_copy(struct mpage *mp);
136*5d465952Smartinh static void		 mpage_prune(struct btree *bt);
137*5d465952Smartinh static void		 mpage_dirty(struct btree *bt, struct mpage *mp);
138*5d465952Smartinh static void		 mpage_touch(struct btree *bt, struct mpage *mp);
139*5d465952Smartinh 
140*5d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);
141*5d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp);
142*5d465952Smartinh 
143*5d465952Smartinh struct ppage {					/* ordered list of pages */
144*5d465952Smartinh 	SLIST_ENTRY(ppage)	 entry;
145*5d465952Smartinh 	struct mpage		*mpage;
146*5d465952Smartinh 	unsigned int		 ki;		/* cursor index on page */
147*5d465952Smartinh };
148*5d465952Smartinh SLIST_HEAD(page_stack, ppage);
149*5d465952Smartinh 
150*5d465952Smartinh #define CURSOR_EMPTY(c)		 SLIST_EMPTY(&(c)->stack)
151*5d465952Smartinh #define CURSOR_TOP(c)		 SLIST_FIRST(&(c)->stack)
152*5d465952Smartinh #define CURSOR_POP(c)		 SLIST_REMOVE_HEAD(&(c)->stack, entry)
153*5d465952Smartinh #define CURSOR_PUSH(c,p)	 SLIST_INSERT_HEAD(&(c)->stack, p, entry)
154*5d465952Smartinh 
155*5d465952Smartinh struct cursor {
156*5d465952Smartinh 	struct btree		*bt;
157*5d465952Smartinh 	struct btree_txn	*txn;
158*5d465952Smartinh 	struct page_stack	 stack;		/* stack of parent pages */
159*5d465952Smartinh 	short			 initialized;	/* 1 if initialized */
160*5d465952Smartinh 	short			 eof;		/* 1 if end is reached */
161*5d465952Smartinh };
162*5d465952Smartinh 
163*5d465952Smartinh #define METAHASHLEN	 offsetof(struct bt_meta, hash)
164*5d465952Smartinh #define METADATA(p)	 ((struct bt_meta *)((char *)p + PAGEHDRSZ))
165*5d465952Smartinh 
166*5d465952Smartinh struct node {
167*5d465952Smartinh #define n_pgno		 p.np_pgno
168*5d465952Smartinh #define n_dsize		 p.np_dsize
169*5d465952Smartinh 	union {
170*5d465952Smartinh 		pgno_t		 np_pgno;	/* child page number */
171*5d465952Smartinh 		uint32_t	 np_dsize;	/* leaf data size */
172*5d465952Smartinh 	}		 p;
173*5d465952Smartinh 	uint16_t	 ksize;			/* key size */
174*5d465952Smartinh #define F_BIGDATA	 0x01			/* data put on overflow page */
175*5d465952Smartinh 	uint8_t		 flags;
176*5d465952Smartinh 	char		 data[];
177*5d465952Smartinh };
178*5d465952Smartinh 
179*5d465952Smartinh struct btree_txn {
180*5d465952Smartinh 	pgno_t			 root;		/* current / new root page */
181*5d465952Smartinh 	pgno_t			 next_pgno;	/* next unallocated page */
182*5d465952Smartinh 	struct btree		*bt;		/* btree is ref'd */
183*5d465952Smartinh 	struct dirty_queue	*dirty_queue;	/* modified pages */
184*5d465952Smartinh #define BT_TXN_RDONLY		 0x01		/* read-only transaction */
185*5d465952Smartinh #define BT_TXN_ERROR		 0x02		/* an error has occurred */
186*5d465952Smartinh 	unsigned int		 flags;
187*5d465952Smartinh };
188*5d465952Smartinh 
189*5d465952Smartinh struct btree {
190*5d465952Smartinh 	int			 fd;
191*5d465952Smartinh 	char			*path;
192*5d465952Smartinh #define BT_FIXPADDING		 0x01		/* internal */
193*5d465952Smartinh 	unsigned int		 flags;
194*5d465952Smartinh 	bt_cmp_func		 cmp;		/* user compare function */
195*5d465952Smartinh 	struct page		*metapage;	/* current last meta page */
196*5d465952Smartinh 	struct bt_meta		*meta;
197*5d465952Smartinh 	struct page_cache	*page_cache;
198*5d465952Smartinh 	struct lru_queue	*lru_queue;
199*5d465952Smartinh 	struct btree_txn	*txn;		/* current write transaction */
200*5d465952Smartinh 	int			 ref;		/* increased by cursors */
201*5d465952Smartinh 	struct btree_stat	 stat;
202*5d465952Smartinh 	off_t			 size;		/* current file size */
203*5d465952Smartinh };
204*5d465952Smartinh 
205*5d465952Smartinh #define NODESIZE	 offsetof(struct node, data)
206*5d465952Smartinh 
207*5d465952Smartinh #define INDXSIZE(k)	 (NODESIZE + ((k) == NULL ? 0 : (k)->size))
208*5d465952Smartinh #define LEAFSIZE(k, d)	 (NODESIZE + (k)->size + (d)->size)
209*5d465952Smartinh #define NODEPTRP(p, i)	 ((struct node *)((char *)(p) + (p)->ptrs[i]))
210*5d465952Smartinh #define NODEPTR(mp, i)	 NODEPTRP(&(mp)->page, i)
211*5d465952Smartinh #define NODEKEY(node)	 (void *)((node)->data)
212*5d465952Smartinh #define NODEDATA(node)	 (void *)((char *)(node)->data + (node)->ksize)
213*5d465952Smartinh #define NODEPGNO(node)	 ((node)->p.np_pgno)
214*5d465952Smartinh #define NODEDSZ(node)	 ((node)->p.np_dsize)
215*5d465952Smartinh 
216*5d465952Smartinh #define BT_COMMIT_PAGES	 64	/* max number of pages to write in one commit */
217*5d465952Smartinh #define BT_MAXCACHE_DEF	 1024	/* max number of pages to keep in cache  */
218*5d465952Smartinh 
219*5d465952Smartinh static int		 btree_read_page(struct btree *bt, pgno_t pgno,
220*5d465952Smartinh 			    struct page *page);
221*5d465952Smartinh static int		 btree_get_mpage(struct btree *bt, pgno_t pgno,
222*5d465952Smartinh 			    struct mpage **mpp);
223*5d465952Smartinh static int		 btree_search_page_root(struct btree *bt,
224*5d465952Smartinh 			    struct mpage *root, struct btval *key,
225*5d465952Smartinh 			    struct cursor *cursor, int modify,
226*5d465952Smartinh 			    struct mpage **mpp);
227*5d465952Smartinh static int		 btree_search_page(struct btree *bt,
228*5d465952Smartinh 			    struct btree_txn *txn, struct btval *key,
229*5d465952Smartinh 			    struct cursor *cursor, int modify,
230*5d465952Smartinh 			    struct mpage **mpp);
231*5d465952Smartinh 
232*5d465952Smartinh static void		 btree_init_meta(struct btree *bt);
233*5d465952Smartinh static int		 btree_is_meta_page(struct page *p);
234*5d465952Smartinh static int		 btree_read_meta(struct btree *bt, pgno_t *p_next);
235*5d465952Smartinh static int		 btree_write_meta(struct btree *bt, pgno_t root);
236*5d465952Smartinh 
237*5d465952Smartinh static struct node	*btree_search_node(struct btree *bt, struct mpage *mp,
238*5d465952Smartinh 			    struct btval *key, int *exactp, unsigned int *kip);
239*5d465952Smartinh static int		 btree_add_node(struct btree *bt, struct mpage *mp,
240*5d465952Smartinh 			    indx_t indx, struct btval *key, struct btval *data,
241*5d465952Smartinh 			    pgno_t pgno, uint8_t flags);
242*5d465952Smartinh static void		 btree_del_node(struct btree *bt, struct mpage *mp,
243*5d465952Smartinh 			    indx_t indx);
244*5d465952Smartinh static int		 btree_read_data(struct btree *bt, struct mpage *mp,
245*5d465952Smartinh 			    struct node *leaf, struct btval *data);
246*5d465952Smartinh 
247*5d465952Smartinh static int		 btree_rebalance(struct btree *bt, struct mpage *mp);
248*5d465952Smartinh static int		 btree_update_key(struct btree *bt, struct mpage *mp,
249*5d465952Smartinh 			    indx_t indx, struct btval *key);
250*5d465952Smartinh static int		 btree_adjust_prefix(struct btree *bt,
251*5d465952Smartinh 			    struct mpage *src, int delta);
252*5d465952Smartinh static int		 btree_move_node(struct btree *bt, struct mpage *src,
253*5d465952Smartinh 			    indx_t srcindx, struct mpage *dst, indx_t dstindx);
254*5d465952Smartinh static int		 btree_merge(struct btree *bt, struct mpage *src,
255*5d465952Smartinh 			    struct mpage *dst);
256*5d465952Smartinh static int		 btree_split(struct btree *bt, struct mpage **mpp,
257*5d465952Smartinh 			    unsigned int *newindxp, struct btval *newkey,
258*5d465952Smartinh 			    struct btval *newdata, pgno_t newpgno);
259*5d465952Smartinh static struct mpage	*btree_new_page(struct btree *bt, uint32_t flags);
260*5d465952Smartinh static int		 btree_write_overflow_data(struct btree *bt,
261*5d465952Smartinh 			    struct page *p, struct btval *data);
262*5d465952Smartinh 
263*5d465952Smartinh static void		 cursor_pop_page(struct cursor *cursor);
264*5d465952Smartinh static struct ppage	 *cursor_push_page(struct cursor *cursor,
265*5d465952Smartinh 			    struct mpage *mp);
266*5d465952Smartinh 
267*5d465952Smartinh static int		 bt_set_key(struct btree *bt, struct mpage *mp,
268*5d465952Smartinh 			    struct node *node, struct btval *key);
269*5d465952Smartinh static int		 btree_sibling(struct cursor *cursor, int move_right);
270*5d465952Smartinh static int		 btree_cursor_next(struct cursor *cursor,
271*5d465952Smartinh 			    struct btval *key, struct btval *data);
272*5d465952Smartinh static int		 btree_cursor_set(struct cursor *cursor,
273*5d465952Smartinh 			    struct btval *key, struct btval *data);
274*5d465952Smartinh static int		 btree_cursor_first(struct cursor *cursor,
275*5d465952Smartinh 			    struct btval *key, struct btval *data);
276*5d465952Smartinh 
277*5d465952Smartinh static void		 bt_reduce_separator(struct btree *bt, struct node *min,
278*5d465952Smartinh 			    struct btval *sep);
279*5d465952Smartinh static void		 remove_prefix(struct btree *bt, struct btval *key,
280*5d465952Smartinh 			    size_t pfxlen);
281*5d465952Smartinh static void		 expand_prefix(struct btree *bt, struct mpage *mp,
282*5d465952Smartinh 			    indx_t indx, struct btkey *expkey);
283*5d465952Smartinh static void		 concat_prefix(struct btree *bt, char *s1, size_t n1,
284*5d465952Smartinh 			    char *s2, size_t n2, char *cs, size_t *cn);
285*5d465952Smartinh static void		 common_prefix(struct btree *bt, struct btkey *min,
286*5d465952Smartinh 			    struct btkey *max, struct btkey *pfx);
287*5d465952Smartinh static void		 find_common_prefix(struct btree *bt, struct mpage *mp);
288*5d465952Smartinh 
289*5d465952Smartinh static size_t		 bt_leaf_size(struct btval *key, struct btval *data);
290*5d465952Smartinh static size_t		 bt_branch_size(struct btval *key);
291*5d465952Smartinh 
292*5d465952Smartinh static pgno_t		 btree_compact_tree(struct btree *bt, pgno_t pgno,
293*5d465952Smartinh 			    int fd);
294*5d465952Smartinh 
295*5d465952Smartinh static int		 memncmp(const void *s1, size_t n1,
296*5d465952Smartinh 				 const void *s2, size_t n2);
297*5d465952Smartinh static int		 memnrcmp(const void *s1, size_t n1,
298*5d465952Smartinh 				  const void *s2, size_t n2);
299*5d465952Smartinh 
300*5d465952Smartinh static int
301*5d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
302*5d465952Smartinh {
303*5d465952Smartinh 	if (n1 < n2) {
304*5d465952Smartinh 		if (memcmp(s1, s2, n1) == 0)
305*5d465952Smartinh 			return -1;
306*5d465952Smartinh 	}
307*5d465952Smartinh 	else if (n1 > n2) {
308*5d465952Smartinh 		if (memcmp(s1, s2, n2) == 0)
309*5d465952Smartinh 			return 1;
310*5d465952Smartinh 	}
311*5d465952Smartinh 	return memcmp(s1, s2, n1);
312*5d465952Smartinh }
313*5d465952Smartinh 
314*5d465952Smartinh static int
315*5d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
316*5d465952Smartinh {
317*5d465952Smartinh 	const unsigned char	*p1;
318*5d465952Smartinh 	const unsigned char	*p2;
319*5d465952Smartinh 
320*5d465952Smartinh 	if (n1 == 0)
321*5d465952Smartinh 		return n2 == 0 ? 0 : -1;
322*5d465952Smartinh 
323*5d465952Smartinh 	if (n2 == 0)
324*5d465952Smartinh 		return n1 == 0 ? 0 : 1;
325*5d465952Smartinh 
326*5d465952Smartinh  	p1 = (const unsigned char *)s1 + n1 - 1;
327*5d465952Smartinh  	p2 = (const unsigned char *)s2 + n2 - 1;
328*5d465952Smartinh 
329*5d465952Smartinh 	while (*p1 == *p2) {
330*5d465952Smartinh 		if (p1 == s1)
331*5d465952Smartinh 			return (p2 == s2) ? 0 : -1;
332*5d465952Smartinh 		if (p2 == s2)
333*5d465952Smartinh 			return (p1 == p2) ? 0 : 1;
334*5d465952Smartinh 		p1--;
335*5d465952Smartinh 		p2--;
336*5d465952Smartinh 	}
337*5d465952Smartinh 	return *p1 - *p2;
338*5d465952Smartinh }
339*5d465952Smartinh 
340*5d465952Smartinh int
341*5d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)
342*5d465952Smartinh {
343*5d465952Smartinh 	return bt->cmp(a, b);
344*5d465952Smartinh }
345*5d465952Smartinh 
346*5d465952Smartinh static void
347*5d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,
348*5d465952Smartinh     struct btkey *pfx)
349*5d465952Smartinh {
350*5d465952Smartinh 	size_t		 n = 0;
351*5d465952Smartinh 	char		*p1;
352*5d465952Smartinh 	char		*p2;
353*5d465952Smartinh 
354*5d465952Smartinh 	if (min->len == 0 || max->len == 0) {
355*5d465952Smartinh 		pfx->len = 0;
356*5d465952Smartinh 		return;
357*5d465952Smartinh 	}
358*5d465952Smartinh 
359*5d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
360*5d465952Smartinh 		p1 = min->str + min->len - 1;
361*5d465952Smartinh 		p2 = max->str + max->len - 1;
362*5d465952Smartinh 
363*5d465952Smartinh 		while (*p1 == *p2) {
364*5d465952Smartinh 			if (p1 < min->str || p2 < max->str)
365*5d465952Smartinh 				break;
366*5d465952Smartinh 			p1--;
367*5d465952Smartinh 			p2--;
368*5d465952Smartinh 			n++;
369*5d465952Smartinh 		}
370*5d465952Smartinh 
371*5d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
372*5d465952Smartinh 		pfx->len = n;
373*5d465952Smartinh 		bcopy(p2 + 1, pfx->str, n);
374*5d465952Smartinh 	} else {
375*5d465952Smartinh 		p1 = min->str;
376*5d465952Smartinh 		p2 = max->str;
377*5d465952Smartinh 
378*5d465952Smartinh 		while (*p1 == *p2) {
379*5d465952Smartinh 			if (n == min->len || n == max->len)
380*5d465952Smartinh 				break;
381*5d465952Smartinh 			p1++;
382*5d465952Smartinh 			p2++;
383*5d465952Smartinh 			n++;
384*5d465952Smartinh 		}
385*5d465952Smartinh 
386*5d465952Smartinh 		assert(n <= (int)sizeof(pfx->str));
387*5d465952Smartinh 		pfx->len = n;
388*5d465952Smartinh 		bcopy(max->str, pfx->str, n);
389*5d465952Smartinh 	}
390*5d465952Smartinh }
391*5d465952Smartinh 
392*5d465952Smartinh static void
393*5d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)
394*5d465952Smartinh {
395*5d465952Smartinh 	if (pfxlen == 0 || bt->cmp != NULL)
396*5d465952Smartinh 		return;
397*5d465952Smartinh 
398*5d465952Smartinh 	DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen,
399*5d465952Smartinh 	    (int)key->size, (char *)key->data);
400*5d465952Smartinh 	assert(pfxlen <= key->size);
401*5d465952Smartinh 	key->size -= pfxlen;
402*5d465952Smartinh 	if (!F_ISSET(bt->flags, BT_REVERSEKEY))
403*5d465952Smartinh 		key->data = (char *)key->data + pfxlen;
404*5d465952Smartinh }
405*5d465952Smartinh 
406*5d465952Smartinh static void
407*5d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,
408*5d465952Smartinh     struct btkey *expkey)
409*5d465952Smartinh {
410*5d465952Smartinh 	struct node	*node;
411*5d465952Smartinh 
412*5d465952Smartinh 	node = NODEPTR(mp, indx);
413*5d465952Smartinh 	expkey->len = sizeof(expkey->str);
414*5d465952Smartinh 	concat_prefix(bt, mp->prefix.str, mp->prefix.len,
415*5d465952Smartinh 	    NODEKEY(node), node->ksize, expkey->str, &expkey->len);
416*5d465952Smartinh }
417*5d465952Smartinh 
418*5d465952Smartinh static int
419*5d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,
420*5d465952Smartinh     struct btkey *pfx)
421*5d465952Smartinh {
422*5d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY))
423*5d465952Smartinh 		return memnrcmp(key1->data, key1->size - pfx->len,
424*5d465952Smartinh 		    key2->data, key2->size);
425*5d465952Smartinh 	else
426*5d465952Smartinh 		return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,
427*5d465952Smartinh 		    key2->data, key2->size);
428*5d465952Smartinh }
429*5d465952Smartinh 
430*5d465952Smartinh void
431*5d465952Smartinh btval_reset(struct btval *btv)
432*5d465952Smartinh {
433*5d465952Smartinh 	if (btv) {
434*5d465952Smartinh 		if (btv->mp)
435*5d465952Smartinh 			btv->mp->ref--;
436*5d465952Smartinh 		if (btv->free_data)
437*5d465952Smartinh 			free(btv->data);
438*5d465952Smartinh 		bzero(btv, sizeof(*btv));
439*5d465952Smartinh 	}
440*5d465952Smartinh }
441*5d465952Smartinh 
442*5d465952Smartinh static int
443*5d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b)
444*5d465952Smartinh {
445*5d465952Smartinh 	if (a->pgno > b->pgno)
446*5d465952Smartinh 		return 1;
447*5d465952Smartinh 	if (a->pgno < b->pgno)
448*5d465952Smartinh 		return -1;
449*5d465952Smartinh 	return 0;
450*5d465952Smartinh }
451*5d465952Smartinh 
452*5d465952Smartinh static struct mpage *
453*5d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno)
454*5d465952Smartinh {
455*5d465952Smartinh 	struct mpage	 find, *mp;
456*5d465952Smartinh 
457*5d465952Smartinh 	find.pgno = pgno;
458*5d465952Smartinh 	mp = RB_FIND(page_cache, bt->page_cache, &find);
459*5d465952Smartinh 	if (mp) {
460*5d465952Smartinh 		bt->stat.hits++;
461*5d465952Smartinh 		/* Update LRU queue. Move page to the end. */
462*5d465952Smartinh 		TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
463*5d465952Smartinh 		TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
464*5d465952Smartinh 	}
465*5d465952Smartinh 	return mp;
466*5d465952Smartinh }
467*5d465952Smartinh 
468*5d465952Smartinh static void
469*5d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp)
470*5d465952Smartinh {
471*5d465952Smartinh 	assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL);
472*5d465952Smartinh 	bt->stat.cache_size++;
473*5d465952Smartinh 	TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
474*5d465952Smartinh }
475*5d465952Smartinh 
476*5d465952Smartinh static void
477*5d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp)
478*5d465952Smartinh {
479*5d465952Smartinh 	assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp);
480*5d465952Smartinh 	assert(bt->stat.cache_size > 0);
481*5d465952Smartinh 	bt->stat.cache_size--;
482*5d465952Smartinh 	TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
483*5d465952Smartinh }
484*5d465952Smartinh 
485*5d465952Smartinh static struct mpage *
486*5d465952Smartinh mpage_copy(struct mpage *mp)
487*5d465952Smartinh {
488*5d465952Smartinh 	struct mpage	*copy;
489*5d465952Smartinh 
490*5d465952Smartinh 	if ((copy = calloc(1, sizeof(*copy))) == NULL)
491*5d465952Smartinh 		return NULL;
492*5d465952Smartinh 	bcopy(&mp->page, &copy->page, sizeof(mp->page));
493*5d465952Smartinh 	bcopy(&mp->prefix, &copy->prefix, sizeof(mp->prefix));
494*5d465952Smartinh 	copy->parent = mp->parent;
495*5d465952Smartinh 	copy->parent_index = mp->parent_index;
496*5d465952Smartinh 	copy->pgno = mp->pgno;
497*5d465952Smartinh 	copy->pgno = mp->pgno;
498*5d465952Smartinh 
499*5d465952Smartinh 	return copy;
500*5d465952Smartinh }
501*5d465952Smartinh 
502*5d465952Smartinh /* Remove the least recently used memory pages until the cache size is
503*5d465952Smartinh  * within the configured bounds. Pages referenced by cursors or returned
504*5d465952Smartinh  * key/data are not pruned.
505*5d465952Smartinh  */
506*5d465952Smartinh static void
507*5d465952Smartinh mpage_prune(struct btree *bt)
508*5d465952Smartinh {
509*5d465952Smartinh 	struct mpage	*mp, *next;
510*5d465952Smartinh 
511*5d465952Smartinh 	for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) {
512*5d465952Smartinh 		if (bt->stat.cache_size <= bt->stat.max_cache)
513*5d465952Smartinh 			break;
514*5d465952Smartinh 		next = TAILQ_NEXT(mp, lru_next);
515*5d465952Smartinh 		if (!mp->dirty && mp->ref <= 0) {
516*5d465952Smartinh 			mpage_del(bt, mp);
517*5d465952Smartinh 			free(mp);
518*5d465952Smartinh 		}
519*5d465952Smartinh 	}
520*5d465952Smartinh }
521*5d465952Smartinh 
522*5d465952Smartinh /* Mark a page as dirty and push it on the dirty queue.
523*5d465952Smartinh  */
524*5d465952Smartinh static void
525*5d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp)
526*5d465952Smartinh {
527*5d465952Smartinh 	assert(bt != NULL);
528*5d465952Smartinh 	assert(bt->txn != NULL);
529*5d465952Smartinh 
530*5d465952Smartinh 	if (!mp->dirty) {
531*5d465952Smartinh 		mp->dirty = 1;
532*5d465952Smartinh 		SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
533*5d465952Smartinh 	}
534*5d465952Smartinh }
535*5d465952Smartinh 
536*5d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno.
537*5d465952Smartinh  */
538*5d465952Smartinh static void
539*5d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp)
540*5d465952Smartinh {
541*5d465952Smartinh 	assert(bt != NULL);
542*5d465952Smartinh 	assert(bt->txn != NULL);
543*5d465952Smartinh 	assert(mp != NULL);
544*5d465952Smartinh 
545*5d465952Smartinh 	if (!mp->dirty) {
546*5d465952Smartinh 		DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
547*5d465952Smartinh 		if (mp->ref == 0)
548*5d465952Smartinh 			mpage_del(bt, mp);
549*5d465952Smartinh 		else
550*5d465952Smartinh 			mp = mpage_copy(mp);
551*5d465952Smartinh 		mp->pgno = mp->page.pgno = bt->txn->next_pgno++;
552*5d465952Smartinh 		mpage_dirty(bt, mp);
553*5d465952Smartinh 		mpage_add(bt, mp);
554*5d465952Smartinh 
555*5d465952Smartinh 		/* Update the page number to new touched page. */
556*5d465952Smartinh 		if (mp->parent != NULL)
557*5d465952Smartinh 			NODEPGNO(NODEPTR(mp->parent,
558*5d465952Smartinh 			    mp->parent_index)) = mp->pgno;
559*5d465952Smartinh 	}
560*5d465952Smartinh }
561*5d465952Smartinh 
562*5d465952Smartinh static int
563*5d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
564*5d465952Smartinh {
565*5d465952Smartinh 	ssize_t		 rc;
566*5d465952Smartinh 
567*5d465952Smartinh 	DPRINTF("reading page %u", pgno);
568*5d465952Smartinh 	bt->stat.reads++;
569*5d465952Smartinh 	if ((rc = pread(bt->fd, page, PAGESIZE, (off_t)pgno*PAGESIZE)) == 0) {
570*5d465952Smartinh 		DPRINTF("page %u doesn't exist", pgno);
571*5d465952Smartinh 		return BT_NOTFOUND;
572*5d465952Smartinh 	}
573*5d465952Smartinh 	else if (rc != PAGESIZE) {
574*5d465952Smartinh 		DPRINTF("read: %s", strerror(errno));
575*5d465952Smartinh 		return BT_FAIL;
576*5d465952Smartinh 	}
577*5d465952Smartinh 
578*5d465952Smartinh 	if (page->pgno != pgno) {
579*5d465952Smartinh 		DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
580*5d465952Smartinh 		errno = EINVAL;
581*5d465952Smartinh 		return BT_FAIL;
582*5d465952Smartinh 	}
583*5d465952Smartinh 
584*5d465952Smartinh 	return BT_SUCCESS;
585*5d465952Smartinh }
586*5d465952Smartinh 
587*5d465952Smartinh int
588*5d465952Smartinh btree_sync(struct btree *bt)
589*5d465952Smartinh {
590*5d465952Smartinh 	if (!F_ISSET(bt->flags, BT_NOSYNC))
591*5d465952Smartinh 		return fsync(bt->fd);
592*5d465952Smartinh 	return 0;
593*5d465952Smartinh }
594*5d465952Smartinh 
595*5d465952Smartinh struct btree_txn *
596*5d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly)
597*5d465952Smartinh {
598*5d465952Smartinh 	int			 rc;
599*5d465952Smartinh 	struct btree_txn	*txn;
600*5d465952Smartinh 
601*5d465952Smartinh 	if (!rdonly && bt->txn != NULL) {
602*5d465952Smartinh 		DPRINTF("write transaction already begun");
603*5d465952Smartinh 		return NULL;
604*5d465952Smartinh 	}
605*5d465952Smartinh 
606*5d465952Smartinh 	if ((txn = calloc(1, sizeof(*txn))) == NULL) {
607*5d465952Smartinh 		DPRINTF("calloc: %s", strerror(errno));
608*5d465952Smartinh 		return NULL;
609*5d465952Smartinh 	}
610*5d465952Smartinh 
611*5d465952Smartinh 	if (rdonly) {
612*5d465952Smartinh 		txn->flags |= BT_TXN_RDONLY;
613*5d465952Smartinh 	} else {
614*5d465952Smartinh 		txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
615*5d465952Smartinh 		if (txn->dirty_queue == NULL) {
616*5d465952Smartinh 			free(txn);
617*5d465952Smartinh 			return NULL;
618*5d465952Smartinh 		}
619*5d465952Smartinh 		SIMPLEQ_INIT(txn->dirty_queue);
620*5d465952Smartinh 
621*5d465952Smartinh 		DPRINTF("taking write lock on txn %p", txn);
622*5d465952Smartinh 		if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
623*5d465952Smartinh 			free(txn->dirty_queue);
624*5d465952Smartinh 			free(txn);
625*5d465952Smartinh 			return NULL;
626*5d465952Smartinh 		}
627*5d465952Smartinh 		bt->txn = txn;
628*5d465952Smartinh 	}
629*5d465952Smartinh 
630*5d465952Smartinh 	txn->bt = bt;
631*5d465952Smartinh 	bt->ref++;
632*5d465952Smartinh 
633*5d465952Smartinh 	if ((rc = btree_read_meta(bt, &txn->next_pgno)) == BT_FAIL) {
634*5d465952Smartinh 		btree_txn_abort(txn);
635*5d465952Smartinh 		return NULL;
636*5d465952Smartinh 	}
637*5d465952Smartinh 
638*5d465952Smartinh 	txn->root = bt->meta->root;
639*5d465952Smartinh 	DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);
640*5d465952Smartinh 
641*5d465952Smartinh 	return txn;
642*5d465952Smartinh }
643*5d465952Smartinh 
644*5d465952Smartinh void
645*5d465952Smartinh btree_txn_abort(struct btree_txn *txn)
646*5d465952Smartinh {
647*5d465952Smartinh 	struct mpage	*mp;
648*5d465952Smartinh 	struct btree	*bt;
649*5d465952Smartinh 
650*5d465952Smartinh 	if (txn == NULL)
651*5d465952Smartinh 		return;
652*5d465952Smartinh 
653*5d465952Smartinh 	bt = txn->bt;
654*5d465952Smartinh 	DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);
655*5d465952Smartinh 
656*5d465952Smartinh 	/* Discard all dirty pages.
657*5d465952Smartinh 	 */
658*5d465952Smartinh 	while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
659*5d465952Smartinh 		mp = SIMPLEQ_FIRST(txn->dirty_queue);
660*5d465952Smartinh 		assert(mp->ref == 0);		/* cursors should be closed */
661*5d465952Smartinh 		mpage_del(bt, mp);
662*5d465952Smartinh 		SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
663*5d465952Smartinh 	}
664*5d465952Smartinh 
665*5d465952Smartinh 	if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
666*5d465952Smartinh 		DPRINTF("releasing write lock on txn %p", txn);
667*5d465952Smartinh 		txn->bt->txn = NULL;
668*5d465952Smartinh 		if (flock(txn->bt->fd, LOCK_UN) != 0) {
669*5d465952Smartinh 			DPRINTF("failed to unlock fd %d: %s",
670*5d465952Smartinh 			    txn->bt->fd, strerror(errno));
671*5d465952Smartinh 		}
672*5d465952Smartinh 		free(txn->dirty_queue);
673*5d465952Smartinh 	}
674*5d465952Smartinh 
675*5d465952Smartinh 	btree_close(txn->bt);
676*5d465952Smartinh 	free(txn);
677*5d465952Smartinh }
678*5d465952Smartinh 
679*5d465952Smartinh int
680*5d465952Smartinh btree_txn_commit(struct btree_txn *txn)
681*5d465952Smartinh {
682*5d465952Smartinh 	int		 n, done;
683*5d465952Smartinh 	ssize_t		 rc;
684*5d465952Smartinh 	off_t		 size;
685*5d465952Smartinh 	struct mpage	*mp;
686*5d465952Smartinh 	struct btree	*bt;
687*5d465952Smartinh 	struct iovec	 iov[BT_COMMIT_PAGES];
688*5d465952Smartinh 
689*5d465952Smartinh 	assert(txn != NULL);
690*5d465952Smartinh 	assert(txn->bt != NULL);
691*5d465952Smartinh 
692*5d465952Smartinh  	bt = txn->bt;
693*5d465952Smartinh 
694*5d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
695*5d465952Smartinh 		DPRINTF("attempt to commit read-only transaction");
696*5d465952Smartinh 		btree_txn_abort(txn);
697*5d465952Smartinh 		return BT_FAIL;
698*5d465952Smartinh 	}
699*5d465952Smartinh 
700*5d465952Smartinh 	if (txn != bt->txn) {
701*5d465952Smartinh 		DPRINTF("attempt to commit unknown transaction");
702*5d465952Smartinh 		btree_txn_abort(txn);
703*5d465952Smartinh 		return BT_FAIL;
704*5d465952Smartinh 	}
705*5d465952Smartinh 
706*5d465952Smartinh 	if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
707*5d465952Smartinh 		DPRINTF("error flag is set, can't commit");
708*5d465952Smartinh 		btree_txn_abort(txn);
709*5d465952Smartinh 		return BT_FAIL;
710*5d465952Smartinh 	}
711*5d465952Smartinh 
712*5d465952Smartinh 	if (SIMPLEQ_EMPTY(txn->dirty_queue))
713*5d465952Smartinh 		goto done;
714*5d465952Smartinh 
715*5d465952Smartinh 	if (F_ISSET(bt->flags, BT_FIXPADDING)) {
716*5d465952Smartinh 		size = lseek(bt->fd, 0, SEEK_END);
717*5d465952Smartinh 		size += PAGESIZE - (size % PAGESIZE);
718*5d465952Smartinh 		DPRINTF("extending to multiple of page size: %llu", size);
719*5d465952Smartinh 		if (ftruncate(bt->fd, size) != 0) {
720*5d465952Smartinh 			DPRINTF("ftruncate: %s", strerror(errno));
721*5d465952Smartinh 			btree_txn_abort(txn);
722*5d465952Smartinh 			return BT_FAIL;
723*5d465952Smartinh 		}
724*5d465952Smartinh 		bt->flags &= ~BT_FIXPADDING;
725*5d465952Smartinh 	}
726*5d465952Smartinh 
727*5d465952Smartinh 	DPRINTF("committing transaction on btree %p, root page %u",
728*5d465952Smartinh 	    bt, txn->root);
729*5d465952Smartinh 
730*5d465952Smartinh 	/* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
731*5d465952Smartinh 	 */
732*5d465952Smartinh 	do {
733*5d465952Smartinh 		n = 0;
734*5d465952Smartinh 		done = 1;
735*5d465952Smartinh 		SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
736*5d465952Smartinh 			DPRINTF("commiting page %u", mp->pgno);
737*5d465952Smartinh 			iov[n].iov_len = PAGESIZE;
738*5d465952Smartinh 			iov[n].iov_base = &mp->page;
739*5d465952Smartinh 			if (++n >= BT_COMMIT_PAGES) {
740*5d465952Smartinh 				done = 0;
741*5d465952Smartinh 				break;
742*5d465952Smartinh 			}
743*5d465952Smartinh 		}
744*5d465952Smartinh 
745*5d465952Smartinh 		if (n == 0)
746*5d465952Smartinh 			break;
747*5d465952Smartinh 
748*5d465952Smartinh 		DPRINTF("commiting %u dirty pages", n);
749*5d465952Smartinh 		rc = writev(bt->fd, iov, n);
750*5d465952Smartinh 		if (rc != PAGESIZE*n) {
751*5d465952Smartinh 			if (rc > 0)
752*5d465952Smartinh 				DPRINTF("short write, filesystem full?");
753*5d465952Smartinh 			else
754*5d465952Smartinh 				DPRINTF("writev: %s", strerror(errno));
755*5d465952Smartinh 			btree_txn_abort(txn);
756*5d465952Smartinh 			return BT_FAIL;
757*5d465952Smartinh 		}
758*5d465952Smartinh 
759*5d465952Smartinh 		/* Remove the dirty flag from the written pages.
760*5d465952Smartinh 		 */
761*5d465952Smartinh 		while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
762*5d465952Smartinh 			mp = SIMPLEQ_FIRST(txn->dirty_queue);
763*5d465952Smartinh 			mp->dirty = 0;
764*5d465952Smartinh 			SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
765*5d465952Smartinh 			if (--n == 0)
766*5d465952Smartinh 				break;
767*5d465952Smartinh 		}
768*5d465952Smartinh 	} while (!done);
769*5d465952Smartinh 
770*5d465952Smartinh 	if (btree_sync(bt) != 0 ||
771*5d465952Smartinh 	    btree_write_meta(bt, txn->root) != BT_SUCCESS ||
772*5d465952Smartinh 	    btree_sync(bt) != 0) {
773*5d465952Smartinh 		btree_txn_abort(txn);
774*5d465952Smartinh 		return BT_FAIL;
775*5d465952Smartinh 	}
776*5d465952Smartinh 
777*5d465952Smartinh done:
778*5d465952Smartinh 	mpage_prune(bt);
779*5d465952Smartinh 	btree_txn_abort(txn);
780*5d465952Smartinh 
781*5d465952Smartinh 	return BT_SUCCESS;
782*5d465952Smartinh }
783*5d465952Smartinh 
784*5d465952Smartinh static int
785*5d465952Smartinh btree_write_meta(struct btree *bt, pgno_t root)
786*5d465952Smartinh {
787*5d465952Smartinh 	ssize_t		 rc;
788*5d465952Smartinh 
789*5d465952Smartinh 	DPRINTF("writing meta page for root page %u", root);
790*5d465952Smartinh 
791*5d465952Smartinh 	assert(bt != NULL);
792*5d465952Smartinh 	assert(bt->txn != NULL);
793*5d465952Smartinh 	assert(bt->metapage != NULL);
794*5d465952Smartinh 	assert(bt->meta != NULL);
795*5d465952Smartinh 
796*5d465952Smartinh 	bt->meta->prev_meta = bt->metapage->pgno;
797*5d465952Smartinh 	bt->metapage->pgno = bt->txn->next_pgno++;
798*5d465952Smartinh 
799*5d465952Smartinh 	bt->meta->root = root;
800*5d465952Smartinh 	bt->meta->created_at = time(0);
801*5d465952Smartinh 	bt->meta->revisions++;
802*5d465952Smartinh 	SHA1((unsigned char *)bt->meta, METAHASHLEN, bt->meta->hash);
803*5d465952Smartinh 
804*5d465952Smartinh 	if ((rc = write(bt->fd, bt->metapage, PAGESIZE)) != PAGESIZE) {
805*5d465952Smartinh 		if (rc > 0)
806*5d465952Smartinh 			DPRINTF("short write, filesystem full?");
807*5d465952Smartinh 		return BT_FAIL;
808*5d465952Smartinh 	}
809*5d465952Smartinh 
810*5d465952Smartinh 	if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
811*5d465952Smartinh 		DPRINTF("failed to update file size: %s", strerror(errno));
812*5d465952Smartinh 		bt->size = 0;
813*5d465952Smartinh 	}
814*5d465952Smartinh 
815*5d465952Smartinh 	return BT_SUCCESS;
816*5d465952Smartinh }
817*5d465952Smartinh 
818*5d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise.
819*5d465952Smartinh  */
820*5d465952Smartinh static int
821*5d465952Smartinh btree_is_meta_page(struct page *p)
822*5d465952Smartinh {
823*5d465952Smartinh 	struct bt_meta	*m;
824*5d465952Smartinh 	unsigned char	 hash[SHA_DIGEST_LENGTH];
825*5d465952Smartinh 
826*5d465952Smartinh 	m = METADATA(p);
827*5d465952Smartinh 	if (!F_ISSET(p->flags, P_META)) {
828*5d465952Smartinh 		DPRINTF("page %d not a meta page", p->pgno);
829*5d465952Smartinh 		return 0;
830*5d465952Smartinh 	}
831*5d465952Smartinh 
832*5d465952Smartinh 	if (m->magic != BT_MAGIC) {
833*5d465952Smartinh 		DPRINTF("page %d has invalid magic", p->pgno);
834*5d465952Smartinh 		return 0;
835*5d465952Smartinh 	}
836*5d465952Smartinh 
837*5d465952Smartinh 	if (m->root >= p->pgno && m->root != P_INVALID) {
838*5d465952Smartinh 		DPRINTF("page %d points to an invalid root page", p->pgno);
839*5d465952Smartinh 		return 0;
840*5d465952Smartinh 	}
841*5d465952Smartinh 
842*5d465952Smartinh 	SHA1((unsigned char *)m, METAHASHLEN, hash);
843*5d465952Smartinh 	if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) {
844*5d465952Smartinh 		DPRINTF("page %d has an invalid digest", p->pgno);
845*5d465952Smartinh 		return 0;
846*5d465952Smartinh 	}
847*5d465952Smartinh 
848*5d465952Smartinh 	return 1;
849*5d465952Smartinh }
850*5d465952Smartinh 
851*5d465952Smartinh static void
852*5d465952Smartinh btree_init_meta(struct btree *bt)
853*5d465952Smartinh {
854*5d465952Smartinh 	bt->metapage->flags = P_META;
855*5d465952Smartinh 	bt->meta = METADATA(bt->metapage);
856*5d465952Smartinh 	bt->meta->magic = BT_MAGIC;
857*5d465952Smartinh 	bt->meta->version = BT_VERSION;
858*5d465952Smartinh 	bt->meta->flags = 0;
859*5d465952Smartinh 	bt->meta->psize = PAGESIZE;
860*5d465952Smartinh 	bt->meta->root = P_INVALID;
861*5d465952Smartinh }
862*5d465952Smartinh 
863*5d465952Smartinh static int
864*5d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next)
865*5d465952Smartinh {
866*5d465952Smartinh 	pgno_t		 meta_pgno, next_pgno;
867*5d465952Smartinh 	off_t		 size;
868*5d465952Smartinh 	int		 rc;
869*5d465952Smartinh 
870*5d465952Smartinh 	assert(bt != NULL);
871*5d465952Smartinh 
872*5d465952Smartinh 	if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
873*5d465952Smartinh 		goto fail;
874*5d465952Smartinh 
875*5d465952Smartinh 	DPRINTF("btree_read_meta: size = %llu", size);
876*5d465952Smartinh 
877*5d465952Smartinh 	if (size < bt->size) {
878*5d465952Smartinh 		DPRINTF("file has shrunk!");
879*5d465952Smartinh 		errno = EIO;
880*5d465952Smartinh 		goto fail;
881*5d465952Smartinh 	}
882*5d465952Smartinh 
883*5d465952Smartinh 	if (size == 0) {
884*5d465952Smartinh 		if (p_next != NULL)
885*5d465952Smartinh 			*p_next = 0;
886*5d465952Smartinh 		return BT_NOTFOUND;		/* new file */
887*5d465952Smartinh 	}
888*5d465952Smartinh 
889*5d465952Smartinh 	next_pgno = size / PAGESIZE;
890*5d465952Smartinh 	if (next_pgno == 0) {
891*5d465952Smartinh 		DPRINTF("corrupt file");
892*5d465952Smartinh 		errno = EIO;
893*5d465952Smartinh 		goto fail;
894*5d465952Smartinh 	}
895*5d465952Smartinh 
896*5d465952Smartinh 	meta_pgno = next_pgno - 1;
897*5d465952Smartinh 
898*5d465952Smartinh 	if (size % PAGESIZE != 0) {
899*5d465952Smartinh 		DPRINTF("filesize not a multiple of the page size!");
900*5d465952Smartinh 		bt->flags |= BT_FIXPADDING;
901*5d465952Smartinh 		next_pgno++;
902*5d465952Smartinh 	}
903*5d465952Smartinh 
904*5d465952Smartinh 	if (p_next != NULL)
905*5d465952Smartinh 		*p_next = next_pgno;
906*5d465952Smartinh 
907*5d465952Smartinh 	if (size == bt->size) {
908*5d465952Smartinh 		DPRINTF("size unchanged, keeping current meta page");
909*5d465952Smartinh 		return BT_SUCCESS;
910*5d465952Smartinh 	}
911*5d465952Smartinh 	bt->size = size;
912*5d465952Smartinh 
913*5d465952Smartinh 	while (meta_pgno > 0) {
914*5d465952Smartinh 		rc = btree_read_page(bt, meta_pgno, bt->metapage);
915*5d465952Smartinh 		if (rc == BT_NOTFOUND)
916*5d465952Smartinh 			break;		/* no meta page found */
917*5d465952Smartinh 		if (rc == BT_SUCCESS && btree_is_meta_page(bt->metapage)) {
918*5d465952Smartinh 			bt->meta = METADATA(bt->metapage);
919*5d465952Smartinh 			return BT_SUCCESS;
920*5d465952Smartinh 		}
921*5d465952Smartinh 		--meta_pgno;	/* scan backwards to first valid meta page */
922*5d465952Smartinh 	}
923*5d465952Smartinh 
924*5d465952Smartinh 	errno = EIO;
925*5d465952Smartinh fail:
926*5d465952Smartinh 	if (p_next != NULL)
927*5d465952Smartinh 		*p_next = P_INVALID;
928*5d465952Smartinh 	return BT_FAIL;
929*5d465952Smartinh }
930*5d465952Smartinh 
931*5d465952Smartinh struct btree *
932*5d465952Smartinh btree_open_fd(int fd, uint32_t flags)
933*5d465952Smartinh {
934*5d465952Smartinh 	int		 fl;
935*5d465952Smartinh 	struct btree	*bt;
936*5d465952Smartinh 
937*5d465952Smartinh 	fl = fcntl(fd, F_GETFL, 0);
938*5d465952Smartinh 	if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)
939*5d465952Smartinh 		return NULL;
940*5d465952Smartinh 
941*5d465952Smartinh 	if ((bt = calloc(1, sizeof(*bt))) == NULL)
942*5d465952Smartinh 		return NULL;
943*5d465952Smartinh 	bt->fd = fd;
944*5d465952Smartinh 	bt->flags = flags;
945*5d465952Smartinh 	bt->flags &= ~BT_FIXPADDING;
946*5d465952Smartinh 
947*5d465952Smartinh 	if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)
948*5d465952Smartinh 		goto fail;
949*5d465952Smartinh 	bt->stat.max_cache = BT_MAXCACHE_DEF;
950*5d465952Smartinh 	RB_INIT(bt->page_cache);
951*5d465952Smartinh 
952*5d465952Smartinh 	if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)
953*5d465952Smartinh 		goto fail;
954*5d465952Smartinh 	TAILQ_INIT(bt->lru_queue);
955*5d465952Smartinh 
956*5d465952Smartinh 	if ((bt->metapage = calloc(1, PAGESIZE)) == NULL)
957*5d465952Smartinh 		goto fail;
958*5d465952Smartinh 
959*5d465952Smartinh 	if (btree_read_meta(bt, NULL) == BT_FAIL)
960*5d465952Smartinh 		goto fail;
961*5d465952Smartinh 
962*5d465952Smartinh 	if (bt->meta == NULL) {
963*5d465952Smartinh 		DPRINTF("new database");
964*5d465952Smartinh 		btree_init_meta(bt);
965*5d465952Smartinh 	} else if (bt->meta->version != BT_VERSION) {
966*5d465952Smartinh 		DPRINTF("database is version %u, expected version %u",
967*5d465952Smartinh 		    bt->meta->version, BT_VERSION);
968*5d465952Smartinh 		errno = EINVAL;
969*5d465952Smartinh 		goto fail;
970*5d465952Smartinh 	} else {
971*5d465952Smartinh 		DPRINTF("opened database version %u, pagesize %u",
972*5d465952Smartinh 		    bt->meta->version, bt->meta->psize);
973*5d465952Smartinh 		DPRINTF("timestamp: %s", ctime(&bt->meta->created_at));
974*5d465952Smartinh 		DPRINTF("depth: %u", bt->meta->depth);
975*5d465952Smartinh 		DPRINTF("entries: %llu", bt->meta->entries);
976*5d465952Smartinh 		DPRINTF("revisions: %u", bt->meta->revisions);
977*5d465952Smartinh 		DPRINTF("branch pages: %u", bt->meta->branch_pages);
978*5d465952Smartinh 		DPRINTF("leaf pages: %u", bt->meta->leaf_pages);
979*5d465952Smartinh 		DPRINTF("overflow pages: %u", bt->meta->overflow_pages);
980*5d465952Smartinh 		DPRINTF("root: %u", bt->meta->root);
981*5d465952Smartinh 		DPRINTF("previous meta page: %u", bt->meta->prev_meta);
982*5d465952Smartinh 	}
983*5d465952Smartinh 
984*5d465952Smartinh 	return bt;
985*5d465952Smartinh 
986*5d465952Smartinh fail:
987*5d465952Smartinh 	free(bt->lru_queue);
988*5d465952Smartinh 	free(bt->page_cache);
989*5d465952Smartinh 	free(bt->metapage);
990*5d465952Smartinh 	free(bt);
991*5d465952Smartinh 	return NULL;
992*5d465952Smartinh }
993*5d465952Smartinh 
994*5d465952Smartinh struct btree *
995*5d465952Smartinh btree_open(const char *path, uint32_t flags, mode_t mode)
996*5d465952Smartinh {
997*5d465952Smartinh 	int		 fd, oflags;
998*5d465952Smartinh 	struct btree	*bt;
999*5d465952Smartinh 
1000*5d465952Smartinh 	if (F_ISSET(flags, BT_RDONLY))
1001*5d465952Smartinh 		oflags = O_RDONLY;
1002*5d465952Smartinh 	else
1003*5d465952Smartinh 		oflags = O_RDWR | O_CREAT | O_APPEND;
1004*5d465952Smartinh 
1005*5d465952Smartinh 	if ((fd = open(path, oflags, mode)) == -1)
1006*5d465952Smartinh 		return NULL;
1007*5d465952Smartinh 
1008*5d465952Smartinh 	if ((bt = btree_open_fd(fd, flags)) == NULL)
1009*5d465952Smartinh 		close(fd);
1010*5d465952Smartinh 	else {
1011*5d465952Smartinh 		bt->path = strdup(path);
1012*5d465952Smartinh 		bt->ref = 1;
1013*5d465952Smartinh 		DPRINTF("opened btree %p", bt);
1014*5d465952Smartinh 	}
1015*5d465952Smartinh 
1016*5d465952Smartinh 	return bt;
1017*5d465952Smartinh }
1018*5d465952Smartinh 
1019*5d465952Smartinh void
1020*5d465952Smartinh btree_close(struct btree *bt)
1021*5d465952Smartinh {
1022*5d465952Smartinh 	struct mpage	*mp;
1023*5d465952Smartinh 
1024*5d465952Smartinh 	if (bt != NULL && --bt->ref == 0) {
1025*5d465952Smartinh 		DPRINTF("ref is zero, closing btree %p", bt);
1026*5d465952Smartinh 		close(bt->fd);
1027*5d465952Smartinh 
1028*5d465952Smartinh 		/* Free page_cache. */
1029*5d465952Smartinh 		while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) {
1030*5d465952Smartinh 			mpage_del(bt, mp);
1031*5d465952Smartinh 			free(mp);
1032*5d465952Smartinh 		}
1033*5d465952Smartinh 		free(bt->page_cache);
1034*5d465952Smartinh 		free(bt);
1035*5d465952Smartinh 	}
1036*5d465952Smartinh }
1037*5d465952Smartinh 
1038*5d465952Smartinh /* Search for key within a leaf page, using binary search.
1039*5d465952Smartinh  * Returns the smallest entry larger or equal to the key.
1040*5d465952Smartinh  * If exactp is non-null, stores whether the found entry was an exact match
1041*5d465952Smartinh  * in *exactp (1 or 0).
1042*5d465952Smartinh  * If kip is non-null, stores the index of the found entry in *kip.
1043*5d465952Smartinh  * If no entry larger of equal to the key is found, returns NULL.
1044*5d465952Smartinh  */
1045*5d465952Smartinh static struct node *
1046*5d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
1047*5d465952Smartinh     int *exactp, unsigned int *kip)
1048*5d465952Smartinh {
1049*5d465952Smartinh 	unsigned int	 i = 0;
1050*5d465952Smartinh 	int		 low, high;
1051*5d465952Smartinh 	int		 rc = 0;
1052*5d465952Smartinh 	struct node	*node;
1053*5d465952Smartinh 	struct btval	 nodekey;
1054*5d465952Smartinh 
1055*5d465952Smartinh 	DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]",
1056*5d465952Smartinh 	    NUMKEYS(mp),
1057*5d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
1058*5d465952Smartinh 	    mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);
1059*5d465952Smartinh 
1060*5d465952Smartinh 	assert(NUMKEYS(mp) > 0);
1061*5d465952Smartinh 
1062*5d465952Smartinh 	bzero(&nodekey, sizeof(nodekey));
1063*5d465952Smartinh 
1064*5d465952Smartinh 	low = IS_LEAF(mp) ? 0 : 1;
1065*5d465952Smartinh 	high = NUMKEYS(mp) - 1;
1066*5d465952Smartinh 	while (low <= high) {
1067*5d465952Smartinh 		i = (low + high) >> 1;
1068*5d465952Smartinh 		node = NODEPTR(mp, i);
1069*5d465952Smartinh 
1070*5d465952Smartinh 		nodekey.size = node->ksize;
1071*5d465952Smartinh 		nodekey.data = NODEKEY(node);
1072*5d465952Smartinh 
1073*5d465952Smartinh 		if (bt->cmp)
1074*5d465952Smartinh 			rc = bt->cmp(key, &nodekey);
1075*5d465952Smartinh 		else
1076*5d465952Smartinh 			rc = bt_cmp(bt, key, &nodekey, &mp->prefix);
1077*5d465952Smartinh 
1078*5d465952Smartinh 		if (IS_LEAF(mp))
1079*5d465952Smartinh 			DPRINTF("found leaf index %u [%.*s], rc = %i",
1080*5d465952Smartinh 			    i, (int)nodekey.size, (char *)nodekey.data, rc);
1081*5d465952Smartinh 		else
1082*5d465952Smartinh 			DPRINTF("found branch index %u [%.*s -> %u], rc = %i",
1083*5d465952Smartinh 			    i, (int)node->ksize, (char *)NODEKEY(node),
1084*5d465952Smartinh 			    node->n_pgno, rc);
1085*5d465952Smartinh 
1086*5d465952Smartinh 		if (rc == 0)
1087*5d465952Smartinh 			break;
1088*5d465952Smartinh 		if (rc > 0)
1089*5d465952Smartinh 			low = i + 1;
1090*5d465952Smartinh 		else
1091*5d465952Smartinh 			high = i - 1;
1092*5d465952Smartinh 	}
1093*5d465952Smartinh 
1094*5d465952Smartinh 	if (rc > 0) {	/* Found entry is less than the key. */
1095*5d465952Smartinh 		i++;	/* Skip to get the smallest entry larger than key. */
1096*5d465952Smartinh 		if (i >= NUMKEYS(mp))
1097*5d465952Smartinh 			/* There is no entry larger or equal to the key. */
1098*5d465952Smartinh 			return NULL;
1099*5d465952Smartinh 	}
1100*5d465952Smartinh 	if (exactp)
1101*5d465952Smartinh 		*exactp = (rc == 0);
1102*5d465952Smartinh 	if (kip)	/* Store the key index if requested. */
1103*5d465952Smartinh 		*kip = i;
1104*5d465952Smartinh 
1105*5d465952Smartinh 	return NODEPTR(mp, i);
1106*5d465952Smartinh }
1107*5d465952Smartinh 
1108*5d465952Smartinh static void
1109*5d465952Smartinh cursor_pop_page(struct cursor *cursor)
1110*5d465952Smartinh {
1111*5d465952Smartinh 	struct ppage	*top;
1112*5d465952Smartinh 
1113*5d465952Smartinh 	top = CURSOR_TOP(cursor);
1114*5d465952Smartinh 	CURSOR_POP(cursor);
1115*5d465952Smartinh 	top->mpage->ref--;
1116*5d465952Smartinh 
1117*5d465952Smartinh 	DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);
1118*5d465952Smartinh 
1119*5d465952Smartinh 	free(top);
1120*5d465952Smartinh }
1121*5d465952Smartinh 
1122*5d465952Smartinh static struct ppage *
1123*5d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp)
1124*5d465952Smartinh {
1125*5d465952Smartinh 	struct ppage	*ppage;
1126*5d465952Smartinh 
1127*5d465952Smartinh 	DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);
1128*5d465952Smartinh 
1129*5d465952Smartinh 	if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
1130*5d465952Smartinh 		return NULL;
1131*5d465952Smartinh 	ppage->mpage = mp;
1132*5d465952Smartinh 	mp->ref++;
1133*5d465952Smartinh 	CURSOR_PUSH(cursor, ppage);
1134*5d465952Smartinh 	return ppage;
1135*5d465952Smartinh }
1136*5d465952Smartinh 
1137*5d465952Smartinh static int
1138*5d465952Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno, struct mpage **mpp)
1139*5d465952Smartinh {
1140*5d465952Smartinh 	int		 rc;
1141*5d465952Smartinh 	struct mpage	*mp;
1142*5d465952Smartinh 
1143*5d465952Smartinh 	mp = mpage_lookup(bt, pgno);
1144*5d465952Smartinh 	if (mp == NULL) {
1145*5d465952Smartinh 		if ((mp = calloc(1, sizeof(*mp))) == NULL)
1146*5d465952Smartinh 			return BT_FAIL;
1147*5d465952Smartinh 		if ((rc = btree_read_page(bt, pgno, &mp->page)) != BT_SUCCESS) {
1148*5d465952Smartinh 			free(mp);
1149*5d465952Smartinh 			return rc;
1150*5d465952Smartinh 		}
1151*5d465952Smartinh 		mp->pgno = pgno;
1152*5d465952Smartinh 		mpage_add(bt, mp);
1153*5d465952Smartinh 	} else
1154*5d465952Smartinh 		DPRINTF("returning page %u from cache", pgno);
1155*5d465952Smartinh 
1156*5d465952Smartinh 	*mpp = mp;
1157*5d465952Smartinh 	return BT_SUCCESS;
1158*5d465952Smartinh }
1159*5d465952Smartinh 
1160*5d465952Smartinh static void
1161*5d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
1162*5d465952Smartinh     char *cs, size_t *cn)
1163*5d465952Smartinh {
1164*5d465952Smartinh 	assert(*cn >= n1 + n2);
1165*5d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
1166*5d465952Smartinh 		bcopy(s2, cs, n2);
1167*5d465952Smartinh 		bcopy(s1, cs + n2, n1);
1168*5d465952Smartinh 	} else {
1169*5d465952Smartinh 		bcopy(s1, cs, n1);
1170*5d465952Smartinh 		bcopy(s2, cs + n1, n2);
1171*5d465952Smartinh 	}
1172*5d465952Smartinh 	*cn = n1 + n2;
1173*5d465952Smartinh }
1174*5d465952Smartinh 
1175*5d465952Smartinh static void
1176*5d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp)
1177*5d465952Smartinh {
1178*5d465952Smartinh 	indx_t			 lbound = 0, ubound = 0;
1179*5d465952Smartinh 	struct mpage		*lp, *up;
1180*5d465952Smartinh 	struct btkey		 lprefix, uprefix;
1181*5d465952Smartinh 
1182*5d465952Smartinh 	mp->prefix.len = 0;
1183*5d465952Smartinh 	if (bt->cmp != NULL)
1184*5d465952Smartinh 		return;
1185*5d465952Smartinh 
1186*5d465952Smartinh 	lp = mp;
1187*5d465952Smartinh 	while (lp->parent != NULL) {
1188*5d465952Smartinh 		if (lp->parent_index > 0) {
1189*5d465952Smartinh 			lbound = lp->parent_index;
1190*5d465952Smartinh 			break;
1191*5d465952Smartinh 		}
1192*5d465952Smartinh 		lp = lp->parent;
1193*5d465952Smartinh 	}
1194*5d465952Smartinh 
1195*5d465952Smartinh 	up = mp;
1196*5d465952Smartinh 	while (up->parent != NULL) {
1197*5d465952Smartinh 		if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
1198*5d465952Smartinh 			ubound = up->parent_index + 1;
1199*5d465952Smartinh 			break;
1200*5d465952Smartinh 		}
1201*5d465952Smartinh 		up = up->parent;
1202*5d465952Smartinh 	}
1203*5d465952Smartinh 
1204*5d465952Smartinh 	if (lp->parent != NULL && up->parent != NULL) {
1205*5d465952Smartinh 		expand_prefix(bt, lp->parent, lbound, &lprefix);
1206*5d465952Smartinh 		expand_prefix(bt, up->parent, ubound, &uprefix);
1207*5d465952Smartinh 		common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
1208*5d465952Smartinh 	}
1209*5d465952Smartinh 	else if (mp->parent)
1210*5d465952Smartinh 		bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));
1211*5d465952Smartinh 
1212*5d465952Smartinh 	DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
1213*5d465952Smartinh 	    (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
1214*5d465952Smartinh }
1215*5d465952Smartinh 
1216*5d465952Smartinh static int
1217*5d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
1218*5d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
1219*5d465952Smartinh {
1220*5d465952Smartinh 	struct mpage	*mp, *parent;
1221*5d465952Smartinh 
1222*5d465952Smartinh 	if (cursor && cursor_push_page(cursor, root) == NULL)
1223*5d465952Smartinh 		return BT_FAIL;
1224*5d465952Smartinh 
1225*5d465952Smartinh 	mp = root;
1226*5d465952Smartinh 	while (IS_BRANCH(mp)) {
1227*5d465952Smartinh 		unsigned int	 i = 0;
1228*5d465952Smartinh 		struct node	*node;
1229*5d465952Smartinh 
1230*5d465952Smartinh 		DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
1231*5d465952Smartinh 		assert(NUMKEYS(mp) > 1);
1232*5d465952Smartinh 		node = NODEPTR(mp, 0);
1233*5d465952Smartinh 		DPRINTF("found index 0 to page %u", NODEPGNO(node));
1234*5d465952Smartinh 
1235*5d465952Smartinh 		if (key == NULL)	/* Initialize cursor to first page. */
1236*5d465952Smartinh 			i = 0;
1237*5d465952Smartinh 		else {
1238*5d465952Smartinh 			int	 exact;
1239*5d465952Smartinh 			node = btree_search_node(bt, mp, key, &exact, &i);
1240*5d465952Smartinh 			if (node == NULL)
1241*5d465952Smartinh 				i = NUMKEYS(mp) - 1;
1242*5d465952Smartinh 			else if (!exact) {
1243*5d465952Smartinh 				assert(i > 0);
1244*5d465952Smartinh 				i--;
1245*5d465952Smartinh 			}
1246*5d465952Smartinh 		}
1247*5d465952Smartinh 
1248*5d465952Smartinh 		if (key)
1249*5d465952Smartinh 			DPRINTF("following index %u for key %.*s",
1250*5d465952Smartinh 			    i, (int)key->size, (char *)key->data);
1251*5d465952Smartinh 		assert(i >= 0 && i < NUMKEYS(mp));
1252*5d465952Smartinh 		node = NODEPTR(mp, i);
1253*5d465952Smartinh 
1254*5d465952Smartinh 		if (cursor)
1255*5d465952Smartinh 			CURSOR_TOP(cursor)->ki = i;
1256*5d465952Smartinh 
1257*5d465952Smartinh 		parent = mp;
1258*5d465952Smartinh 		if (btree_get_mpage(bt, NODEPGNO(node), &mp) != BT_SUCCESS)
1259*5d465952Smartinh 			return BT_FAIL;
1260*5d465952Smartinh 		mp->parent = parent;
1261*5d465952Smartinh 		mp->parent_index = i;
1262*5d465952Smartinh 		find_common_prefix(bt, mp);
1263*5d465952Smartinh 
1264*5d465952Smartinh 		if (cursor && cursor_push_page(cursor, mp) == NULL)
1265*5d465952Smartinh 			return BT_FAIL;
1266*5d465952Smartinh 
1267*5d465952Smartinh 		if (modify)
1268*5d465952Smartinh 			mpage_touch(bt, mp);
1269*5d465952Smartinh 	}
1270*5d465952Smartinh 
1271*5d465952Smartinh 	if (!IS_LEAF(mp)) {
1272*5d465952Smartinh 		DPRINTF("internal error, index points to a %02X page!?",
1273*5d465952Smartinh 		    mp->page.flags);
1274*5d465952Smartinh 		return BT_FAIL;
1275*5d465952Smartinh 	}
1276*5d465952Smartinh 
1277*5d465952Smartinh 	DPRINTF("found leaf page %u for key %.*s", mp->pgno,
1278*5d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL);
1279*5d465952Smartinh 
1280*5d465952Smartinh 	*mpp = mp;
1281*5d465952Smartinh 	return BT_SUCCESS;
1282*5d465952Smartinh }
1283*5d465952Smartinh 
1284*5d465952Smartinh /* Search for the page a given key should be in.
1285*5d465952Smartinh  * Stores a pointer to the found page in *mpp.
1286*5d465952Smartinh  * If key is NULL, search for the lowest page (used by btree_cursor_first).
1287*5d465952Smartinh  * If cursor is non-null, pushes parent pages on the cursor stack.
1288*5d465952Smartinh  * If modify is true, visited pages are updated with new page numbers.
1289*5d465952Smartinh  */
1290*5d465952Smartinh static int
1291*5d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
1292*5d465952Smartinh     struct cursor *cursor, int modify, struct mpage **mpp)
1293*5d465952Smartinh {
1294*5d465952Smartinh 	int		 rc;
1295*5d465952Smartinh 	pgno_t		 root;
1296*5d465952Smartinh 	struct mpage	*mp;
1297*5d465952Smartinh 
1298*5d465952Smartinh 	/* Can't modify pages outside a transaction. */
1299*5d465952Smartinh 	if (txn == NULL && modify) {
1300*5d465952Smartinh 		errno = EINVAL;
1301*5d465952Smartinh 		return BT_FAIL;
1302*5d465952Smartinh 	}
1303*5d465952Smartinh 
1304*5d465952Smartinh 	/* Choose which root page to start with. If a transaction is given
1305*5d465952Smartinh          * use the root page from the transaction, otherwise read the last
1306*5d465952Smartinh          * committed root page.
1307*5d465952Smartinh 	 */
1308*5d465952Smartinh 	if (txn == NULL) {
1309*5d465952Smartinh 		if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)
1310*5d465952Smartinh 			return rc;
1311*5d465952Smartinh 		root = bt->meta->root;
1312*5d465952Smartinh 	} else if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
1313*5d465952Smartinh 		DPRINTF("transaction has failed, must abort");
1314*5d465952Smartinh 		return BT_FAIL;
1315*5d465952Smartinh 	} else
1316*5d465952Smartinh 		root = txn->root;
1317*5d465952Smartinh 
1318*5d465952Smartinh 	if (root == P_INVALID)		/* Tree is empty. */
1319*5d465952Smartinh 		return BT_NOTFOUND;
1320*5d465952Smartinh 
1321*5d465952Smartinh 	if ((rc = btree_get_mpage(bt, root, &mp)) != BT_SUCCESS)
1322*5d465952Smartinh 		return rc;
1323*5d465952Smartinh 
1324*5d465952Smartinh 	assert(mp->parent == NULL);
1325*5d465952Smartinh 	assert(mp->prefix.len == 0);
1326*5d465952Smartinh 
1327*5d465952Smartinh 	if (modify && !mp->dirty) {
1328*5d465952Smartinh 		mpage_touch(bt, mp);
1329*5d465952Smartinh 		txn->root = mp->pgno;
1330*5d465952Smartinh 	}
1331*5d465952Smartinh 
1332*5d465952Smartinh 	return btree_search_page_root(bt, mp, key, cursor, modify, mpp);
1333*5d465952Smartinh }
1334*5d465952Smartinh 
1335*5d465952Smartinh static int
1336*5d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
1337*5d465952Smartinh     struct btval *data)
1338*5d465952Smartinh {
1339*5d465952Smartinh 	size_t		 psz;
1340*5d465952Smartinh 	size_t		 max;
1341*5d465952Smartinh 	size_t		 sz = 0;
1342*5d465952Smartinh 	pgno_t		 pgno;
1343*5d465952Smartinh 	struct page	 p;
1344*5d465952Smartinh 
1345*5d465952Smartinh 	bzero(data, sizeof(*data));
1346*5d465952Smartinh 	max = PAGESIZE - PAGEHDRSZ;
1347*5d465952Smartinh 
1348*5d465952Smartinh 	if (!F_ISSET(leaf->flags, F_BIGDATA)) {
1349*5d465952Smartinh 		data->size = leaf->n_dsize;
1350*5d465952Smartinh 		if (data->size > 0) {
1351*5d465952Smartinh 			if (mp == NULL) {
1352*5d465952Smartinh 				if ((data->data = malloc(data->size)) == NULL)
1353*5d465952Smartinh 					return BT_FAIL;
1354*5d465952Smartinh 				bcopy(NODEDATA(leaf), data->data, data->size);
1355*5d465952Smartinh 				data->free_data = 1;
1356*5d465952Smartinh 				data->mp = NULL;
1357*5d465952Smartinh 			} else {
1358*5d465952Smartinh 				data->data = NODEDATA(leaf);
1359*5d465952Smartinh 				data->free_data = 0;
1360*5d465952Smartinh 				data->mp = mp;
1361*5d465952Smartinh 				mp->ref++;
1362*5d465952Smartinh 			}
1363*5d465952Smartinh 		}
1364*5d465952Smartinh 		return BT_SUCCESS;
1365*5d465952Smartinh 	}
1366*5d465952Smartinh 
1367*5d465952Smartinh 	/* Read overflow data.
1368*5d465952Smartinh 	 */
1369*5d465952Smartinh 	DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
1370*5d465952Smartinh 	if ((data->data = malloc(leaf->n_dsize)) == NULL)
1371*5d465952Smartinh 		return BT_FAIL;
1372*5d465952Smartinh 	data->size = leaf->n_dsize;
1373*5d465952Smartinh 	data->free_data = 1;
1374*5d465952Smartinh 	data->mp = NULL;
1375*5d465952Smartinh 	pgno = *(pgno_t *)NODEDATA(leaf);	/* XXX: alignment? */
1376*5d465952Smartinh 	for (sz = 0; sz < data->size; ) {
1377*5d465952Smartinh 		if (btree_read_page(bt, pgno, &p) != 0 ||
1378*5d465952Smartinh 		    !F_ISSET(p.flags, P_OVERFLOW)) {
1379*5d465952Smartinh 			DPRINTF("read overflow page failed (%02x)", p.flags);
1380*5d465952Smartinh 			free(data->data);
1381*5d465952Smartinh 			return BT_FAIL;
1382*5d465952Smartinh 		}
1383*5d465952Smartinh 		psz = data->size - sz;
1384*5d465952Smartinh 		if (psz > max)
1385*5d465952Smartinh 			psz = max;
1386*5d465952Smartinh 		bcopy(p.ptrs, (char *)data->data + sz, psz);
1387*5d465952Smartinh 		sz += psz;
1388*5d465952Smartinh 		pgno = p.p_next_pgno;
1389*5d465952Smartinh 	}
1390*5d465952Smartinh 
1391*5d465952Smartinh 	return BT_SUCCESS;
1392*5d465952Smartinh }
1393*5d465952Smartinh 
1394*5d465952Smartinh int
1395*5d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn,
1396*5d465952Smartinh     struct btval *key, struct btval *data)
1397*5d465952Smartinh {
1398*5d465952Smartinh 	int		 rc, exact;
1399*5d465952Smartinh 	struct node	*leaf;
1400*5d465952Smartinh 	struct mpage	*mp;
1401*5d465952Smartinh 
1402*5d465952Smartinh 	assert(bt);
1403*5d465952Smartinh 	assert(key);
1404*5d465952Smartinh 	assert(data);
1405*5d465952Smartinh 	DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);
1406*5d465952Smartinh 
1407*5d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
1408*5d465952Smartinh 		errno = EINVAL;
1409*5d465952Smartinh 		return BT_FAIL;
1410*5d465952Smartinh 	}
1411*5d465952Smartinh 
1412*5d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)
1413*5d465952Smartinh 		return rc;
1414*5d465952Smartinh 
1415*5d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, NULL);
1416*5d465952Smartinh 	if (leaf && exact)
1417*5d465952Smartinh 		rc = btree_read_data(bt, mp, leaf, data);
1418*5d465952Smartinh 	else
1419*5d465952Smartinh 		rc = BT_NOTFOUND;
1420*5d465952Smartinh 
1421*5d465952Smartinh 	mpage_prune(bt);
1422*5d465952Smartinh 	return rc;
1423*5d465952Smartinh }
1424*5d465952Smartinh 
1425*5d465952Smartinh static int
1426*5d465952Smartinh btree_sibling(struct cursor *cursor, int move_right)
1427*5d465952Smartinh {
1428*5d465952Smartinh 	int		 rc;
1429*5d465952Smartinh 	struct node	*indx;
1430*5d465952Smartinh 	struct ppage	*parent, *top;
1431*5d465952Smartinh 	struct mpage	*mp;
1432*5d465952Smartinh 
1433*5d465952Smartinh 	top = CURSOR_TOP(cursor);
1434*5d465952Smartinh 	if ((parent = SLIST_NEXT(top, entry)) == NULL)
1435*5d465952Smartinh 		return BT_NOTFOUND;		/* root has no siblings */
1436*5d465952Smartinh 
1437*5d465952Smartinh 	DPRINTF("parent page is page %u, index %u",
1438*5d465952Smartinh 	    parent->mpage->pgno, parent->ki);
1439*5d465952Smartinh 
1440*5d465952Smartinh 	cursor_pop_page(cursor);
1441*5d465952Smartinh 	if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
1442*5d465952Smartinh 		       : (parent->ki == 0)) {
1443*5d465952Smartinh 		DPRINTF("no more keys left, moving to %s sibling",
1444*5d465952Smartinh 		    move_right ? "right" : "left");
1445*5d465952Smartinh 		if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)
1446*5d465952Smartinh 			return rc;
1447*5d465952Smartinh 		parent = CURSOR_TOP(cursor);
1448*5d465952Smartinh 	} else {
1449*5d465952Smartinh 		if (move_right)
1450*5d465952Smartinh 			parent->ki++;
1451*5d465952Smartinh 		else
1452*5d465952Smartinh 			parent->ki--;
1453*5d465952Smartinh 		DPRINTF("just moving to %s index key %u",
1454*5d465952Smartinh 		    move_right ? "right" : "left", parent->ki);
1455*5d465952Smartinh 	}
1456*5d465952Smartinh 	assert(IS_BRANCH(parent->mpage));
1457*5d465952Smartinh 
1458*5d465952Smartinh 	indx = NODEPTR(parent->mpage, parent->ki);
1459*5d465952Smartinh 	if (btree_get_mpage(cursor->bt, indx->n_pgno, &mp) != BT_SUCCESS)
1460*5d465952Smartinh 		return BT_FAIL;
1461*5d465952Smartinh 	mp->parent = parent->mpage;
1462*5d465952Smartinh 	mp->parent_index = parent->ki;
1463*5d465952Smartinh 
1464*5d465952Smartinh 	cursor_push_page(cursor, mp);
1465*5d465952Smartinh 	find_common_prefix(cursor->bt, mp);
1466*5d465952Smartinh 
1467*5d465952Smartinh 	return BT_SUCCESS;
1468*5d465952Smartinh }
1469*5d465952Smartinh 
1470*5d465952Smartinh static int
1471*5d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,
1472*5d465952Smartinh     struct btval *key)
1473*5d465952Smartinh {
1474*5d465952Smartinh 	if (key == NULL)
1475*5d465952Smartinh 		return 0;
1476*5d465952Smartinh 
1477*5d465952Smartinh 	if (mp->prefix.len > 0) {
1478*5d465952Smartinh 		key->size = node->ksize + mp->prefix.len;
1479*5d465952Smartinh 		key->data = malloc(key->size);
1480*5d465952Smartinh 		if (key->data == NULL)
1481*5d465952Smartinh 			return -1;
1482*5d465952Smartinh 		concat_prefix(bt,
1483*5d465952Smartinh 		    mp->prefix.str, mp->prefix.len,
1484*5d465952Smartinh 		    NODEKEY(node), node->ksize,
1485*5d465952Smartinh 		    key->data, &key->size);
1486*5d465952Smartinh 		key->free_data = 1;
1487*5d465952Smartinh 	} else {
1488*5d465952Smartinh 		key->size = node->ksize;
1489*5d465952Smartinh 		key->data = NODEKEY(node);
1490*5d465952Smartinh 		key->free_data = 0;
1491*5d465952Smartinh 		key->mp = mp;
1492*5d465952Smartinh 		mp->ref++;
1493*5d465952Smartinh 	}
1494*5d465952Smartinh 
1495*5d465952Smartinh 	return 0;
1496*5d465952Smartinh }
1497*5d465952Smartinh 
1498*5d465952Smartinh static int
1499*5d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)
1500*5d465952Smartinh {
1501*5d465952Smartinh 	struct ppage	*top;
1502*5d465952Smartinh 	struct mpage	*mp;
1503*5d465952Smartinh 	struct node	*leaf;
1504*5d465952Smartinh 
1505*5d465952Smartinh 	if (cursor->eof)
1506*5d465952Smartinh 		return BT_NOTFOUND;
1507*5d465952Smartinh 
1508*5d465952Smartinh 	assert(cursor->initialized);
1509*5d465952Smartinh 
1510*5d465952Smartinh 	top = CURSOR_TOP(cursor);
1511*5d465952Smartinh 	mp = top->mpage;
1512*5d465952Smartinh 
1513*5d465952Smartinh 	DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor);
1514*5d465952Smartinh 
1515*5d465952Smartinh 	if (top->ki + 1 >= NUMKEYS(mp)) {
1516*5d465952Smartinh 		DPRINTF("=====> move to next sibling page");
1517*5d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS) {
1518*5d465952Smartinh 			cursor->eof = 1;
1519*5d465952Smartinh 			return BT_NOTFOUND;
1520*5d465952Smartinh 		}
1521*5d465952Smartinh 		top = CURSOR_TOP(cursor);
1522*5d465952Smartinh 		mp = top->mpage;
1523*5d465952Smartinh 		DPRINTF("next page is %u, key index %u", mp->pgno, top->ki);
1524*5d465952Smartinh 	} else
1525*5d465952Smartinh 		top->ki++;
1526*5d465952Smartinh 
1527*5d465952Smartinh 	DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
1528*5d465952Smartinh 	    mp->pgno, NUMKEYS(mp), top->ki);
1529*5d465952Smartinh 
1530*5d465952Smartinh 	assert(IS_LEAF(mp));
1531*5d465952Smartinh 	leaf = NODEPTR(mp, top->ki);
1532*5d465952Smartinh 
1533*5d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
1534*5d465952Smartinh 		return BT_FAIL;
1535*5d465952Smartinh 
1536*5d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
1537*5d465952Smartinh 		return BT_FAIL;
1538*5d465952Smartinh 
1539*5d465952Smartinh 	return BT_SUCCESS;
1540*5d465952Smartinh }
1541*5d465952Smartinh 
1542*5d465952Smartinh static int
1543*5d465952Smartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data)
1544*5d465952Smartinh {
1545*5d465952Smartinh 	int		 rc;
1546*5d465952Smartinh 	struct node	*leaf;
1547*5d465952Smartinh 	struct mpage	*mp;
1548*5d465952Smartinh 	struct ppage	*top;
1549*5d465952Smartinh 
1550*5d465952Smartinh 	assert(cursor);
1551*5d465952Smartinh 	assert(key);
1552*5d465952Smartinh 	assert(key->size > 0);
1553*5d465952Smartinh 
1554*5d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);
1555*5d465952Smartinh 	if (rc != BT_SUCCESS)
1556*5d465952Smartinh 		return rc;
1557*5d465952Smartinh 	assert(IS_LEAF(mp));
1558*5d465952Smartinh 
1559*5d465952Smartinh 	top = CURSOR_TOP(cursor);
1560*5d465952Smartinh 	leaf = btree_search_node(cursor->bt, mp, key, NULL, &top->ki);
1561*5d465952Smartinh 	if (leaf == NULL) {
1562*5d465952Smartinh 		DPRINTF("===> inexact leaf not found, goto sibling");
1563*5d465952Smartinh 		if (btree_sibling(cursor, 1) != BT_SUCCESS)
1564*5d465952Smartinh 			return BT_NOTFOUND; /* no entries matched */
1565*5d465952Smartinh 		top = CURSOR_TOP(cursor);
1566*5d465952Smartinh 		top->ki = 0;
1567*5d465952Smartinh 		mp = top->mpage;
1568*5d465952Smartinh 		assert(IS_LEAF(mp));
1569*5d465952Smartinh 		leaf = NODEPTR(mp, 0);
1570*5d465952Smartinh 	}
1571*5d465952Smartinh 
1572*5d465952Smartinh 	cursor->initialized = 1;
1573*5d465952Smartinh 	cursor->eof = 0;
1574*5d465952Smartinh 
1575*5d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
1576*5d465952Smartinh 		return BT_FAIL;
1577*5d465952Smartinh 
1578*5d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
1579*5d465952Smartinh 		return BT_FAIL;
1580*5d465952Smartinh 	DPRINTF("==> cursor placed on key %.*s",
1581*5d465952Smartinh 	    (int)key->size, (char *)key->data);
1582*5d465952Smartinh 
1583*5d465952Smartinh 	return BT_SUCCESS;
1584*5d465952Smartinh }
1585*5d465952Smartinh 
1586*5d465952Smartinh static int
1587*5d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
1588*5d465952Smartinh {
1589*5d465952Smartinh 	int		 rc;
1590*5d465952Smartinh 	struct mpage	*mp;
1591*5d465952Smartinh 	struct node	*leaf;
1592*5d465952Smartinh 
1593*5d465952Smartinh 	rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
1594*5d465952Smartinh 	if (rc != BT_SUCCESS)
1595*5d465952Smartinh 		return rc;
1596*5d465952Smartinh 	assert(IS_LEAF(mp));
1597*5d465952Smartinh 
1598*5d465952Smartinh 	leaf = NODEPTR(mp, 0);
1599*5d465952Smartinh 	cursor->initialized = 1;
1600*5d465952Smartinh 	cursor->eof = 0;
1601*5d465952Smartinh 
1602*5d465952Smartinh 	if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
1603*5d465952Smartinh 		return BT_FAIL;
1604*5d465952Smartinh 
1605*5d465952Smartinh 	if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
1606*5d465952Smartinh 		return BT_FAIL;
1607*5d465952Smartinh 
1608*5d465952Smartinh 	return BT_SUCCESS;
1609*5d465952Smartinh }
1610*5d465952Smartinh 
1611*5d465952Smartinh int
1612*5d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
1613*5d465952Smartinh     enum cursor_op op)
1614*5d465952Smartinh {
1615*5d465952Smartinh 	int		 rc;
1616*5d465952Smartinh 
1617*5d465952Smartinh 	assert(cursor);
1618*5d465952Smartinh 
1619*5d465952Smartinh 	switch (op) {
1620*5d465952Smartinh 	case BT_CURSOR:
1621*5d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
1622*5d465952Smartinh 			cursor_pop_page(cursor);
1623*5d465952Smartinh 		if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
1624*5d465952Smartinh 			errno = EINVAL;
1625*5d465952Smartinh 			rc = BT_FAIL;
1626*5d465952Smartinh 		} else
1627*5d465952Smartinh 			rc = btree_cursor_set(cursor, key, data);
1628*5d465952Smartinh 		break;
1629*5d465952Smartinh 	case BT_NEXT:
1630*5d465952Smartinh 		if (!cursor->initialized)
1631*5d465952Smartinh 			rc = btree_cursor_first(cursor, key, data);
1632*5d465952Smartinh 		else
1633*5d465952Smartinh 			rc = btree_cursor_next(cursor, key, data);
1634*5d465952Smartinh 		break;
1635*5d465952Smartinh 	case BT_FIRST:
1636*5d465952Smartinh 		while (CURSOR_TOP(cursor) != NULL)
1637*5d465952Smartinh 			cursor_pop_page(cursor);
1638*5d465952Smartinh 		rc = btree_cursor_first(cursor, key, data);
1639*5d465952Smartinh 		break;
1640*5d465952Smartinh 	default:
1641*5d465952Smartinh 		DPRINTF("unhandled/unimplemented cursor operation %u", op);
1642*5d465952Smartinh 		rc = BT_FAIL;
1643*5d465952Smartinh 		break;
1644*5d465952Smartinh 	}
1645*5d465952Smartinh 
1646*5d465952Smartinh 	mpage_prune(cursor->bt);
1647*5d465952Smartinh 
1648*5d465952Smartinh 	return rc;
1649*5d465952Smartinh }
1650*5d465952Smartinh 
1651*5d465952Smartinh static struct mpage *
1652*5d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags)
1653*5d465952Smartinh {
1654*5d465952Smartinh 	struct mpage	*mp;
1655*5d465952Smartinh 
1656*5d465952Smartinh 	assert(bt != NULL);
1657*5d465952Smartinh 	assert(bt->txn != NULL);
1658*5d465952Smartinh 
1659*5d465952Smartinh 	DPRINTF("allocating new mpage %u", bt->txn->next_pgno);
1660*5d465952Smartinh 	if ((mp = calloc(1, sizeof(*mp))) == NULL)
1661*5d465952Smartinh 		return NULL;
1662*5d465952Smartinh 	mp->pgno = mp->page.pgno = bt->txn->next_pgno++;
1663*5d465952Smartinh 	mp->page.flags = flags;
1664*5d465952Smartinh 	mp->page.lower = PAGEHDRSZ;
1665*5d465952Smartinh 	mp->page.upper = PAGESIZE;
1666*5d465952Smartinh 
1667*5d465952Smartinh 	if (IS_BRANCH(mp))
1668*5d465952Smartinh 		bt->meta->branch_pages++;
1669*5d465952Smartinh 	else if (IS_LEAF(mp))
1670*5d465952Smartinh 		bt->meta->leaf_pages++;
1671*5d465952Smartinh 	else if (IS_OVERFLOW(mp))
1672*5d465952Smartinh 		bt->meta->overflow_pages++;
1673*5d465952Smartinh 
1674*5d465952Smartinh 	mpage_add(bt, mp);
1675*5d465952Smartinh 	mpage_dirty(bt, mp);
1676*5d465952Smartinh 
1677*5d465952Smartinh 	return mp;
1678*5d465952Smartinh }
1679*5d465952Smartinh 
1680*5d465952Smartinh static size_t
1681*5d465952Smartinh bt_leaf_size(struct btval *key, struct btval *data)
1682*5d465952Smartinh {
1683*5d465952Smartinh 	size_t		 sz;
1684*5d465952Smartinh 
1685*5d465952Smartinh 	sz = LEAFSIZE(key, data);
1686*5d465952Smartinh 	if (data->size >= PAGESIZE / BT_MINKEYS) {
1687*5d465952Smartinh 		/* put on overflow page */
1688*5d465952Smartinh 		sz -= data->size - sizeof(pgno_t);
1689*5d465952Smartinh 	}
1690*5d465952Smartinh 
1691*5d465952Smartinh 	return sz + sizeof(indx_t);
1692*5d465952Smartinh }
1693*5d465952Smartinh 
1694*5d465952Smartinh static size_t
1695*5d465952Smartinh bt_branch_size(struct btval *key)
1696*5d465952Smartinh {
1697*5d465952Smartinh 	size_t		 sz;
1698*5d465952Smartinh 
1699*5d465952Smartinh 	sz = INDXSIZE(key);
1700*5d465952Smartinh 	if (sz >= PAGESIZE / BT_MINKEYS) {
1701*5d465952Smartinh 		/* put on overflow page */
1702*5d465952Smartinh 		/* not implemented */
1703*5d465952Smartinh 		/* sz -= key->size - sizeof(pgno_t); */
1704*5d465952Smartinh 	}
1705*5d465952Smartinh 
1706*5d465952Smartinh 	return sz + sizeof(indx_t);
1707*5d465952Smartinh }
1708*5d465952Smartinh 
1709*5d465952Smartinh static int
1710*5d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
1711*5d465952Smartinh {
1712*5d465952Smartinh 	size_t		 done = 0;
1713*5d465952Smartinh 	size_t		 sz;
1714*5d465952Smartinh 	size_t		 max;
1715*5d465952Smartinh 	pgno_t		*linkp;			/* linked page stored here */
1716*5d465952Smartinh 	struct mpage	*next = NULL;
1717*5d465952Smartinh 
1718*5d465952Smartinh 	max = PAGESIZE - PAGEHDRSZ;
1719*5d465952Smartinh 
1720*5d465952Smartinh 	while (done < data->size) {
1721*5d465952Smartinh 		linkp = &p->p_next_pgno;
1722*5d465952Smartinh 		if (data->size - done > max) {
1723*5d465952Smartinh 			/* need another overflow page */
1724*5d465952Smartinh 			if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)
1725*5d465952Smartinh 				return BT_FAIL;
1726*5d465952Smartinh 			*linkp = next->pgno;
1727*5d465952Smartinh 			DPRINTF("linking overflow page %u", next->pgno);
1728*5d465952Smartinh 		} else
1729*5d465952Smartinh 			*linkp = 0;		/* indicates end of list */
1730*5d465952Smartinh 		sz = data->size - done;
1731*5d465952Smartinh 		if (sz > max)
1732*5d465952Smartinh 			sz = max;
1733*5d465952Smartinh 		DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno);
1734*5d465952Smartinh 		bcopy((char *)data->data + done, p->ptrs, sz);
1735*5d465952Smartinh 		done += sz;
1736*5d465952Smartinh 		p = &next->page;
1737*5d465952Smartinh 	}
1738*5d465952Smartinh 
1739*5d465952Smartinh 	return BT_SUCCESS;
1740*5d465952Smartinh }
1741*5d465952Smartinh 
1742*5d465952Smartinh /* Key prefix should already be stripped.
1743*5d465952Smartinh  */
1744*5d465952Smartinh static int
1745*5d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
1746*5d465952Smartinh     struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
1747*5d465952Smartinh {
1748*5d465952Smartinh 	unsigned int	 i;
1749*5d465952Smartinh 	size_t		 node_size = NODESIZE;
1750*5d465952Smartinh 	indx_t		 ofs;
1751*5d465952Smartinh 	struct node	*node;
1752*5d465952Smartinh 	struct page	*p;
1753*5d465952Smartinh 	struct mpage	*ofp = NULL;		/* overflow page */
1754*5d465952Smartinh 
1755*5d465952Smartinh 	p = &mp->page;
1756*5d465952Smartinh 	assert(p->upper >= p->lower);
1757*5d465952Smartinh 
1758*5d465952Smartinh 	DPRINTF("add node [%.*s] to %s page %u at index %i",
1759*5d465952Smartinh 	    key ? (int)key->size : 0, key ? (char *)key->data : NULL,
1760*5d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
1761*5d465952Smartinh 	    mp->pgno, indx);
1762*5d465952Smartinh 
1763*5d465952Smartinh 	if (key != NULL)
1764*5d465952Smartinh 		node_size += key->size;
1765*5d465952Smartinh 
1766*5d465952Smartinh 	if (IS_LEAF(mp)) {
1767*5d465952Smartinh 		assert(data);
1768*5d465952Smartinh 		node_size += data->size;
1769*5d465952Smartinh 		if (F_ISSET(flags, F_BIGDATA)) {
1770*5d465952Smartinh 			/* Data already on overflow page. */
1771*5d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
1772*5d465952Smartinh 		} else if (data->size >= PAGESIZE / BT_MINKEYS) {
1773*5d465952Smartinh 			/* Put data on overflow page. */
1774*5d465952Smartinh 			DPRINTF("data size is %zu, put on overflow page",
1775*5d465952Smartinh 			    data->size);
1776*5d465952Smartinh 			node_size -= data->size - sizeof(pgno_t);
1777*5d465952Smartinh 			if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
1778*5d465952Smartinh 				return BT_FAIL;
1779*5d465952Smartinh 			DPRINTF("allocated overflow page %u", ofp->pgno);
1780*5d465952Smartinh 			flags |= F_BIGDATA;
1781*5d465952Smartinh 		}
1782*5d465952Smartinh 	}
1783*5d465952Smartinh 
1784*5d465952Smartinh 	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
1785*5d465952Smartinh 		DPRINTF("not enough room in page %u, got %lu ptrs",
1786*5d465952Smartinh 		    mp->pgno, NUMKEYS(mp));
1787*5d465952Smartinh 		DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
1788*5d465952Smartinh 		    p->upper - p->lower);
1789*5d465952Smartinh 		DPRINTF("node size = %lu", node_size);
1790*5d465952Smartinh 		return BT_FAIL;
1791*5d465952Smartinh 	}
1792*5d465952Smartinh 
1793*5d465952Smartinh 	/* Move higher pointers up one slot. */
1794*5d465952Smartinh 	for (i = NUMKEYS(mp); i > indx; i--)
1795*5d465952Smartinh 		p->ptrs[i] = p->ptrs[i - 1];
1796*5d465952Smartinh 
1797*5d465952Smartinh 	/* Adjust free space offsets. */
1798*5d465952Smartinh 	ofs = p->upper - node_size;
1799*5d465952Smartinh 	assert(ofs >= p->lower + sizeof(indx_t));
1800*5d465952Smartinh 	p->ptrs[indx] = ofs;
1801*5d465952Smartinh 	p->upper = ofs;
1802*5d465952Smartinh 	p->lower += sizeof(indx_t);
1803*5d465952Smartinh 
1804*5d465952Smartinh 	/* Write the node data. */
1805*5d465952Smartinh 	node = NODEPTR(mp, indx);
1806*5d465952Smartinh 	node->ksize = (key == NULL) ? 0 : key->size;
1807*5d465952Smartinh 	node->flags = flags;
1808*5d465952Smartinh 	if (IS_LEAF(mp))
1809*5d465952Smartinh 		node->n_dsize = data->size;
1810*5d465952Smartinh 	else
1811*5d465952Smartinh 		node->n_pgno = pgno;
1812*5d465952Smartinh 
1813*5d465952Smartinh 	if (key)
1814*5d465952Smartinh 		bcopy(key->data, NODEKEY(node), key->size);
1815*5d465952Smartinh 
1816*5d465952Smartinh 	if (IS_LEAF(mp)) {
1817*5d465952Smartinh 		if (ofp == NULL) {
1818*5d465952Smartinh 			if (F_ISSET(flags, F_BIGDATA))
1819*5d465952Smartinh 				bcopy(data->data, node->data + key->size,
1820*5d465952Smartinh 				    sizeof(pgno_t));
1821*5d465952Smartinh 			else
1822*5d465952Smartinh 				bcopy(data->data, node->data + key->size,
1823*5d465952Smartinh 				    data->size);
1824*5d465952Smartinh 		} else {
1825*5d465952Smartinh 			bcopy(&ofp->pgno, node->data + key->size,
1826*5d465952Smartinh 			    sizeof(pgno_t));
1827*5d465952Smartinh 			if (btree_write_overflow_data(bt, &ofp->page,
1828*5d465952Smartinh 			    data) == BT_FAIL)
1829*5d465952Smartinh 				return BT_FAIL;
1830*5d465952Smartinh 		}
1831*5d465952Smartinh 	}
1832*5d465952Smartinh 
1833*5d465952Smartinh 	return BT_SUCCESS;
1834*5d465952Smartinh }
1835*5d465952Smartinh 
1836*5d465952Smartinh static void
1837*5d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx)
1838*5d465952Smartinh {
1839*5d465952Smartinh 	unsigned int	 sz;
1840*5d465952Smartinh 	indx_t		 i, j, numkeys, ptr;
1841*5d465952Smartinh 	struct node	*node;
1842*5d465952Smartinh 	char		*base;
1843*5d465952Smartinh 
1844*5d465952Smartinh 	DPRINTF("delete node %u on %s page %u", indx,
1845*5d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);
1846*5d465952Smartinh 	assert(indx < NUMKEYS(mp));
1847*5d465952Smartinh 
1848*5d465952Smartinh 	node = NODEPTR(mp, indx);
1849*5d465952Smartinh 	sz = NODESIZE + node->ksize;
1850*5d465952Smartinh 	if (IS_LEAF(mp)) {
1851*5d465952Smartinh 		if (F_ISSET(node->flags, F_BIGDATA))
1852*5d465952Smartinh 			sz += sizeof(pgno_t);
1853*5d465952Smartinh 		else
1854*5d465952Smartinh 			sz += NODEDSZ(node);
1855*5d465952Smartinh 	}
1856*5d465952Smartinh 
1857*5d465952Smartinh 	ptr = mp->page.ptrs[indx];
1858*5d465952Smartinh 	numkeys = NUMKEYS(mp);
1859*5d465952Smartinh 	for (i = j = 0; i < numkeys; i++) {
1860*5d465952Smartinh 		if (i != indx) {
1861*5d465952Smartinh 			mp->page.ptrs[j] = mp->page.ptrs[i];
1862*5d465952Smartinh 			if (mp->page.ptrs[i] < ptr)
1863*5d465952Smartinh 				mp->page.ptrs[j] += sz;
1864*5d465952Smartinh 			j++;
1865*5d465952Smartinh 		}
1866*5d465952Smartinh 	}
1867*5d465952Smartinh 
1868*5d465952Smartinh 	base = (char *)&mp->page + mp->page.upper;
1869*5d465952Smartinh 	bcopy(base, base + sz, ptr - mp->page.upper);
1870*5d465952Smartinh 
1871*5d465952Smartinh 	mp->page.lower -= sizeof(indx_t);
1872*5d465952Smartinh 	mp->page.upper += sz;
1873*5d465952Smartinh }
1874*5d465952Smartinh 
1875*5d465952Smartinh struct cursor *
1876*5d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
1877*5d465952Smartinh {
1878*5d465952Smartinh 	struct cursor	*cursor;
1879*5d465952Smartinh 
1880*5d465952Smartinh 	if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
1881*5d465952Smartinh 		SLIST_INIT(&cursor->stack);
1882*5d465952Smartinh 		cursor->bt = bt;
1883*5d465952Smartinh 		cursor->txn = txn;
1884*5d465952Smartinh 		bt->ref++;
1885*5d465952Smartinh 	}
1886*5d465952Smartinh 
1887*5d465952Smartinh 	return cursor;
1888*5d465952Smartinh }
1889*5d465952Smartinh 
1890*5d465952Smartinh void
1891*5d465952Smartinh btree_cursor_close(struct cursor *cursor)
1892*5d465952Smartinh {
1893*5d465952Smartinh 	if (cursor != NULL) {
1894*5d465952Smartinh 		while (!CURSOR_EMPTY(cursor))
1895*5d465952Smartinh 			cursor_pop_page(cursor);
1896*5d465952Smartinh 
1897*5d465952Smartinh 		btree_close(cursor->bt);
1898*5d465952Smartinh 		free(cursor);
1899*5d465952Smartinh 	}
1900*5d465952Smartinh }
1901*5d465952Smartinh 
1902*5d465952Smartinh static int
1903*5d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,
1904*5d465952Smartinh     struct btval *key)
1905*5d465952Smartinh {
1906*5d465952Smartinh 	indx_t			 ptr, i, numkeys;
1907*5d465952Smartinh 	int			 delta;
1908*5d465952Smartinh 	size_t			 len;
1909*5d465952Smartinh 	struct node		*node;
1910*5d465952Smartinh 	char			*base;
1911*5d465952Smartinh 
1912*5d465952Smartinh 	node = NODEPTR(mp, indx);
1913*5d465952Smartinh 	ptr = mp->page.ptrs[indx];
1914*5d465952Smartinh 	DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
1915*5d465952Smartinh 	    indx, ptr,
1916*5d465952Smartinh 	    (int)node->ksize, (char *)NODEKEY(node),
1917*5d465952Smartinh 	    (int)key->size, (char *)key->data,
1918*5d465952Smartinh 	    mp->pgno);
1919*5d465952Smartinh 
1920*5d465952Smartinh 	if (key->size != node->ksize) {
1921*5d465952Smartinh 		delta = key->size - node->ksize;
1922*5d465952Smartinh 		if (delta > 0 && SIZELEFT(mp) < delta) {
1923*5d465952Smartinh 			DPRINTF("OUCH! Not enough room, delta = %d", delta);
1924*5d465952Smartinh 			return BT_FAIL;
1925*5d465952Smartinh 		}
1926*5d465952Smartinh 
1927*5d465952Smartinh 		numkeys = NUMKEYS(mp);
1928*5d465952Smartinh 		for (i = 0; i < numkeys; i++) {
1929*5d465952Smartinh 			if (mp->page.ptrs[i] <= ptr)
1930*5d465952Smartinh 				mp->page.ptrs[i] -= delta;
1931*5d465952Smartinh 		}
1932*5d465952Smartinh 
1933*5d465952Smartinh 		base = (char *)&mp->page + mp->page.upper;
1934*5d465952Smartinh 		len = ptr - mp->page.upper + NODESIZE;
1935*5d465952Smartinh 		bcopy(base, base - delta, len);
1936*5d465952Smartinh 		mp->page.upper -= delta;
1937*5d465952Smartinh 
1938*5d465952Smartinh 		node = NODEPTR(mp, indx);
1939*5d465952Smartinh 		node->ksize = key->size;
1940*5d465952Smartinh 	}
1941*5d465952Smartinh 
1942*5d465952Smartinh 	bcopy(key->data, NODEKEY(node), key->size);
1943*5d465952Smartinh 
1944*5d465952Smartinh 	return BT_SUCCESS;
1945*5d465952Smartinh }
1946*5d465952Smartinh 
1947*5d465952Smartinh static int
1948*5d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)
1949*5d465952Smartinh {
1950*5d465952Smartinh 	indx_t		 i;
1951*5d465952Smartinh 	struct node	*node;
1952*5d465952Smartinh 	struct btkey	 tmpkey;
1953*5d465952Smartinh 	struct btval	 key;
1954*5d465952Smartinh 
1955*5d465952Smartinh 	DPRINTF("adjusting prefix lengths on page %u with delta %d",
1956*5d465952Smartinh 	    src->pgno, delta);
1957*5d465952Smartinh 	assert(delta != 0);
1958*5d465952Smartinh 
1959*5d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
1960*5d465952Smartinh 		node = NODEPTR(src, i);
1961*5d465952Smartinh 		tmpkey.len = node->ksize - delta;
1962*5d465952Smartinh 		if (delta > 0) {
1963*5d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY))
1964*5d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);
1965*5d465952Smartinh 			else
1966*5d465952Smartinh 				bcopy((char *)NODEKEY(node) + delta, tmpkey.str,
1967*5d465952Smartinh 				    tmpkey.len);
1968*5d465952Smartinh 		} else {
1969*5d465952Smartinh 			if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
1970*5d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str, node->ksize);
1971*5d465952Smartinh 				bcopy(src->prefix.str, tmpkey.str + node->ksize,
1972*5d465952Smartinh 				    -delta);
1973*5d465952Smartinh 			} else {
1974*5d465952Smartinh 				bcopy(src->prefix.str + src->prefix.len + delta,
1975*5d465952Smartinh 				    tmpkey.str, -delta);
1976*5d465952Smartinh 				bcopy(NODEKEY(node), tmpkey.str - delta,
1977*5d465952Smartinh 				    node->ksize);
1978*5d465952Smartinh 			}
1979*5d465952Smartinh 		}
1980*5d465952Smartinh 		key.size = tmpkey.len;
1981*5d465952Smartinh 		key.data = tmpkey.str;
1982*5d465952Smartinh 		if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)
1983*5d465952Smartinh 			return BT_FAIL;
1984*5d465952Smartinh 	}
1985*5d465952Smartinh 
1986*5d465952Smartinh 	return BT_SUCCESS;
1987*5d465952Smartinh }
1988*5d465952Smartinh 
1989*5d465952Smartinh /* Move a node from src to dst.
1990*5d465952Smartinh  */
1991*5d465952Smartinh static int
1992*5d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,
1993*5d465952Smartinh     struct mpage *dst, indx_t dstindx)
1994*5d465952Smartinh {
1995*5d465952Smartinh 	int			 rc;
1996*5d465952Smartinh 	unsigned int		 pfxlen, mp_pfxlen = 0;
1997*5d465952Smartinh 	struct node		*node, *srcnode;
1998*5d465952Smartinh 	struct mpage		*mp;
1999*5d465952Smartinh 	struct btkey		 tmpkey, srckey;
2000*5d465952Smartinh 	struct btval		 key, data;
2001*5d465952Smartinh 
2002*5d465952Smartinh 	assert(src->parent);
2003*5d465952Smartinh 	assert(dst->parent);
2004*5d465952Smartinh 
2005*5d465952Smartinh 	srcnode = NODEPTR(src, srcindx);
2006*5d465952Smartinh 	DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u",
2007*5d465952Smartinh 	    IS_LEAF(src) ? "leaf" : "branch",
2008*5d465952Smartinh 	    srcindx,
2009*5d465952Smartinh 	    (int)srcnode->ksize, (char *)NODEKEY(srcnode),
2010*5d465952Smartinh 	    src->pgno,
2011*5d465952Smartinh 	    dstindx, dst->pgno);
2012*5d465952Smartinh 
2013*5d465952Smartinh 	if (IS_BRANCH(src)) {
2014*5d465952Smartinh 		/* Need to check if the page the moved node points to
2015*5d465952Smartinh 		 * changes prefix.
2016*5d465952Smartinh 		 */
2017*5d465952Smartinh 		btree_get_mpage(bt, NODEPGNO(srcnode), &mp);
2018*5d465952Smartinh 		mp->parent = src;
2019*5d465952Smartinh 		mp->parent_index = srcindx;
2020*5d465952Smartinh 		find_common_prefix(bt, mp);
2021*5d465952Smartinh 		mp_pfxlen = mp->prefix.len;
2022*5d465952Smartinh 	}
2023*5d465952Smartinh 
2024*5d465952Smartinh 	/* Mark src and dst as dirty. */
2025*5d465952Smartinh 	mpage_touch(bt, src);
2026*5d465952Smartinh 	mpage_touch(bt, dst);
2027*5d465952Smartinh 
2028*5d465952Smartinh 	find_common_prefix(bt, dst);
2029*5d465952Smartinh 
2030*5d465952Smartinh 	/* Check if src node has destination page prefix. Otherwise the
2031*5d465952Smartinh 	 * destination page must expand its prefix on all its nodes.
2032*5d465952Smartinh 	 */
2033*5d465952Smartinh 	srckey.len = srcnode->ksize;
2034*5d465952Smartinh 	bcopy(NODEKEY(srcnode), srckey.str, srckey.len);
2035*5d465952Smartinh 	common_prefix(bt, &srckey, &dst->prefix, &tmpkey);
2036*5d465952Smartinh 	if (tmpkey.len != dst->prefix.len) {
2037*5d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
2038*5d465952Smartinh 		    tmpkey.len - dst->prefix.len) != BT_SUCCESS)
2039*5d465952Smartinh 			return BT_FAIL;
2040*5d465952Smartinh 		bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));
2041*5d465952Smartinh 	}
2042*5d465952Smartinh 
2043*5d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
2044*5d465952Smartinh 		struct mpage	*low;
2045*5d465952Smartinh 
2046*5d465952Smartinh 		/* must find the lowest key below src
2047*5d465952Smartinh 		 */
2048*5d465952Smartinh 		assert(btree_search_page_root(bt, src, NULL, NULL, 0,
2049*5d465952Smartinh 		    &low) == BT_SUCCESS);
2050*5d465952Smartinh 		expand_prefix(bt, low, 0, &srckey);
2051*5d465952Smartinh 		DPRINTF("found lowest key [%.*s] on leaf page %u",
2052*5d465952Smartinh 		    (int)srckey.len, srckey.str, low->pgno);
2053*5d465952Smartinh 	} else {
2054*5d465952Smartinh 		srckey.len = srcnode->ksize;
2055*5d465952Smartinh 		bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);
2056*5d465952Smartinh 	}
2057*5d465952Smartinh 	find_common_prefix(bt, src);
2058*5d465952Smartinh 
2059*5d465952Smartinh 	/* expand the prefix */
2060*5d465952Smartinh 	tmpkey.len = sizeof(tmpkey.str);
2061*5d465952Smartinh 	concat_prefix(bt, src->prefix.str, src->prefix.len,
2062*5d465952Smartinh 	    srckey.str, srckey.len, tmpkey.str, &tmpkey.len);
2063*5d465952Smartinh 
2064*5d465952Smartinh 	/* Add the node to the destination page. Adjust prefix for
2065*5d465952Smartinh 	 * destination page.
2066*5d465952Smartinh 	 */
2067*5d465952Smartinh 	key.size = tmpkey.len;
2068*5d465952Smartinh 	key.data = tmpkey.str;
2069*5d465952Smartinh 	remove_prefix(bt, &key, dst->prefix.len);
2070*5d465952Smartinh 	data.size = NODEDSZ(srcnode);
2071*5d465952Smartinh 	data.data = NODEDATA(srcnode);
2072*5d465952Smartinh 	rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),
2073*5d465952Smartinh 	    srcnode->flags);
2074*5d465952Smartinh 	if (rc != BT_SUCCESS)
2075*5d465952Smartinh 		return rc;
2076*5d465952Smartinh 
2077*5d465952Smartinh 	/* Delete the node from the source page.
2078*5d465952Smartinh 	 */
2079*5d465952Smartinh 	btree_del_node(bt, src, srcindx);
2080*5d465952Smartinh 
2081*5d465952Smartinh 	/* Update the parent separators.
2082*5d465952Smartinh 	 */
2083*5d465952Smartinh 	if (srcindx == 0 && src->parent_index != 0) {
2084*5d465952Smartinh 		node = NODEPTR(src->parent, src->parent_index);
2085*5d465952Smartinh 		DPRINTF("current parent separator for source page %u is [%.*s]",
2086*5d465952Smartinh 		    src->pgno,
2087*5d465952Smartinh 		    (int)node->ksize, (char *)NODEKEY(node));
2088*5d465952Smartinh 
2089*5d465952Smartinh 		expand_prefix(bt, src, 0, &tmpkey);
2090*5d465952Smartinh 		key.size = tmpkey.len;
2091*5d465952Smartinh 		key.data = tmpkey.str;
2092*5d465952Smartinh 		remove_prefix(bt, &key, src->parent->prefix.len);
2093*5d465952Smartinh 
2094*5d465952Smartinh 		DPRINTF("update separator for source page %u to [%.*s]",
2095*5d465952Smartinh 		    src->pgno, (int)key.size, (char *)key.data);
2096*5d465952Smartinh // ?		bt_reduce_separator(bt, node, &key);
2097*5d465952Smartinh 		if (btree_update_key(bt, src->parent, src->parent_index,
2098*5d465952Smartinh 		    &key) != BT_SUCCESS)
2099*5d465952Smartinh 			return BT_FAIL;
2100*5d465952Smartinh 	}
2101*5d465952Smartinh 
2102*5d465952Smartinh 	if (srcindx == 0 && IS_BRANCH(src)) {
2103*5d465952Smartinh 		struct btval	 nullkey;
2104*5d465952Smartinh 		nullkey.size = 0;
2105*5d465952Smartinh 		assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);
2106*5d465952Smartinh 	}
2107*5d465952Smartinh 
2108*5d465952Smartinh 	if (dstindx == 0 && dst->parent_index != 0) {
2109*5d465952Smartinh 		node = NODEPTR(dst->parent, dst->parent_index);
2110*5d465952Smartinh 		DPRINTF("current parent separator for destination page %u is [%.*s]",
2111*5d465952Smartinh 		    dst->pgno,
2112*5d465952Smartinh 		    (int)node->ksize, (char *)NODEKEY(node));
2113*5d465952Smartinh 
2114*5d465952Smartinh 		expand_prefix(bt, dst, 0, &tmpkey);
2115*5d465952Smartinh 		key.size = tmpkey.len;
2116*5d465952Smartinh 		key.data = tmpkey.str;
2117*5d465952Smartinh 		remove_prefix(bt, &key, dst->parent->prefix.len);
2118*5d465952Smartinh 
2119*5d465952Smartinh 		DPRINTF("update separator for destination page %u to [%.*s]",
2120*5d465952Smartinh 		    dst->pgno, (int)key.size, (char *)key.data);
2121*5d465952Smartinh // ?		bt_reduce_separator(bt, node, &key);
2122*5d465952Smartinh 		if (btree_update_key(bt, dst->parent, dst->parent_index,
2123*5d465952Smartinh 		    &key) != BT_SUCCESS)
2124*5d465952Smartinh 			return BT_FAIL;
2125*5d465952Smartinh 	}
2126*5d465952Smartinh 
2127*5d465952Smartinh 	if (dstindx == 0 && IS_BRANCH(dst)) {
2128*5d465952Smartinh 		struct btval	 nullkey;
2129*5d465952Smartinh 		nullkey.size = 0;
2130*5d465952Smartinh 		assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);
2131*5d465952Smartinh 	}
2132*5d465952Smartinh 
2133*5d465952Smartinh 	/* We can get a new page prefix here!
2134*5d465952Smartinh 	 * Must update keys in all nodes of this page!
2135*5d465952Smartinh 	 */
2136*5d465952Smartinh 	pfxlen = src->prefix.len;
2137*5d465952Smartinh 	find_common_prefix(bt, src);
2138*5d465952Smartinh 	if (src->prefix.len != pfxlen) {
2139*5d465952Smartinh 		if (btree_adjust_prefix(bt, src,
2140*5d465952Smartinh 		    src->prefix.len - pfxlen) != BT_SUCCESS)
2141*5d465952Smartinh 			return BT_FAIL;
2142*5d465952Smartinh 	}
2143*5d465952Smartinh 
2144*5d465952Smartinh 	pfxlen = dst->prefix.len;
2145*5d465952Smartinh 	find_common_prefix(bt, dst);
2146*5d465952Smartinh 	if (dst->prefix.len != pfxlen) {
2147*5d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
2148*5d465952Smartinh 		    dst->prefix.len - pfxlen) != BT_SUCCESS)
2149*5d465952Smartinh 			return BT_FAIL;
2150*5d465952Smartinh 	}
2151*5d465952Smartinh 
2152*5d465952Smartinh 	if (IS_BRANCH(dst)) {
2153*5d465952Smartinh 		mp->parent = dst;
2154*5d465952Smartinh 		mp->parent_index = dstindx;
2155*5d465952Smartinh 		find_common_prefix(bt, mp);
2156*5d465952Smartinh 		if (mp->prefix.len != mp_pfxlen) {
2157*5d465952Smartinh 			DPRINTF("moved branch node has changed prefix");
2158*5d465952Smartinh 			mpage_touch(bt, mp);
2159*5d465952Smartinh 			if (btree_adjust_prefix(bt, mp,
2160*5d465952Smartinh 			    mp->prefix.len - mp_pfxlen) != BT_SUCCESS)
2161*5d465952Smartinh 				return BT_FAIL;
2162*5d465952Smartinh 		}
2163*5d465952Smartinh 	}
2164*5d465952Smartinh 
2165*5d465952Smartinh 	return BT_SUCCESS;
2166*5d465952Smartinh }
2167*5d465952Smartinh 
2168*5d465952Smartinh static int
2169*5d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)
2170*5d465952Smartinh {
2171*5d465952Smartinh 	int			 rc;
2172*5d465952Smartinh 	indx_t			 i;
2173*5d465952Smartinh 	struct node		*srcnode;
2174*5d465952Smartinh 	struct btkey		 tmpkey, dstpfx;
2175*5d465952Smartinh 	struct btval		 key, data;
2176*5d465952Smartinh 
2177*5d465952Smartinh 	DPRINTF("merging page %u and %u", src->pgno, dst->pgno);
2178*5d465952Smartinh 
2179*5d465952Smartinh 	assert(src->parent);	/* can't merge root page */
2180*5d465952Smartinh 	assert(dst->parent);
2181*5d465952Smartinh 	assert(bt->txn != NULL);
2182*5d465952Smartinh 
2183*5d465952Smartinh 	/* Mark src and dst as dirty. */
2184*5d465952Smartinh 	mpage_touch(bt, src);
2185*5d465952Smartinh 	mpage_touch(bt, dst);
2186*5d465952Smartinh 
2187*5d465952Smartinh 	find_common_prefix(bt, src);
2188*5d465952Smartinh 	find_common_prefix(bt, dst);
2189*5d465952Smartinh 
2190*5d465952Smartinh 	/* Check if source nodes has destination page prefix. Otherwise
2191*5d465952Smartinh 	 * the destination page must expand its prefix on all its nodes.
2192*5d465952Smartinh 	 */
2193*5d465952Smartinh 	common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);
2194*5d465952Smartinh 	if (dstpfx.len != dst->prefix.len) {
2195*5d465952Smartinh 		if (btree_adjust_prefix(bt, dst,
2196*5d465952Smartinh 		    dstpfx.len - dst->prefix.len) != BT_SUCCESS)
2197*5d465952Smartinh 			return BT_FAIL;
2198*5d465952Smartinh 		bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));
2199*5d465952Smartinh 	}
2200*5d465952Smartinh 
2201*5d465952Smartinh 	/* Move all nodes from src to dst.
2202*5d465952Smartinh 	 */
2203*5d465952Smartinh 	for (i = 0; i < NUMKEYS(src); i++) {
2204*5d465952Smartinh 		srcnode = NODEPTR(src, i);
2205*5d465952Smartinh 
2206*5d465952Smartinh 		/* If branch node 0 (implicit key), find the real key.
2207*5d465952Smartinh 		 */
2208*5d465952Smartinh 		if (i == 0 && IS_BRANCH(src)) {
2209*5d465952Smartinh 			struct mpage	*low;
2210*5d465952Smartinh 
2211*5d465952Smartinh 			/* must find the lowest key below src
2212*5d465952Smartinh 			 */
2213*5d465952Smartinh 			assert(btree_search_page_root(bt, src, NULL, NULL, 0,
2214*5d465952Smartinh 			    &low) == BT_SUCCESS);
2215*5d465952Smartinh 			expand_prefix(bt, low, 0, &tmpkey);
2216*5d465952Smartinh 			DPRINTF("found lowest key [%.*s] on leaf page %u",
2217*5d465952Smartinh 			    (int)tmpkey.len, tmpkey.str, low->pgno);
2218*5d465952Smartinh 		} else {
2219*5d465952Smartinh 			expand_prefix(bt, src, i, &tmpkey);
2220*5d465952Smartinh 		}
2221*5d465952Smartinh 
2222*5d465952Smartinh 		key.size = tmpkey.len;
2223*5d465952Smartinh 		key.data = tmpkey.str;
2224*5d465952Smartinh 
2225*5d465952Smartinh 		remove_prefix(bt, &key, dst->prefix.len);
2226*5d465952Smartinh 		data.size = NODEDSZ(srcnode);
2227*5d465952Smartinh 		data.data = NODEDATA(srcnode);
2228*5d465952Smartinh 		rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,
2229*5d465952Smartinh 		    &data, NODEPGNO(srcnode), srcnode->flags);
2230*5d465952Smartinh 		if (rc != BT_SUCCESS)
2231*5d465952Smartinh 			return rc;
2232*5d465952Smartinh 	}
2233*5d465952Smartinh 
2234*5d465952Smartinh 	DPRINTF("dst page %u now has %lu keys (%.1f%% filled)",
2235*5d465952Smartinh 	    dst->pgno, NUMKEYS(dst), PAGEFILL(dst) * 100);
2236*5d465952Smartinh 
2237*5d465952Smartinh 	/* Unlink the src page from parent.
2238*5d465952Smartinh 	 */
2239*5d465952Smartinh 	btree_del_node(bt, src->parent, src->parent_index);
2240*5d465952Smartinh 	if (src->parent_index == 0) {
2241*5d465952Smartinh 		key.size = 0;
2242*5d465952Smartinh 		if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)
2243*5d465952Smartinh 			return BT_FAIL;
2244*5d465952Smartinh 
2245*5d465952Smartinh 		unsigned int pfxlen = src->prefix.len;
2246*5d465952Smartinh 		find_common_prefix(bt, src);
2247*5d465952Smartinh 		assert (src->prefix.len == pfxlen);
2248*5d465952Smartinh 	}
2249*5d465952Smartinh 
2250*5d465952Smartinh 	if (IS_LEAF(src))
2251*5d465952Smartinh 		bt->meta->leaf_pages--;
2252*5d465952Smartinh 	else
2253*5d465952Smartinh 		bt->meta->branch_pages--;
2254*5d465952Smartinh 
2255*5d465952Smartinh 	return btree_rebalance(bt, src->parent);
2256*5d465952Smartinh }
2257*5d465952Smartinh 
2258*5d465952Smartinh #define FILL_THRESHOLD	 0.25
2259*5d465952Smartinh 
2260*5d465952Smartinh static int
2261*5d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp)
2262*5d465952Smartinh {
2263*5d465952Smartinh 	indx_t		 si = 0, di = 0;
2264*5d465952Smartinh 	struct mpage	*parent;
2265*5d465952Smartinh 	struct mpage	*neighbor = NULL;
2266*5d465952Smartinh 
2267*5d465952Smartinh 	assert(bt != NULL);
2268*5d465952Smartinh 	assert(bt->txn != NULL);
2269*5d465952Smartinh 	assert(mp != NULL);
2270*5d465952Smartinh 
2271*5d465952Smartinh 	DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)",
2272*5d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch",
2273*5d465952Smartinh 	    mp->pgno, NUMKEYS(mp), PAGEFILL(mp) * 100);
2274*5d465952Smartinh 
2275*5d465952Smartinh 	if (PAGEFILL(mp) >= FILL_THRESHOLD) {
2276*5d465952Smartinh 		DPRINTF("no need to rebalance page %u, above fill threshold",
2277*5d465952Smartinh 		    mp->pgno);
2278*5d465952Smartinh 		return BT_SUCCESS;
2279*5d465952Smartinh 	}
2280*5d465952Smartinh 
2281*5d465952Smartinh 	parent = mp->parent;
2282*5d465952Smartinh 
2283*5d465952Smartinh 	if (parent == NULL) {
2284*5d465952Smartinh 		if (NUMKEYS(mp) == 0) {
2285*5d465952Smartinh 			DPRINTF("tree is completely empty");
2286*5d465952Smartinh 			bt->txn->root = P_INVALID;
2287*5d465952Smartinh 			bt->meta->depth--;
2288*5d465952Smartinh 			bt->meta->leaf_pages--;
2289*5d465952Smartinh 		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
2290*5d465952Smartinh 			DPRINTF("collapsing root page!");
2291*5d465952Smartinh 			bt->txn->root = NODEPGNO(NODEPTR(mp, 0));
2292*5d465952Smartinh 			bt->meta->depth--;
2293*5d465952Smartinh 			bt->meta->branch_pages--;
2294*5d465952Smartinh 		} else
2295*5d465952Smartinh 			DPRINTF("root page doesn't need rebalancing");
2296*5d465952Smartinh 		return BT_SUCCESS;
2297*5d465952Smartinh 	}
2298*5d465952Smartinh 
2299*5d465952Smartinh 	/* The parent (branch page) must have at least 2 pointers,
2300*5d465952Smartinh 	 * otherwise the tree is invalid.
2301*5d465952Smartinh 	 */
2302*5d465952Smartinh 	assert(NUMKEYS(parent) > 1);
2303*5d465952Smartinh 
2304*5d465952Smartinh 	/* Leaf page fill factor is below the threshold.
2305*5d465952Smartinh 	 * Try to move keys from left or right neighbor, or
2306*5d465952Smartinh 	 * merge with a neighbor page.
2307*5d465952Smartinh 	 */
2308*5d465952Smartinh 
2309*5d465952Smartinh 	/* Find neighbors.
2310*5d465952Smartinh 	 */
2311*5d465952Smartinh 	if (mp->parent_index == 0) {
2312*5d465952Smartinh 		/* We're the leftmost leaf in our parent.
2313*5d465952Smartinh 		 */
2314*5d465952Smartinh 		DPRINTF("reading right neighbor");
2315*5d465952Smartinh 		if (btree_get_mpage(bt,
2316*5d465952Smartinh 		    NODEPGNO(NODEPTR(parent, mp->parent_index + 1)),
2317*5d465952Smartinh 		    &neighbor) != BT_SUCCESS) {
2318*5d465952Smartinh 			return BT_FAIL;
2319*5d465952Smartinh 		}
2320*5d465952Smartinh 		neighbor->parent_index = mp->parent_index + 1;
2321*5d465952Smartinh 		si = 0;
2322*5d465952Smartinh 		di = NUMKEYS(mp);
2323*5d465952Smartinh 	} else {
2324*5d465952Smartinh 		/* There is at least one neighbor to the left.
2325*5d465952Smartinh 		 */
2326*5d465952Smartinh 		DPRINTF("reading left neighbor");
2327*5d465952Smartinh 		if (btree_get_mpage(bt,
2328*5d465952Smartinh 		    NODEPGNO(NODEPTR(parent, mp->parent_index - 1)),
2329*5d465952Smartinh 		    &neighbor) != BT_SUCCESS) {
2330*5d465952Smartinh 			return BT_FAIL;
2331*5d465952Smartinh 		}
2332*5d465952Smartinh 		neighbor->parent_index = mp->parent_index - 1;
2333*5d465952Smartinh 		si = NUMKEYS(neighbor) - 1;
2334*5d465952Smartinh 		di = 0;
2335*5d465952Smartinh 	}
2336*5d465952Smartinh 	neighbor->parent = parent;
2337*5d465952Smartinh 
2338*5d465952Smartinh 	DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
2339*5d465952Smartinh 	    neighbor->pgno, NUMKEYS(neighbor), PAGEFILL(neighbor) * 100);
2340*5d465952Smartinh 
2341*5d465952Smartinh 	/* If the neighbor page is above threshold and has at least two
2342*5d465952Smartinh 	 * keys, move one key from it.
2343*5d465952Smartinh 	 *
2344*5d465952Smartinh 	 * Otherwise we should try to merge them, but that might not be
2345*5d465952Smartinh 	 * possible, even if both are below threshold, as prefix expansion
2346*5d465952Smartinh 	 * might make keys larger. FIXME: detect this
2347*5d465952Smartinh 	 */
2348*5d465952Smartinh 	if (PAGEFILL(neighbor) >= FILL_THRESHOLD &&
2349*5d465952Smartinh 	    NUMKEYS(neighbor) >= NUMKEYS(mp) + 2)
2350*5d465952Smartinh 		return btree_move_node(bt, neighbor, si, mp, di);
2351*5d465952Smartinh 	else { /* FIXME: if (has_enough_room()) */
2352*5d465952Smartinh 		if (mp->parent_index == 0)
2353*5d465952Smartinh 			return btree_merge(bt, neighbor, mp);
2354*5d465952Smartinh 		else
2355*5d465952Smartinh 			return btree_merge(bt, mp, neighbor);
2356*5d465952Smartinh 	}
2357*5d465952Smartinh }
2358*5d465952Smartinh 
2359*5d465952Smartinh int
2360*5d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn,
2361*5d465952Smartinh     struct btval *key, struct btval *data)
2362*5d465952Smartinh {
2363*5d465952Smartinh 	int		 rc, exact, close_txn = 0;
2364*5d465952Smartinh 	unsigned int	 ki;
2365*5d465952Smartinh 	struct node	*leaf;
2366*5d465952Smartinh 	struct mpage	*mp;
2367*5d465952Smartinh 
2368*5d465952Smartinh 	DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);
2369*5d465952Smartinh 
2370*5d465952Smartinh 	assert(bt != NULL);
2371*5d465952Smartinh 	assert(key != NULL);
2372*5d465952Smartinh 
2373*5d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
2374*5d465952Smartinh 		errno = EINVAL;
2375*5d465952Smartinh 		return BT_FAIL;
2376*5d465952Smartinh 	}
2377*5d465952Smartinh 
2378*5d465952Smartinh 	if (txn == NULL) {
2379*5d465952Smartinh 		close_txn = 1;
2380*5d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
2381*5d465952Smartinh 			return BT_FAIL;
2382*5d465952Smartinh 	}
2383*5d465952Smartinh 
2384*5d465952Smartinh 	if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)
2385*5d465952Smartinh 		goto done;
2386*5d465952Smartinh 
2387*5d465952Smartinh 	leaf = btree_search_node(bt, mp, key, &exact, &ki);
2388*5d465952Smartinh 	if (leaf == NULL || !exact) {
2389*5d465952Smartinh 		rc = BT_NOTFOUND;
2390*5d465952Smartinh 		goto done;
2391*5d465952Smartinh 	}
2392*5d465952Smartinh 
2393*5d465952Smartinh 	if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
2394*5d465952Smartinh 		goto done;
2395*5d465952Smartinh 
2396*5d465952Smartinh 	btree_del_node(bt, mp, ki);
2397*5d465952Smartinh 	bt->meta->entries--;
2398*5d465952Smartinh 	rc = btree_rebalance(bt, mp);
2399*5d465952Smartinh 	if (rc != BT_SUCCESS)
2400*5d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
2401*5d465952Smartinh 
2402*5d465952Smartinh done:
2403*5d465952Smartinh 	if (close_txn) {
2404*5d465952Smartinh 		if (rc == BT_SUCCESS)
2405*5d465952Smartinh 			rc = btree_txn_commit(txn);
2406*5d465952Smartinh 		else
2407*5d465952Smartinh 			btree_txn_abort(txn);
2408*5d465952Smartinh 	}
2409*5d465952Smartinh 	mpage_prune(bt);
2410*5d465952Smartinh 	return rc;
2411*5d465952Smartinh }
2412*5d465952Smartinh 
2413*5d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that
2414*5d465952Smartinh  * still makes it uniquely distinguishable from <min>.
2415*5d465952Smartinh  *
2416*5d465952Smartinh  * <min> is guaranteed to be sorted less than <sep>
2417*5d465952Smartinh  *
2418*5d465952Smartinh  * On return, <sep> is modified to the minimum length.
2419*5d465952Smartinh  */
2420*5d465952Smartinh static void
2421*5d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
2422*5d465952Smartinh {
2423*5d465952Smartinh 	size_t		 n = 0;
2424*5d465952Smartinh 	char		*p1;
2425*5d465952Smartinh 	char		*p2;
2426*5d465952Smartinh 
2427*5d465952Smartinh 	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
2428*5d465952Smartinh 
2429*5d465952Smartinh 		assert(sep->size > 0);
2430*5d465952Smartinh 
2431*5d465952Smartinh 		p1 = (char *)NODEKEY(min) + min->ksize - 1;
2432*5d465952Smartinh 		p2 = (char *)sep->data + sep->size - 1;
2433*5d465952Smartinh 
2434*5d465952Smartinh 		while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
2435*5d465952Smartinh 			assert(p2 > (char *)sep->data);
2436*5d465952Smartinh 			p1--;
2437*5d465952Smartinh 			p2--;
2438*5d465952Smartinh 			n++;
2439*5d465952Smartinh 		}
2440*5d465952Smartinh 
2441*5d465952Smartinh 		sep->data = p2;
2442*5d465952Smartinh 		sep->size = n + 1;
2443*5d465952Smartinh 	} else {
2444*5d465952Smartinh 
2445*5d465952Smartinh 		assert(min->ksize > 0);
2446*5d465952Smartinh 		assert(sep->size > 0);
2447*5d465952Smartinh 
2448*5d465952Smartinh 		p1 = (char *)NODEKEY(min);
2449*5d465952Smartinh 		p2 = (char *)sep->data;
2450*5d465952Smartinh 
2451*5d465952Smartinh 		while (*p1 == *p2) {
2452*5d465952Smartinh 			p1++;
2453*5d465952Smartinh 			p2++;
2454*5d465952Smartinh 			n++;
2455*5d465952Smartinh 			if (n == min->ksize || n == sep->size)
2456*5d465952Smartinh 				break;
2457*5d465952Smartinh 		}
2458*5d465952Smartinh 
2459*5d465952Smartinh 		sep->size = n + 1;
2460*5d465952Smartinh 	}
2461*5d465952Smartinh 
2462*5d465952Smartinh 	DPRINTF("reduced separator to [%.*s] > [%.*s]",
2463*5d465952Smartinh 	    (int)sep->size, (char *)sep->data,
2464*5d465952Smartinh 	    (int)min->ksize, (char *)NODEKEY(min));
2465*5d465952Smartinh }
2466*5d465952Smartinh 
2467*5d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
2468*5d465952Smartinh  * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
2469*5d465952Smartinh  * *newindxp with the actual values after split, ie if *mpp and *newindxp
2470*5d465952Smartinh  * refer to a node in the new right sibling page.
2471*5d465952Smartinh  */
2472*5d465952Smartinh static int
2473*5d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
2474*5d465952Smartinh     struct btval *newkey, struct btval *newdata, pgno_t newpgno)
2475*5d465952Smartinh {
2476*5d465952Smartinh 	uint8_t		 flags;
2477*5d465952Smartinh 	int		 rc = BT_SUCCESS, ins_new = 0;
2478*5d465952Smartinh 	indx_t		 newindx;
2479*5d465952Smartinh 	pgno_t		 pgno = 0;
2480*5d465952Smartinh 	size_t		 orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
2481*5d465952Smartinh 	unsigned int	 i, j, split_indx;
2482*5d465952Smartinh 	struct node	*node;
2483*5d465952Smartinh 	struct mpage	*pright, *p, *mp;
2484*5d465952Smartinh 	struct btval	 sepkey, rkey, rdata;
2485*5d465952Smartinh 	struct btkey	 tmpkey;
2486*5d465952Smartinh 	struct page	 copy;
2487*5d465952Smartinh 
2488*5d465952Smartinh 	assert(bt != NULL);
2489*5d465952Smartinh 	assert(bt->txn != NULL);
2490*5d465952Smartinh 
2491*5d465952Smartinh 	mp = *mpp;
2492*5d465952Smartinh 	newindx = *newindxp;
2493*5d465952Smartinh 
2494*5d465952Smartinh 	DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i",
2495*5d465952Smartinh 	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
2496*5d465952Smartinh 	    (int)newkey->size, (char *)newkey->data, *newindxp);
2497*5d465952Smartinh 	DPRINTF("page %u has prefix [%.*s]", mp->pgno,
2498*5d465952Smartinh 	    (int)mp->prefix.len, (char *)mp->prefix.str);
2499*5d465952Smartinh 	orig_pfx_len = mp->prefix.len;
2500*5d465952Smartinh 
2501*5d465952Smartinh 	if (mp->parent == NULL) {
2502*5d465952Smartinh 		if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
2503*5d465952Smartinh 			return BT_FAIL;
2504*5d465952Smartinh 		mp->parent_index = 0;
2505*5d465952Smartinh 		bt->txn->root = mp->parent->pgno;
2506*5d465952Smartinh 		DPRINTF("root split! new root = %u", mp->parent->pgno);
2507*5d465952Smartinh 		bt->meta->depth++;
2508*5d465952Smartinh 
2509*5d465952Smartinh 		/* Add left (implicit) pointer. */
2510*5d465952Smartinh 		if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
2511*5d465952Smartinh 		    mp->pgno, 0) != BT_SUCCESS)
2512*5d465952Smartinh 			return BT_FAIL;
2513*5d465952Smartinh 	} else {
2514*5d465952Smartinh 		DPRINTF("parent branch page is %u", mp->parent->pgno);
2515*5d465952Smartinh 	}
2516*5d465952Smartinh 
2517*5d465952Smartinh 	/* Create a right sibling. */
2518*5d465952Smartinh 	if ((pright = btree_new_page(bt, mp->page.flags)) == NULL)
2519*5d465952Smartinh 		return BT_FAIL;
2520*5d465952Smartinh 	pright->parent = mp->parent;
2521*5d465952Smartinh 	pright->parent_index = mp->parent_index + 1;
2522*5d465952Smartinh 	DPRINTF("new right sibling: page %u", pright->pgno);
2523*5d465952Smartinh 
2524*5d465952Smartinh 	/* Move half of the keys to the right sibling. */
2525*5d465952Smartinh 	bcopy(&mp->page, &copy, PAGESIZE);
2526*5d465952Smartinh 	assert(mp->ref == 0);				/* XXX */
2527*5d465952Smartinh 	bzero(&mp->page.ptrs, PAGESIZE - PAGEHDRSZ);
2528*5d465952Smartinh 	mp->page.lower = PAGEHDRSZ;
2529*5d465952Smartinh 	mp->page.upper = PAGESIZE;
2530*5d465952Smartinh 
2531*5d465952Smartinh 	split_indx = NUMKEYSP(&copy) / 2 + 1;
2532*5d465952Smartinh 
2533*5d465952Smartinh 	/* First find the separating key between the split pages.
2534*5d465952Smartinh 	 */
2535*5d465952Smartinh 	bzero(&sepkey, sizeof(sepkey));
2536*5d465952Smartinh 	if (newindx == split_indx) {
2537*5d465952Smartinh 		sepkey.size = newkey->size;
2538*5d465952Smartinh 		sepkey.data = newkey->data;
2539*5d465952Smartinh 		remove_prefix(bt, &sepkey, mp->prefix.len);
2540*5d465952Smartinh 	} else {
2541*5d465952Smartinh 		node = NODEPTRP(&copy, split_indx);
2542*5d465952Smartinh 		sepkey.size = node->ksize;
2543*5d465952Smartinh 		sepkey.data = NODEKEY(node);
2544*5d465952Smartinh 	}
2545*5d465952Smartinh 
2546*5d465952Smartinh 	if (IS_LEAF(mp) && bt->cmp == NULL) {
2547*5d465952Smartinh 		/* Find the smallest separator. */
2548*5d465952Smartinh 		/* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
2549*5d465952Smartinh 		node = NODEPTRP(&copy, split_indx - 1);
2550*5d465952Smartinh 		bt_reduce_separator(bt, node, &sepkey);
2551*5d465952Smartinh 	}
2552*5d465952Smartinh 
2553*5d465952Smartinh 	/* Fix separator wrt parent prefix. */
2554*5d465952Smartinh 	if (bt->cmp == NULL) {
2555*5d465952Smartinh 		tmpkey.len = sizeof(tmpkey.str);
2556*5d465952Smartinh 		concat_prefix(bt, mp->prefix.str, mp->prefix.len,
2557*5d465952Smartinh 		    sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);
2558*5d465952Smartinh 		sepkey.data = tmpkey.str;
2559*5d465952Smartinh 		sepkey.size = tmpkey.len;
2560*5d465952Smartinh 	}
2561*5d465952Smartinh 
2562*5d465952Smartinh 	DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);
2563*5d465952Smartinh 
2564*5d465952Smartinh 	/* Copy separator key to the parent.
2565*5d465952Smartinh 	 */
2566*5d465952Smartinh 	if (SIZELEFT(pright->parent) < bt_branch_size(&sepkey)) {
2567*5d465952Smartinh 		rc = btree_split(bt, &pright->parent, &pright->parent_index,
2568*5d465952Smartinh 		    &sepkey, NULL, pright->pgno);
2569*5d465952Smartinh 
2570*5d465952Smartinh 		/* Right page might now have changed parent.
2571*5d465952Smartinh 		 * Check if left page also changed parent.
2572*5d465952Smartinh 		 */
2573*5d465952Smartinh 		if (pright->parent != mp->parent &&
2574*5d465952Smartinh 		    mp->parent_index >= NUMKEYS(mp->parent)) {
2575*5d465952Smartinh 			mp->parent = pright->parent;
2576*5d465952Smartinh 			mp->parent_index = pright->parent_index - 1;
2577*5d465952Smartinh 		}
2578*5d465952Smartinh 	} else {
2579*5d465952Smartinh 		remove_prefix(bt, &sepkey, pright->parent->prefix.len);
2580*5d465952Smartinh 		rc = btree_add_node(bt, pright->parent, pright->parent_index,
2581*5d465952Smartinh 		    &sepkey, NULL, pright->pgno, 0);
2582*5d465952Smartinh 	}
2583*5d465952Smartinh 	if (rc != BT_SUCCESS)
2584*5d465952Smartinh 		return BT_FAIL;
2585*5d465952Smartinh 
2586*5d465952Smartinh 	/* Update prefix for right and left page, if the parent was split.
2587*5d465952Smartinh 	 */
2588*5d465952Smartinh 	find_common_prefix(bt, pright);
2589*5d465952Smartinh 	assert(orig_pfx_len <= pright->prefix.len);
2590*5d465952Smartinh 	right_pfx_diff = pright->prefix.len - orig_pfx_len;
2591*5d465952Smartinh 
2592*5d465952Smartinh 	find_common_prefix(bt, mp);
2593*5d465952Smartinh 	assert(orig_pfx_len <= mp->prefix.len);
2594*5d465952Smartinh 	left_pfx_diff = mp->prefix.len - orig_pfx_len;
2595*5d465952Smartinh 
2596*5d465952Smartinh 	for (i = j = 0; i <= NUMKEYSP(&copy); j++) {
2597*5d465952Smartinh 		if (i < split_indx) {
2598*5d465952Smartinh 			/* Re-insert in left sibling. */
2599*5d465952Smartinh 			p = mp;
2600*5d465952Smartinh 			pfx_diff = left_pfx_diff;
2601*5d465952Smartinh 		} else {
2602*5d465952Smartinh 			/* Insert in right sibling. */
2603*5d465952Smartinh 			if (i == split_indx)
2604*5d465952Smartinh 				/* Reset insert index for right sibling. */
2605*5d465952Smartinh 				j = (i == newindx && ins_new);
2606*5d465952Smartinh 			p = pright;
2607*5d465952Smartinh 			pfx_diff = right_pfx_diff;
2608*5d465952Smartinh 		}
2609*5d465952Smartinh 
2610*5d465952Smartinh 		if (i == newindx && !ins_new) {
2611*5d465952Smartinh 			/* Insert the original entry that caused the split. */
2612*5d465952Smartinh 			rkey.data = newkey->data;
2613*5d465952Smartinh 			rkey.size = newkey->size;
2614*5d465952Smartinh 			if (IS_LEAF(mp)) {
2615*5d465952Smartinh 				rdata.data = newdata->data;
2616*5d465952Smartinh 				rdata.size = newdata->size;
2617*5d465952Smartinh 			} else
2618*5d465952Smartinh 				pgno = newpgno;
2619*5d465952Smartinh 			flags = 0;
2620*5d465952Smartinh 			pfx_diff = p->prefix.len;
2621*5d465952Smartinh 
2622*5d465952Smartinh 			ins_new = 1;
2623*5d465952Smartinh 
2624*5d465952Smartinh 			/* Update page and index for the new key. */
2625*5d465952Smartinh 			*newindxp = j;
2626*5d465952Smartinh 			*mpp = p;
2627*5d465952Smartinh 		} else if (i == NUMKEYSP(&copy)) {
2628*5d465952Smartinh 			break;
2629*5d465952Smartinh 		} else {
2630*5d465952Smartinh 			node = NODEPTRP(&copy, i);
2631*5d465952Smartinh 			rkey.data = NODEKEY(node);
2632*5d465952Smartinh 			rkey.size = node->ksize;
2633*5d465952Smartinh 			if (IS_LEAF(mp)) {
2634*5d465952Smartinh 				rdata.data = NODEDATA(node);
2635*5d465952Smartinh 				rdata.size = node->n_dsize;
2636*5d465952Smartinh 			} else
2637*5d465952Smartinh 				pgno = node->n_pgno;
2638*5d465952Smartinh 			flags = node->flags;
2639*5d465952Smartinh 
2640*5d465952Smartinh 			i++;
2641*5d465952Smartinh 		}
2642*5d465952Smartinh 
2643*5d465952Smartinh 		if (!IS_LEAF(mp) && j == 0) {
2644*5d465952Smartinh 			/* First branch index doesn't need key data. */
2645*5d465952Smartinh 			rkey.size = 0;
2646*5d465952Smartinh 		} else
2647*5d465952Smartinh 			remove_prefix(bt, &rkey, pfx_diff);
2648*5d465952Smartinh 
2649*5d465952Smartinh 		rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
2650*5d465952Smartinh 		if (rc != BT_SUCCESS)
2651*5d465952Smartinh 			return BT_FAIL;
2652*5d465952Smartinh 	}
2653*5d465952Smartinh 
2654*5d465952Smartinh 	return rc;
2655*5d465952Smartinh }
2656*5d465952Smartinh 
2657*5d465952Smartinh int
2658*5d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn,
2659*5d465952Smartinh     struct btval *key, struct btval *data, unsigned int flags)
2660*5d465952Smartinh {
2661*5d465952Smartinh 	int		 rc = BT_SUCCESS, exact, close_txn = 0;
2662*5d465952Smartinh 	unsigned int	 ki;
2663*5d465952Smartinh 	struct node	*leaf;
2664*5d465952Smartinh 	struct mpage	*mp;
2665*5d465952Smartinh 	struct btval	 xkey;
2666*5d465952Smartinh 
2667*5d465952Smartinh 	assert(bt != NULL);
2668*5d465952Smartinh 	assert(key != NULL);
2669*5d465952Smartinh 	assert(data != NULL);
2670*5d465952Smartinh 
2671*5d465952Smartinh 	if (key->size == 0 || key->size > MAXKEYSIZE) {
2672*5d465952Smartinh 		errno = EINVAL;
2673*5d465952Smartinh 		return BT_FAIL;
2674*5d465952Smartinh 	}
2675*5d465952Smartinh 
2676*5d465952Smartinh 	DPRINTF("==> put key %.*s, size %zu, data size %zu",
2677*5d465952Smartinh 		(int)key->size, (char *)key->data, key->size, data->size);
2678*5d465952Smartinh 
2679*5d465952Smartinh 	if (txn == NULL) {
2680*5d465952Smartinh 		close_txn = 1;
2681*5d465952Smartinh 		if ((txn = btree_txn_begin(bt, 0)) == NULL)
2682*5d465952Smartinh 			return BT_FAIL;
2683*5d465952Smartinh 	}
2684*5d465952Smartinh 
2685*5d465952Smartinh 	rc = btree_search_page(bt, txn, key, NULL, 1, &mp);
2686*5d465952Smartinh 	if (rc == BT_SUCCESS) {
2687*5d465952Smartinh 		leaf = btree_search_node(bt, mp, key, &exact, &ki);
2688*5d465952Smartinh 		if (leaf && exact) {
2689*5d465952Smartinh 			if (F_ISSET(flags, BT_NOOVERWRITE)) {
2690*5d465952Smartinh 				DPRINTF("duplicate key %.*s",
2691*5d465952Smartinh 				    (int)key->size, (char *)key->data);
2692*5d465952Smartinh 				rc = BT_EXISTS;
2693*5d465952Smartinh 				goto done;
2694*5d465952Smartinh 			}
2695*5d465952Smartinh 			btree_del_node(bt, mp, ki);
2696*5d465952Smartinh 		}
2697*5d465952Smartinh 		if (leaf == NULL) {		/* append if not found */
2698*5d465952Smartinh 			ki = NUMKEYS(mp);
2699*5d465952Smartinh 			DPRINTF("appending key at index %i", ki);
2700*5d465952Smartinh 		}
2701*5d465952Smartinh 	} else if (rc == BT_NOTFOUND) {
2702*5d465952Smartinh 		/* new file, just write a root leaf page */
2703*5d465952Smartinh 		DPRINTF("allocating new root leaf page");
2704*5d465952Smartinh 		if ((mp = btree_new_page(bt, P_LEAF)) == NULL) {
2705*5d465952Smartinh 			rc = BT_FAIL;
2706*5d465952Smartinh 			goto done;
2707*5d465952Smartinh 		}
2708*5d465952Smartinh 		txn->root = mp->pgno;
2709*5d465952Smartinh 		bt->meta->depth++;
2710*5d465952Smartinh 		ki = 0;
2711*5d465952Smartinh 	}
2712*5d465952Smartinh 	else
2713*5d465952Smartinh 		goto done;
2714*5d465952Smartinh 
2715*5d465952Smartinh 	assert(IS_LEAF(mp));
2716*5d465952Smartinh 	DPRINTF("there are %lu keys, should insert new key at index %i",
2717*5d465952Smartinh 		NUMKEYS(mp), ki);
2718*5d465952Smartinh 
2719*5d465952Smartinh 	/* Copy the key pointer as it is modified by the prefix code. The
2720*5d465952Smartinh 	 * caller might have malloc'ed the data.
2721*5d465952Smartinh 	 */
2722*5d465952Smartinh 	xkey.data = key->data;
2723*5d465952Smartinh 	xkey.size = key->size;
2724*5d465952Smartinh 
2725*5d465952Smartinh 	if (SIZELEFT(mp) < bt_leaf_size(key, data)) {
2726*5d465952Smartinh 		rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);
2727*5d465952Smartinh 	} else {
2728*5d465952Smartinh 		/* There is room already in this leaf page. */
2729*5d465952Smartinh 		remove_prefix(bt, &xkey, mp->prefix.len);
2730*5d465952Smartinh 		rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);
2731*5d465952Smartinh 	}
2732*5d465952Smartinh 
2733*5d465952Smartinh 	if (rc != BT_SUCCESS)
2734*5d465952Smartinh 		txn->flags |= BT_TXN_ERROR;
2735*5d465952Smartinh 	else
2736*5d465952Smartinh 		bt->meta->entries++;
2737*5d465952Smartinh 
2738*5d465952Smartinh done:
2739*5d465952Smartinh 	if (close_txn) {
2740*5d465952Smartinh 		if (rc == BT_SUCCESS)
2741*5d465952Smartinh 			rc = btree_txn_commit(txn);
2742*5d465952Smartinh 		else
2743*5d465952Smartinh 			btree_txn_abort(txn);
2744*5d465952Smartinh 	}
2745*5d465952Smartinh 	mpage_prune(bt);
2746*5d465952Smartinh 	return rc;
2747*5d465952Smartinh }
2748*5d465952Smartinh 
2749*5d465952Smartinh static pgno_t
2750*5d465952Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, int fd)
2751*5d465952Smartinh {
2752*5d465952Smartinh 	indx_t		 i;
2753*5d465952Smartinh 	pgno_t		*pnext;
2754*5d465952Smartinh 	struct node	*node;
2755*5d465952Smartinh 	struct page	*p;
2756*5d465952Smartinh 	struct mpage	*mp;
2757*5d465952Smartinh 	char		 page[PAGESIZE];
2758*5d465952Smartinh 
2759*5d465952Smartinh 	p = (struct page *)page;
2760*5d465952Smartinh 	if ((mp = mpage_lookup(bt, pgno)) != NULL)
2761*5d465952Smartinh 		bcopy(&mp->page, p, PAGESIZE);
2762*5d465952Smartinh 	else if (btree_read_page(bt, pgno, p) != BT_SUCCESS)
2763*5d465952Smartinh 		return P_INVALID;
2764*5d465952Smartinh 
2765*5d465952Smartinh 	if (F_ISSET(p->flags, P_BRANCH)) {
2766*5d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
2767*5d465952Smartinh 			node = NODEPTRP(p, i);
2768*5d465952Smartinh 			node->n_pgno = btree_compact_tree(bt, node->n_pgno, fd);
2769*5d465952Smartinh 			if (node->n_pgno == P_INVALID)
2770*5d465952Smartinh 				return P_INVALID;
2771*5d465952Smartinh 		}
2772*5d465952Smartinh 	} else if (F_ISSET(p->flags, P_LEAF)) {
2773*5d465952Smartinh 		for (i = 0; i < NUMKEYSP(p); i++) {
2774*5d465952Smartinh 			node = NODEPTRP(p, i);
2775*5d465952Smartinh 			if (F_ISSET(node->flags, F_BIGDATA)) {
2776*5d465952Smartinh 				pnext = NODEDATA(node);
2777*5d465952Smartinh 				*pnext = btree_compact_tree(bt, *pnext, fd);
2778*5d465952Smartinh 				if (*pnext == P_INVALID)
2779*5d465952Smartinh 					return P_INVALID;
2780*5d465952Smartinh 			}
2781*5d465952Smartinh 		}
2782*5d465952Smartinh 	} else if (F_ISSET(p->flags, P_OVERFLOW)) {
2783*5d465952Smartinh 		pnext = &p->p_next_pgno;
2784*5d465952Smartinh 		if (*pnext > 0) {
2785*5d465952Smartinh 			*pnext = btree_compact_tree(bt, *pnext, fd);
2786*5d465952Smartinh 			if (*pnext == P_INVALID)
2787*5d465952Smartinh 				return P_INVALID;
2788*5d465952Smartinh 		}
2789*5d465952Smartinh 	} else
2790*5d465952Smartinh 		assert(0);
2791*5d465952Smartinh 
2792*5d465952Smartinh 	p->pgno = bt->txn->next_pgno++;
2793*5d465952Smartinh 	if (write(fd, page, PAGESIZE) != PAGESIZE)
2794*5d465952Smartinh 		return P_INVALID;
2795*5d465952Smartinh 	return p->pgno;
2796*5d465952Smartinh }
2797*5d465952Smartinh 
2798*5d465952Smartinh int
2799*5d465952Smartinh btree_compact(struct btree *bt)
2800*5d465952Smartinh {
2801*5d465952Smartinh 	int			 rc, fd, old_fd;
2802*5d465952Smartinh 	char			*compact_path = NULL;
2803*5d465952Smartinh 	struct btree_txn	*txn;
2804*5d465952Smartinh 	pgno_t			 root;
2805*5d465952Smartinh 
2806*5d465952Smartinh 	assert(bt != NULL);
2807*5d465952Smartinh 
2808*5d465952Smartinh 	if (bt->path == NULL)
2809*5d465952Smartinh 		return BT_FAIL;
2810*5d465952Smartinh 
2811*5d465952Smartinh 	if ((rc = btree_read_meta(bt, NULL)) == BT_FAIL)
2812*5d465952Smartinh 		return BT_FAIL;
2813*5d465952Smartinh 	else if (rc == BT_NOTFOUND)
2814*5d465952Smartinh 		return BT_SUCCESS;
2815*5d465952Smartinh 
2816*5d465952Smartinh 	asprintf(&compact_path, "%s.compact.XXXXXX", bt->path);
2817*5d465952Smartinh 	fd = mkstemp(compact_path);
2818*5d465952Smartinh 	if (fd == -1) {
2819*5d465952Smartinh 		free(compact_path);
2820*5d465952Smartinh 		return BT_FAIL;
2821*5d465952Smartinh 	}
2822*5d465952Smartinh 
2823*5d465952Smartinh 	old_fd = bt->fd;
2824*5d465952Smartinh 
2825*5d465952Smartinh 	if ((txn = btree_txn_begin(bt, 0)) == NULL)
2826*5d465952Smartinh 		goto failed;
2827*5d465952Smartinh 
2828*5d465952Smartinh 	bt->txn->next_pgno = 0;
2829*5d465952Smartinh 	root = btree_compact_tree(bt, bt->meta->root, fd);
2830*5d465952Smartinh 	if (root == P_INVALID)
2831*5d465952Smartinh 		goto failed;
2832*5d465952Smartinh 	bt->fd = fd;
2833*5d465952Smartinh 	bt->meta->revisions = 0;
2834*5d465952Smartinh 	if (btree_write_meta(bt, root) != BT_SUCCESS)
2835*5d465952Smartinh 		goto failed;
2836*5d465952Smartinh 
2837*5d465952Smartinh 	fsync(fd);
2838*5d465952Smartinh 
2839*5d465952Smartinh 	DPRINTF("renaming %s to %s", compact_path, bt->path);
2840*5d465952Smartinh 	if (rename(compact_path, bt->path) != 0)
2841*5d465952Smartinh 		goto failed;
2842*5d465952Smartinh 
2843*5d465952Smartinh 	// XXX: write a "reopen me" meta page for other processes to see
2844*5d465952Smartinh 	btree_txn_abort(txn);
2845*5d465952Smartinh 	close(old_fd);
2846*5d465952Smartinh 
2847*5d465952Smartinh 	free(compact_path);
2848*5d465952Smartinh 	return BT_SUCCESS;
2849*5d465952Smartinh 
2850*5d465952Smartinh failed:
2851*5d465952Smartinh 	bt->fd = old_fd;
2852*5d465952Smartinh 	btree_txn_abort(txn);
2853*5d465952Smartinh 	unlink(compact_path);
2854*5d465952Smartinh 	free(compact_path);
2855*5d465952Smartinh 	return BT_FAIL;
2856*5d465952Smartinh }
2857*5d465952Smartinh 
2858*5d465952Smartinh /* Reverts the last change. Truncates the file at the last root page.
2859*5d465952Smartinh  */
2860*5d465952Smartinh int
2861*5d465952Smartinh btree_revert(struct btree *bt)
2862*5d465952Smartinh {
2863*5d465952Smartinh 	if (btree_read_meta(bt, NULL) == BT_FAIL)
2864*5d465952Smartinh 		return BT_FAIL;
2865*5d465952Smartinh 
2866*5d465952Smartinh 	if (bt->meta == NULL)
2867*5d465952Smartinh 		return BT_SUCCESS;
2868*5d465952Smartinh 
2869*5d465952Smartinh 	DPRINTF("truncating file at page %u", bt->meta->root);
2870*5d465952Smartinh 	return ftruncate(bt->fd, PAGESIZE * bt->meta->root);
2871*5d465952Smartinh }
2872*5d465952Smartinh 
2873*5d465952Smartinh void
2874*5d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size)
2875*5d465952Smartinh {
2876*5d465952Smartinh 	bt->stat.max_cache = cache_size;
2877*5d465952Smartinh }
2878*5d465952Smartinh 
2879*5d465952Smartinh unsigned int
2880*5d465952Smartinh btree_get_flags(struct btree *bt)
2881*5d465952Smartinh {
2882*5d465952Smartinh 	return (bt->flags & ~BT_FIXPADDING);
2883*5d465952Smartinh }
2884*5d465952Smartinh 
2885*5d465952Smartinh const char *
2886*5d465952Smartinh btree_get_path(struct btree *bt)
2887*5d465952Smartinh {
2888*5d465952Smartinh 	return bt->path;
2889*5d465952Smartinh }
2890*5d465952Smartinh 
2891*5d465952Smartinh const struct btree_stat *
2892*5d465952Smartinh btree_stat(struct btree *bt)
2893*5d465952Smartinh {
2894*5d465952Smartinh 	bt->stat.branch_pages = bt->meta->branch_pages;
2895*5d465952Smartinh 	bt->stat.leaf_pages = bt->meta->leaf_pages;
2896*5d465952Smartinh 	bt->stat.overflow_pages = bt->meta->overflow_pages;
2897*5d465952Smartinh 	bt->stat.revisions = bt->meta->revisions;
2898*5d465952Smartinh 	bt->stat.depth = bt->meta->depth;
2899*5d465952Smartinh 	bt->stat.entries = bt->meta->entries;
2900*5d465952Smartinh 	bt->stat.psize = bt->meta->psize;
2901*5d465952Smartinh 	bt->stat.created_at = bt->meta->created_at;
2902*5d465952Smartinh 
2903*5d465952Smartinh 	return &bt->stat;
2904*5d465952Smartinh }
2905*5d465952Smartinh 
2906