1*3572a0e3Ssthen /* $OpenBSD: btree.c,v 1.38 2017/05/26 21:23:14 sthen Exp $ */
25d465952Smartinh
35d465952Smartinh /*
45d465952Smartinh * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
55d465952Smartinh *
65d465952Smartinh * Permission to use, copy, modify, and distribute this software for any
75d465952Smartinh * purpose with or without fee is hereby granted, provided that the above
85d465952Smartinh * copyright notice and this permission notice appear in all copies.
95d465952Smartinh *
105d465952Smartinh * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
115d465952Smartinh * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
125d465952Smartinh * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
135d465952Smartinh * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
145d465952Smartinh * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
155d465952Smartinh * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
165d465952Smartinh * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
175d465952Smartinh */
185d465952Smartinh
195d465952Smartinh #include <sys/types.h>
205d465952Smartinh #include <sys/tree.h>
2137ee45a1Smartinh #include <sys/stat.h>
225d465952Smartinh #include <sys/queue.h>
235d465952Smartinh #include <sys/uio.h>
245d465952Smartinh
255d465952Smartinh #include <assert.h>
265d465952Smartinh #include <err.h>
275d465952Smartinh #include <errno.h>
285d465952Smartinh #include <fcntl.h>
295d465952Smartinh #include <stddef.h>
305d465952Smartinh #include <stdint.h>
315d465952Smartinh #include <stdio.h>
325d465952Smartinh #include <stdlib.h>
335d465952Smartinh #include <string.h>
345d465952Smartinh #include <time.h>
355d465952Smartinh #include <unistd.h>
365d465952Smartinh
375d465952Smartinh #include "btree.h"
385d465952Smartinh
395d465952Smartinh /* #define DEBUG */
405d465952Smartinh
415d465952Smartinh #ifdef DEBUG
425d465952Smartinh # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \
435d465952Smartinh fprintf(stderr, __VA_ARGS__); \
445d465952Smartinh fprintf(stderr, "\n"); } while(0)
455d465952Smartinh #else
465d465952Smartinh # define DPRINTF(...)
475d465952Smartinh #endif
485d465952Smartinh
495d465952Smartinh #define PAGESIZE 4096
505d465952Smartinh #define BT_MINKEYS 4
515d465952Smartinh #define BT_MAGIC 0xB3DBB3DB
5237ee45a1Smartinh #define BT_VERSION 4
535d465952Smartinh #define MAXKEYSIZE 255
545d465952Smartinh
555d465952Smartinh #define P_INVALID 0xFFFFFFFF
565d465952Smartinh
575d465952Smartinh #define F_ISSET(w, f) (((w) & (f)) == (f))
58*3572a0e3Ssthen #define MINIMUM(a,b) ((a) < (b) ? (a) : (b))
595d465952Smartinh
60176c8cbcSmartinh typedef uint32_t pgno_t;
61176c8cbcSmartinh typedef uint16_t indx_t;
62176c8cbcSmartinh
635d465952Smartinh /* There are four page types: meta, index, leaf and overflow.
645d465952Smartinh * They all share the same page header.
655d465952Smartinh */
665d465952Smartinh struct page { /* represents an on-disk page */
6737ee45a1Smartinh pgno_t pgno; /* page number */
685d465952Smartinh #define P_BRANCH 0x01 /* branch page */
695d465952Smartinh #define P_LEAF 0x02 /* leaf page */
705d465952Smartinh #define P_OVERFLOW 0x04 /* overflow page */
715d465952Smartinh #define P_META 0x08 /* meta page */
7237ee45a1Smartinh #define P_HEAD 0x10 /* header page */
735d465952Smartinh uint32_t flags;
745d465952Smartinh #define lower b.fb.fb_lower
755d465952Smartinh #define upper b.fb.fb_upper
765d465952Smartinh #define p_next_pgno b.pb_next_pgno
775d465952Smartinh union page_bounds {
785d465952Smartinh struct {
795d465952Smartinh indx_t fb_lower; /* lower bound of free space */
805d465952Smartinh indx_t fb_upper; /* upper bound of free space */
815d465952Smartinh } fb;
825d465952Smartinh pgno_t pb_next_pgno; /* overflow page linked list */
835d465952Smartinh } b;
8437ee45a1Smartinh indx_t ptrs[1]; /* dynamic size */
85bf245c3bSmartinh } __packed;
865d465952Smartinh
8737ee45a1Smartinh #define PAGEHDRSZ offsetof(struct page, ptrs)
885d465952Smartinh
8937ee45a1Smartinh #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1)
9037ee45a1Smartinh #define NUMKEYS(mp) (((mp)->page->lower - PAGEHDRSZ) >> 1)
9137ee45a1Smartinh #define SIZELEFT(mp) (indx_t)((mp)->page->upper - (mp)->page->lower)
92944ad5d4Smartinh #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \
9337ee45a1Smartinh ((bt)->head.psize - PAGEHDRSZ))
9437ee45a1Smartinh #define IS_LEAF(mp) F_ISSET((mp)->page->flags, P_LEAF)
9537ee45a1Smartinh #define IS_BRANCH(mp) F_ISSET((mp)->page->flags, P_BRANCH)
9637ee45a1Smartinh #define IS_OVERFLOW(mp) F_ISSET((mp)->page->flags, P_OVERFLOW)
9737ee45a1Smartinh
9837ee45a1Smartinh struct bt_head { /* header page content */
9937ee45a1Smartinh uint32_t magic;
1005d465952Smartinh uint32_t version;
1015d465952Smartinh uint32_t flags;
1025d465952Smartinh uint32_t psize; /* page size */
10337ee45a1Smartinh } __packed;
10437ee45a1Smartinh
10537ee45a1Smartinh struct bt_meta { /* meta (footer) page content */
10637ee45a1Smartinh #define BT_TOMBSTONE 0x01 /* file is replaced */
10737ee45a1Smartinh uint32_t flags;
1085d465952Smartinh pgno_t root; /* page number of root page */
1095d465952Smartinh pgno_t prev_meta; /* previous meta page number */
1105d465952Smartinh time_t created_at;
1115d465952Smartinh uint32_t branch_pages;
1125d465952Smartinh uint32_t leaf_pages;
1135d465952Smartinh uint32_t overflow_pages;
1145d465952Smartinh uint32_t revisions;
1155d465952Smartinh uint32_t depth;
1165d465952Smartinh uint64_t entries;
1175d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH];
118bf245c3bSmartinh } __packed;
1195d465952Smartinh
1205d465952Smartinh struct btkey {
1215d465952Smartinh size_t len;
1225d465952Smartinh char str[MAXKEYSIZE];
1235d465952Smartinh };
1245d465952Smartinh
1255d465952Smartinh struct mpage { /* an in-memory cached page */
1265d465952Smartinh RB_ENTRY(mpage) entry; /* page cache entry */
1275d465952Smartinh SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */
1285d465952Smartinh TAILQ_ENTRY(mpage) lru_next; /* LRU queue */
1295d465952Smartinh struct mpage *parent; /* NULL if root */
1305d465952Smartinh unsigned int parent_index; /* keep track of node index */
1315d465952Smartinh struct btkey prefix;
13237ee45a1Smartinh struct page *page;
1335d465952Smartinh pgno_t pgno; /* copy of page->pgno */
1345d465952Smartinh short ref; /* increased by cursors */
1355d465952Smartinh short dirty; /* 1 if on dirty queue */
1365d465952Smartinh };
1375d465952Smartinh RB_HEAD(page_cache, mpage);
1385d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage);
1395d465952Smartinh TAILQ_HEAD(lru_queue, mpage);
1405d465952Smartinh
1415d465952Smartinh static int mpage_cmp(struct mpage *a, struct mpage *b);
1425d465952Smartinh static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno);
1435d465952Smartinh static void mpage_add(struct btree *bt, struct mpage *mp);
14437ee45a1Smartinh static void mpage_free(struct mpage *mp);
1455d465952Smartinh static void mpage_del(struct btree *bt, struct mpage *mp);
146e14f8e24Smartinh static void mpage_flush(struct btree *bt);
14737ee45a1Smartinh static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp);
1485d465952Smartinh static void mpage_prune(struct btree *bt);
1495d465952Smartinh static void mpage_dirty(struct btree *bt, struct mpage *mp);
1507f8a466aSmartinh static struct mpage *mpage_touch(struct btree *bt, struct mpage *mp);
1515d465952Smartinh
1525d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp);
1535d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp);
1545d465952Smartinh
1555d465952Smartinh struct ppage { /* ordered list of pages */
1565d465952Smartinh SLIST_ENTRY(ppage) entry;
1575d465952Smartinh struct mpage *mpage;
1585d465952Smartinh unsigned int ki; /* cursor index on page */
1595d465952Smartinh };
1605d465952Smartinh SLIST_HEAD(page_stack, ppage);
1615d465952Smartinh
1625d465952Smartinh #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack)
1635d465952Smartinh #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack)
1645d465952Smartinh #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry)
1655d465952Smartinh #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry)
1665d465952Smartinh
1675d465952Smartinh struct cursor {
1685d465952Smartinh struct btree *bt;
1695d465952Smartinh struct btree_txn *txn;
1705d465952Smartinh struct page_stack stack; /* stack of parent pages */
1715d465952Smartinh short initialized; /* 1 if initialized */
1725d465952Smartinh short eof; /* 1 if end is reached */
1735d465952Smartinh };
1745d465952Smartinh
1755d465952Smartinh #define METAHASHLEN offsetof(struct bt_meta, hash)
17637ee45a1Smartinh #define METADATA(p) ((void *)((char *)p + PAGEHDRSZ))
1775d465952Smartinh
1785d465952Smartinh struct node {
1795d465952Smartinh #define n_pgno p.np_pgno
1805d465952Smartinh #define n_dsize p.np_dsize
1815d465952Smartinh union {
1825d465952Smartinh pgno_t np_pgno; /* child page number */
1835d465952Smartinh uint32_t np_dsize; /* leaf data size */
1845d465952Smartinh } p;
1855d465952Smartinh uint16_t ksize; /* key size */
1865d465952Smartinh #define F_BIGDATA 0x01 /* data put on overflow page */
1875d465952Smartinh uint8_t flags;
188a33d639aSmartinh char data[1];
189bf245c3bSmartinh } __packed;
1905d465952Smartinh
1915d465952Smartinh struct btree_txn {
1925d465952Smartinh pgno_t root; /* current / new root page */
1935d465952Smartinh pgno_t next_pgno; /* next unallocated page */
1945d465952Smartinh struct btree *bt; /* btree is ref'd */
1955d465952Smartinh struct dirty_queue *dirty_queue; /* modified pages */
1965d465952Smartinh #define BT_TXN_RDONLY 0x01 /* read-only transaction */
1975d465952Smartinh #define BT_TXN_ERROR 0x02 /* an error has occurred */
1985d465952Smartinh unsigned int flags;
1995d465952Smartinh };
2005d465952Smartinh
2015d465952Smartinh struct btree {
2025d465952Smartinh int fd;
2035d465952Smartinh char *path;
2045d465952Smartinh #define BT_FIXPADDING 0x01 /* internal */
2055d465952Smartinh unsigned int flags;
2065d465952Smartinh bt_cmp_func cmp; /* user compare function */
20737ee45a1Smartinh struct bt_head head;
20837ee45a1Smartinh struct bt_meta meta;
2095d465952Smartinh struct page_cache *page_cache;
2105d465952Smartinh struct lru_queue *lru_queue;
2115d465952Smartinh struct btree_txn *txn; /* current write transaction */
212e14f8e24Smartinh int ref; /* increased by cursors & txn */
2135d465952Smartinh struct btree_stat stat;
2145d465952Smartinh off_t size; /* current file size */
2155d465952Smartinh };
2165d465952Smartinh
2175d465952Smartinh #define NODESIZE offsetof(struct node, data)
2185d465952Smartinh
2195d465952Smartinh #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->size))
2205d465952Smartinh #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size)
2215d465952Smartinh #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i]))
22237ee45a1Smartinh #define NODEPTR(mp, i) NODEPTRP((mp)->page, i)
2235d465952Smartinh #define NODEKEY(node) (void *)((node)->data)
2245d465952Smartinh #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize)
2255d465952Smartinh #define NODEPGNO(node) ((node)->p.np_pgno)
2265d465952Smartinh #define NODEDSZ(node) ((node)->p.np_dsize)
2275d465952Smartinh
2285d465952Smartinh #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */
2295d465952Smartinh #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */
2305d465952Smartinh
2315d465952Smartinh static int btree_read_page(struct btree *bt, pgno_t pgno,
2325d465952Smartinh struct page *page);
23337ee45a1Smartinh static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno);
2345d465952Smartinh static int btree_search_page_root(struct btree *bt,
2355d465952Smartinh struct mpage *root, struct btval *key,
2365d465952Smartinh struct cursor *cursor, int modify,
2375d465952Smartinh struct mpage **mpp);
2385d465952Smartinh static int btree_search_page(struct btree *bt,
2395d465952Smartinh struct btree_txn *txn, struct btval *key,
2405d465952Smartinh struct cursor *cursor, int modify,
2415d465952Smartinh struct mpage **mpp);
2425d465952Smartinh
24337ee45a1Smartinh static int btree_write_header(struct btree *bt, int fd);
24437ee45a1Smartinh static int btree_read_header(struct btree *bt);
2455d465952Smartinh static int btree_is_meta_page(struct page *p);
2465d465952Smartinh static int btree_read_meta(struct btree *bt, pgno_t *p_next);
247e14f8e24Smartinh static int btree_write_meta(struct btree *bt, pgno_t root,
248e14f8e24Smartinh unsigned int flags);
249408be415Smartinh static void btree_ref(struct btree *bt);
2505d465952Smartinh
2515d465952Smartinh static struct node *btree_search_node(struct btree *bt, struct mpage *mp,
2525d465952Smartinh struct btval *key, int *exactp, unsigned int *kip);
2535d465952Smartinh static int btree_add_node(struct btree *bt, struct mpage *mp,
2545d465952Smartinh indx_t indx, struct btval *key, struct btval *data,
2555d465952Smartinh pgno_t pgno, uint8_t flags);
2565d465952Smartinh static void btree_del_node(struct btree *bt, struct mpage *mp,
2575d465952Smartinh indx_t indx);
2585d465952Smartinh static int btree_read_data(struct btree *bt, struct mpage *mp,
2595d465952Smartinh struct node *leaf, struct btval *data);
2605d465952Smartinh
2615d465952Smartinh static int btree_rebalance(struct btree *bt, struct mpage *mp);
2625d465952Smartinh static int btree_update_key(struct btree *bt, struct mpage *mp,
2635d465952Smartinh indx_t indx, struct btval *key);
2645d465952Smartinh static int btree_adjust_prefix(struct btree *bt,
2655d465952Smartinh struct mpage *src, int delta);
2665d465952Smartinh static int btree_move_node(struct btree *bt, struct mpage *src,
2675d465952Smartinh indx_t srcindx, struct mpage *dst, indx_t dstindx);
2685d465952Smartinh static int btree_merge(struct btree *bt, struct mpage *src,
2695d465952Smartinh struct mpage *dst);
2705d465952Smartinh static int btree_split(struct btree *bt, struct mpage **mpp,
2715d465952Smartinh unsigned int *newindxp, struct btval *newkey,
2725d465952Smartinh struct btval *newdata, pgno_t newpgno);
2735d465952Smartinh static struct mpage *btree_new_page(struct btree *bt, uint32_t flags);
2745d465952Smartinh static int btree_write_overflow_data(struct btree *bt,
2755d465952Smartinh struct page *p, struct btval *data);
2765d465952Smartinh
2775d465952Smartinh static void cursor_pop_page(struct cursor *cursor);
2785d465952Smartinh static struct ppage *cursor_push_page(struct cursor *cursor,
2795d465952Smartinh struct mpage *mp);
2805d465952Smartinh
2815d465952Smartinh static int bt_set_key(struct btree *bt, struct mpage *mp,
2825d465952Smartinh struct node *node, struct btval *key);
2835d465952Smartinh static int btree_sibling(struct cursor *cursor, int move_right);
2845d465952Smartinh static int btree_cursor_next(struct cursor *cursor,
2855d465952Smartinh struct btval *key, struct btval *data);
2865d465952Smartinh static int btree_cursor_set(struct cursor *cursor,
2875cea9f8fSmartinh struct btval *key, struct btval *data, int *exactp);
2885d465952Smartinh static int btree_cursor_first(struct cursor *cursor,
2895d465952Smartinh struct btval *key, struct btval *data);
2905d465952Smartinh
2915d465952Smartinh static void bt_reduce_separator(struct btree *bt, struct node *min,
2925d465952Smartinh struct btval *sep);
2935d465952Smartinh static void remove_prefix(struct btree *bt, struct btval *key,
2945d465952Smartinh size_t pfxlen);
2955d465952Smartinh static void expand_prefix(struct btree *bt, struct mpage *mp,
2965d465952Smartinh indx_t indx, struct btkey *expkey);
2975d465952Smartinh static void concat_prefix(struct btree *bt, char *s1, size_t n1,
2985d465952Smartinh char *s2, size_t n2, char *cs, size_t *cn);
2995d465952Smartinh static void common_prefix(struct btree *bt, struct btkey *min,
3005d465952Smartinh struct btkey *max, struct btkey *pfx);
3015d465952Smartinh static void find_common_prefix(struct btree *bt, struct mpage *mp);
3025d465952Smartinh
30337ee45a1Smartinh static size_t bt_leaf_size(struct btree *bt, struct btval *key,
30437ee45a1Smartinh struct btval *data);
30537ee45a1Smartinh static size_t bt_branch_size(struct btree *bt, struct btval *key);
3065d465952Smartinh
3075d465952Smartinh static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno,
30837ee45a1Smartinh struct btree *btc);
3095d465952Smartinh
3105d465952Smartinh static int memncmp(const void *s1, size_t n1,
3115d465952Smartinh const void *s2, size_t n2);
3125d465952Smartinh static int memnrcmp(const void *s1, size_t n1,
3135d465952Smartinh const void *s2, size_t n2);
3145d465952Smartinh
3155d465952Smartinh static int
memncmp(const void * s1,size_t n1,const void * s2,size_t n2)3165d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2)
3175d465952Smartinh {
3185d465952Smartinh if (n1 < n2) {
3195d465952Smartinh if (memcmp(s1, s2, n1) == 0)
3205d465952Smartinh return -1;
3215d465952Smartinh }
3225d465952Smartinh else if (n1 > n2) {
3235d465952Smartinh if (memcmp(s1, s2, n2) == 0)
3245d465952Smartinh return 1;
3255d465952Smartinh }
3265d465952Smartinh return memcmp(s1, s2, n1);
3275d465952Smartinh }
3285d465952Smartinh
3295d465952Smartinh static int
memnrcmp(const void * s1,size_t n1,const void * s2,size_t n2)3305d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2)
3315d465952Smartinh {
3325d465952Smartinh const unsigned char *p1;
3335d465952Smartinh const unsigned char *p2;
3345d465952Smartinh
3355d465952Smartinh if (n1 == 0)
3365d465952Smartinh return n2 == 0 ? 0 : -1;
3375d465952Smartinh
3385d465952Smartinh if (n2 == 0)
3395d465952Smartinh return n1 == 0 ? 0 : 1;
3405d465952Smartinh
3415d465952Smartinh p1 = (const unsigned char *)s1 + n1 - 1;
3425d465952Smartinh p2 = (const unsigned char *)s2 + n2 - 1;
3435d465952Smartinh
3445d465952Smartinh while (*p1 == *p2) {
3455d465952Smartinh if (p1 == s1)
3465d465952Smartinh return (p2 == s2) ? 0 : -1;
3475d465952Smartinh if (p2 == s2)
3485d465952Smartinh return (p1 == p2) ? 0 : 1;
3495d465952Smartinh p1--;
3505d465952Smartinh p2--;
3515d465952Smartinh }
3525d465952Smartinh return *p1 - *p2;
3535d465952Smartinh }
3545d465952Smartinh
3555d465952Smartinh int
btree_cmp(struct btree * bt,const struct btval * a,const struct btval * b)3565d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b)
3575d465952Smartinh {
3585d465952Smartinh return bt->cmp(a, b);
3595d465952Smartinh }
3605d465952Smartinh
3615d465952Smartinh static void
common_prefix(struct btree * bt,struct btkey * min,struct btkey * max,struct btkey * pfx)3625d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max,
3635d465952Smartinh struct btkey *pfx)
3645d465952Smartinh {
3655d465952Smartinh size_t n = 0;
3665d465952Smartinh char *p1;
3675d465952Smartinh char *p2;
3685d465952Smartinh
3695d465952Smartinh if (min->len == 0 || max->len == 0) {
3705d465952Smartinh pfx->len = 0;
3715d465952Smartinh return;
3725d465952Smartinh }
3735d465952Smartinh
3745d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
3755d465952Smartinh p1 = min->str + min->len - 1;
3765d465952Smartinh p2 = max->str + max->len - 1;
3775d465952Smartinh
3785d465952Smartinh while (*p1 == *p2) {
3795d465952Smartinh if (p1 < min->str || p2 < max->str)
3805d465952Smartinh break;
3815d465952Smartinh p1--;
3825d465952Smartinh p2--;
3835d465952Smartinh n++;
3845d465952Smartinh }
3855d465952Smartinh
3865d465952Smartinh assert(n <= (int)sizeof(pfx->str));
3875d465952Smartinh pfx->len = n;
3885d465952Smartinh bcopy(p2 + 1, pfx->str, n);
3895d465952Smartinh } else {
3905d465952Smartinh p1 = min->str;
3915d465952Smartinh p2 = max->str;
3925d465952Smartinh
3935d465952Smartinh while (*p1 == *p2) {
3945d465952Smartinh if (n == min->len || n == max->len)
3955d465952Smartinh break;
3965d465952Smartinh p1++;
3975d465952Smartinh p2++;
3985d465952Smartinh n++;
3995d465952Smartinh }
4005d465952Smartinh
4015d465952Smartinh assert(n <= (int)sizeof(pfx->str));
4025d465952Smartinh pfx->len = n;
4035d465952Smartinh bcopy(max->str, pfx->str, n);
4045d465952Smartinh }
4055d465952Smartinh }
4065d465952Smartinh
4075d465952Smartinh static void
remove_prefix(struct btree * bt,struct btval * key,size_t pfxlen)4085d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen)
4095d465952Smartinh {
4105d465952Smartinh if (pfxlen == 0 || bt->cmp != NULL)
4115d465952Smartinh return;
4125d465952Smartinh
4135d465952Smartinh DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen,
4145d465952Smartinh (int)key->size, (char *)key->data);
4155d465952Smartinh assert(pfxlen <= key->size);
4165d465952Smartinh key->size -= pfxlen;
4175d465952Smartinh if (!F_ISSET(bt->flags, BT_REVERSEKEY))
4185d465952Smartinh key->data = (char *)key->data + pfxlen;
4195d465952Smartinh }
4205d465952Smartinh
4215d465952Smartinh static void
expand_prefix(struct btree * bt,struct mpage * mp,indx_t indx,struct btkey * expkey)4225d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx,
4235d465952Smartinh struct btkey *expkey)
4245d465952Smartinh {
4255d465952Smartinh struct node *node;
4265d465952Smartinh
4275d465952Smartinh node = NODEPTR(mp, indx);
4285d465952Smartinh expkey->len = sizeof(expkey->str);
4295d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len,
4305d465952Smartinh NODEKEY(node), node->ksize, expkey->str, &expkey->len);
4315d465952Smartinh }
4325d465952Smartinh
4335d465952Smartinh static int
bt_cmp(struct btree * bt,const struct btval * key1,const struct btval * key2,struct btkey * pfx)4345d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2,
4355d465952Smartinh struct btkey *pfx)
4365d465952Smartinh {
4375d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY))
4385d465952Smartinh return memnrcmp(key1->data, key1->size - pfx->len,
4395d465952Smartinh key2->data, key2->size);
4405d465952Smartinh else
4415d465952Smartinh return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len,
4425d465952Smartinh key2->data, key2->size);
4435d465952Smartinh }
4445d465952Smartinh
4455d465952Smartinh void
btval_reset(struct btval * btv)4465d465952Smartinh btval_reset(struct btval *btv)
4475d465952Smartinh {
4485d465952Smartinh if (btv) {
4495d465952Smartinh if (btv->mp)
4505d465952Smartinh btv->mp->ref--;
4515d465952Smartinh if (btv->free_data)
4525d465952Smartinh free(btv->data);
45337f4b933Smmcc memset(btv, 0, sizeof(*btv));
4545d465952Smartinh }
4555d465952Smartinh }
4565d465952Smartinh
4575d465952Smartinh static int
mpage_cmp(struct mpage * a,struct mpage * b)4585d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b)
4595d465952Smartinh {
4605d465952Smartinh if (a->pgno > b->pgno)
4615d465952Smartinh return 1;
4625d465952Smartinh if (a->pgno < b->pgno)
4635d465952Smartinh return -1;
4645d465952Smartinh return 0;
4655d465952Smartinh }
4665d465952Smartinh
4675d465952Smartinh static struct mpage *
mpage_lookup(struct btree * bt,pgno_t pgno)4685d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno)
4695d465952Smartinh {
4705d465952Smartinh struct mpage find, *mp;
4715d465952Smartinh
4725d465952Smartinh find.pgno = pgno;
4735d465952Smartinh mp = RB_FIND(page_cache, bt->page_cache, &find);
4745d465952Smartinh if (mp) {
4755d465952Smartinh bt->stat.hits++;
4765d465952Smartinh /* Update LRU queue. Move page to the end. */
4775d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
4785d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
4795d465952Smartinh }
4805d465952Smartinh return mp;
4815d465952Smartinh }
4825d465952Smartinh
4835d465952Smartinh static void
mpage_add(struct btree * bt,struct mpage * mp)4845d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp)
4855d465952Smartinh {
4865d465952Smartinh assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL);
4875d465952Smartinh bt->stat.cache_size++;
4885d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next);
48937ee45a1Smartinh }
49037ee45a1Smartinh
49137ee45a1Smartinh static void
mpage_free(struct mpage * mp)49237ee45a1Smartinh mpage_free(struct mpage *mp)
49337ee45a1Smartinh {
49437ee45a1Smartinh if (mp != NULL) {
49537ee45a1Smartinh free(mp->page);
49637ee45a1Smartinh free(mp);
49737ee45a1Smartinh }
4985d465952Smartinh }
4995d465952Smartinh
5005d465952Smartinh static void
mpage_del(struct btree * bt,struct mpage * mp)5015d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp)
5025d465952Smartinh {
5035d465952Smartinh assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp);
5045d465952Smartinh assert(bt->stat.cache_size > 0);
5055d465952Smartinh bt->stat.cache_size--;
5065d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next);
5075d465952Smartinh }
5085d465952Smartinh
509e14f8e24Smartinh static void
mpage_flush(struct btree * bt)510e14f8e24Smartinh mpage_flush(struct btree *bt)
511e14f8e24Smartinh {
512e14f8e24Smartinh struct mpage *mp;
513e14f8e24Smartinh
514e14f8e24Smartinh while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) {
515e14f8e24Smartinh mpage_del(bt, mp);
51637ee45a1Smartinh mpage_free(mp);
517e14f8e24Smartinh }
518e14f8e24Smartinh }
519e14f8e24Smartinh
5205d465952Smartinh static struct mpage *
mpage_copy(struct btree * bt,struct mpage * mp)52137ee45a1Smartinh mpage_copy(struct btree *bt, struct mpage *mp)
5225d465952Smartinh {
5235d465952Smartinh struct mpage *copy;
5245d465952Smartinh
5255d465952Smartinh if ((copy = calloc(1, sizeof(*copy))) == NULL)
5265d465952Smartinh return NULL;
52737ee45a1Smartinh if ((copy->page = malloc(bt->head.psize)) == NULL) {
52837ee45a1Smartinh free(copy);
52937ee45a1Smartinh return NULL;
53037ee45a1Smartinh }
53137ee45a1Smartinh bcopy(mp->page, copy->page, bt->head.psize);
5325d465952Smartinh bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix));
5335d465952Smartinh copy->parent = mp->parent;
5345d465952Smartinh copy->parent_index = mp->parent_index;
5355d465952Smartinh copy->pgno = mp->pgno;
5365d465952Smartinh
5375d465952Smartinh return copy;
5385d465952Smartinh }
5395d465952Smartinh
5405d465952Smartinh /* Remove the least recently used memory pages until the cache size is
5415d465952Smartinh * within the configured bounds. Pages referenced by cursors or returned
5425d465952Smartinh * key/data are not pruned.
5435d465952Smartinh */
5445d465952Smartinh static void
mpage_prune(struct btree * bt)5455d465952Smartinh mpage_prune(struct btree *bt)
5465d465952Smartinh {
5475d465952Smartinh struct mpage *mp, *next;
5485d465952Smartinh
5495d465952Smartinh for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) {
5505d465952Smartinh if (bt->stat.cache_size <= bt->stat.max_cache)
5515d465952Smartinh break;
5525d465952Smartinh next = TAILQ_NEXT(mp, lru_next);
5535d465952Smartinh if (!mp->dirty && mp->ref <= 0) {
5545d465952Smartinh mpage_del(bt, mp);
55537ee45a1Smartinh mpage_free(mp);
5565d465952Smartinh }
5575d465952Smartinh }
5585d465952Smartinh }
5595d465952Smartinh
5605d465952Smartinh /* Mark a page as dirty and push it on the dirty queue.
5615d465952Smartinh */
5625d465952Smartinh static void
mpage_dirty(struct btree * bt,struct mpage * mp)5635d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp)
5645d465952Smartinh {
5655d465952Smartinh assert(bt != NULL);
5665d465952Smartinh assert(bt->txn != NULL);
5675d465952Smartinh
5685d465952Smartinh if (!mp->dirty) {
5695d465952Smartinh mp->dirty = 1;
5705d465952Smartinh SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
5715d465952Smartinh }
5725d465952Smartinh }
5735d465952Smartinh
5745d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno.
5755d465952Smartinh */
5767f8a466aSmartinh static struct mpage *
mpage_touch(struct btree * bt,struct mpage * mp)5775d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp)
5785d465952Smartinh {
5795d465952Smartinh assert(bt != NULL);
5805d465952Smartinh assert(bt->txn != NULL);
5815d465952Smartinh assert(mp != NULL);
5825d465952Smartinh
5835d465952Smartinh if (!mp->dirty) {
5845d465952Smartinh DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
5855d465952Smartinh if (mp->ref == 0)
5865d465952Smartinh mpage_del(bt, mp);
5877f8a466aSmartinh else {
5887f8a466aSmartinh if ((mp = mpage_copy(bt, mp)) == NULL)
5897f8a466aSmartinh return NULL;
5907f8a466aSmartinh }
59137ee45a1Smartinh mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
5925d465952Smartinh mpage_dirty(bt, mp);
5935d465952Smartinh mpage_add(bt, mp);
5945d465952Smartinh
5955d465952Smartinh /* Update the page number to new touched page. */
5965d465952Smartinh if (mp->parent != NULL)
5975d465952Smartinh NODEPGNO(NODEPTR(mp->parent,
5985d465952Smartinh mp->parent_index)) = mp->pgno;
5995d465952Smartinh }
6007f8a466aSmartinh
6017f8a466aSmartinh return mp;
6025d465952Smartinh }
6035d465952Smartinh
6045d465952Smartinh static int
btree_read_page(struct btree * bt,pgno_t pgno,struct page * page)6055d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
6065d465952Smartinh {
6075d465952Smartinh ssize_t rc;
6085d465952Smartinh
6095d465952Smartinh DPRINTF("reading page %u", pgno);
6105d465952Smartinh bt->stat.reads++;
61137ee45a1Smartinh if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) {
6125d465952Smartinh DPRINTF("page %u doesn't exist", pgno);
6130279e526Smartinh errno = ENOENT;
6140279e526Smartinh return BT_FAIL;
61536e226d9Smartinh } else if (rc != (ssize_t)bt->head.psize) {
61637ee45a1Smartinh if (rc > 0)
61737ee45a1Smartinh errno = EINVAL;
6185d465952Smartinh DPRINTF("read: %s", strerror(errno));
6195d465952Smartinh return BT_FAIL;
6205d465952Smartinh }
6215d465952Smartinh
6225d465952Smartinh if (page->pgno != pgno) {
6235d465952Smartinh DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
6245d465952Smartinh errno = EINVAL;
6255d465952Smartinh return BT_FAIL;
6265d465952Smartinh }
6275d465952Smartinh
62837ee45a1Smartinh DPRINTF("page %u has flags 0x%X", pgno, page->flags);
62937ee45a1Smartinh
6305d465952Smartinh return BT_SUCCESS;
6315d465952Smartinh }
6325d465952Smartinh
6335d465952Smartinh int
btree_sync(struct btree * bt)6345d465952Smartinh btree_sync(struct btree *bt)
6355d465952Smartinh {
6365d465952Smartinh if (!F_ISSET(bt->flags, BT_NOSYNC))
6375d465952Smartinh return fsync(bt->fd);
6385d465952Smartinh return 0;
6395d465952Smartinh }
6405d465952Smartinh
6415d465952Smartinh struct btree_txn *
btree_txn_begin(struct btree * bt,int rdonly)6425d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly)
6435d465952Smartinh {
6445d465952Smartinh struct btree_txn *txn;
6455d465952Smartinh
6465d465952Smartinh if (!rdonly && bt->txn != NULL) {
6475d465952Smartinh DPRINTF("write transaction already begun");
6480279e526Smartinh errno = EBUSY;
6495d465952Smartinh return NULL;
6505d465952Smartinh }
6515d465952Smartinh
6525d465952Smartinh if ((txn = calloc(1, sizeof(*txn))) == NULL) {
6535d465952Smartinh DPRINTF("calloc: %s", strerror(errno));
6545d465952Smartinh return NULL;
6555d465952Smartinh }
6565d465952Smartinh
6575d465952Smartinh if (rdonly) {
6585d465952Smartinh txn->flags |= BT_TXN_RDONLY;
6595d465952Smartinh } else {
6605d465952Smartinh txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
6615d465952Smartinh if (txn->dirty_queue == NULL) {
6625d465952Smartinh free(txn);
6635d465952Smartinh return NULL;
6645d465952Smartinh }
6655d465952Smartinh SIMPLEQ_INIT(txn->dirty_queue);
6665d465952Smartinh
6675d465952Smartinh DPRINTF("taking write lock on txn %p", txn);
6685d465952Smartinh if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
6690279e526Smartinh DPRINTF("flock: %s", strerror(errno));
6700279e526Smartinh errno = EBUSY;
6715d465952Smartinh free(txn->dirty_queue);
6725d465952Smartinh free(txn);
6735d465952Smartinh return NULL;
6745d465952Smartinh }
6755d465952Smartinh bt->txn = txn;
6765d465952Smartinh }
6775d465952Smartinh
6785d465952Smartinh txn->bt = bt;
679408be415Smartinh btree_ref(bt);
6805d465952Smartinh
6810279e526Smartinh if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
6825d465952Smartinh btree_txn_abort(txn);
6835d465952Smartinh return NULL;
6845d465952Smartinh }
6855d465952Smartinh
68637ee45a1Smartinh txn->root = bt->meta.root;
6875d465952Smartinh DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);
6885d465952Smartinh
6895d465952Smartinh return txn;
6905d465952Smartinh }
6915d465952Smartinh
6925d465952Smartinh void
btree_txn_abort(struct btree_txn * txn)6935d465952Smartinh btree_txn_abort(struct btree_txn *txn)
6945d465952Smartinh {
6955d465952Smartinh struct mpage *mp;
6965d465952Smartinh struct btree *bt;
6975d465952Smartinh
6985d465952Smartinh if (txn == NULL)
6995d465952Smartinh return;
7005d465952Smartinh
7015d465952Smartinh bt = txn->bt;
7025d465952Smartinh DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);
7035d465952Smartinh
704f189d82aSmartinh if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7055d465952Smartinh /* Discard all dirty pages.
7065d465952Smartinh */
7075d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
7085d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue);
7095d465952Smartinh assert(mp->ref == 0); /* cursors should be closed */
7105d465952Smartinh mpage_del(bt, mp);
7115d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
71246fd5648Smartinh mpage_free(mp);
7135d465952Smartinh }
7145d465952Smartinh
7155d465952Smartinh DPRINTF("releasing write lock on txn %p", txn);
7165d465952Smartinh txn->bt->txn = NULL;
7175d465952Smartinh if (flock(txn->bt->fd, LOCK_UN) != 0) {
7185d465952Smartinh DPRINTF("failed to unlock fd %d: %s",
7195d465952Smartinh txn->bt->fd, strerror(errno));
7205d465952Smartinh }
7215d465952Smartinh free(txn->dirty_queue);
7225d465952Smartinh }
7235d465952Smartinh
7245d465952Smartinh btree_close(txn->bt);
7255d465952Smartinh free(txn);
7265d465952Smartinh }
7275d465952Smartinh
7285d465952Smartinh int
btree_txn_commit(struct btree_txn * txn)7295d465952Smartinh btree_txn_commit(struct btree_txn *txn)
7305d465952Smartinh {
7315d465952Smartinh int n, done;
7325d465952Smartinh ssize_t rc;
7335d465952Smartinh off_t size;
7345d465952Smartinh struct mpage *mp;
7355d465952Smartinh struct btree *bt;
7365d465952Smartinh struct iovec iov[BT_COMMIT_PAGES];
7375d465952Smartinh
7385d465952Smartinh assert(txn != NULL);
7395d465952Smartinh assert(txn->bt != NULL);
7405d465952Smartinh
7415d465952Smartinh bt = txn->bt;
7425d465952Smartinh
7435d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
7445d465952Smartinh DPRINTF("attempt to commit read-only transaction");
7455d465952Smartinh btree_txn_abort(txn);
7460279e526Smartinh errno = EPERM;
7475d465952Smartinh return BT_FAIL;
7485d465952Smartinh }
7495d465952Smartinh
7505d465952Smartinh if (txn != bt->txn) {
7515d465952Smartinh DPRINTF("attempt to commit unknown transaction");
7525d465952Smartinh btree_txn_abort(txn);
7530279e526Smartinh errno = EINVAL;
7545d465952Smartinh return BT_FAIL;
7555d465952Smartinh }
7565d465952Smartinh
7575d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
7585d465952Smartinh DPRINTF("error flag is set, can't commit");
7595d465952Smartinh btree_txn_abort(txn);
7600279e526Smartinh errno = EINVAL;
7615d465952Smartinh return BT_FAIL;
7625d465952Smartinh }
7635d465952Smartinh
7645d465952Smartinh if (SIMPLEQ_EMPTY(txn->dirty_queue))
7655d465952Smartinh goto done;
7665d465952Smartinh
7675d465952Smartinh if (F_ISSET(bt->flags, BT_FIXPADDING)) {
7685d465952Smartinh size = lseek(bt->fd, 0, SEEK_END);
76937ee45a1Smartinh size += bt->head.psize - (size % bt->head.psize);
7705d465952Smartinh DPRINTF("extending to multiple of page size: %llu", size);
7715d465952Smartinh if (ftruncate(bt->fd, size) != 0) {
7725d465952Smartinh DPRINTF("ftruncate: %s", strerror(errno));
7735d465952Smartinh btree_txn_abort(txn);
7745d465952Smartinh return BT_FAIL;
7755d465952Smartinh }
7765d465952Smartinh bt->flags &= ~BT_FIXPADDING;
7775d465952Smartinh }
7785d465952Smartinh
7795d465952Smartinh DPRINTF("committing transaction on btree %p, root page %u",
7805d465952Smartinh bt, txn->root);
7815d465952Smartinh
7825d465952Smartinh /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
7835d465952Smartinh */
7845d465952Smartinh do {
7855d465952Smartinh n = 0;
7865d465952Smartinh done = 1;
7875d465952Smartinh SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
788ca70fba4Smmcc DPRINTF("committing page %u", mp->pgno);
78937ee45a1Smartinh iov[n].iov_len = bt->head.psize;
79037ee45a1Smartinh iov[n].iov_base = mp->page;
7915d465952Smartinh if (++n >= BT_COMMIT_PAGES) {
7925d465952Smartinh done = 0;
7935d465952Smartinh break;
7945d465952Smartinh }
7955d465952Smartinh }
7965d465952Smartinh
7975d465952Smartinh if (n == 0)
7985d465952Smartinh break;
7995d465952Smartinh
800ca70fba4Smmcc DPRINTF("committing %u dirty pages", n);
8015d465952Smartinh rc = writev(bt->fd, iov, n);
80236e226d9Smartinh if (rc != (ssize_t)bt->head.psize*n) {
8035d465952Smartinh if (rc > 0)
8045d465952Smartinh DPRINTF("short write, filesystem full?");
8055d465952Smartinh else
8065d465952Smartinh DPRINTF("writev: %s", strerror(errno));
8075d465952Smartinh btree_txn_abort(txn);
8085d465952Smartinh return BT_FAIL;
8095d465952Smartinh }
8105d465952Smartinh
8115d465952Smartinh /* Remove the dirty flag from the written pages.
8125d465952Smartinh */
8135d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
8145d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue);
8155d465952Smartinh mp->dirty = 0;
8165d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
8175d465952Smartinh if (--n == 0)
8185d465952Smartinh break;
8195d465952Smartinh }
8205d465952Smartinh } while (!done);
8215d465952Smartinh
8225d465952Smartinh if (btree_sync(bt) != 0 ||
823e14f8e24Smartinh btree_write_meta(bt, txn->root, 0) != BT_SUCCESS ||
8245d465952Smartinh btree_sync(bt) != 0) {
8255d465952Smartinh btree_txn_abort(txn);
8265d465952Smartinh return BT_FAIL;
8275d465952Smartinh }
8285d465952Smartinh
8295d465952Smartinh done:
8305d465952Smartinh mpage_prune(bt);
8315d465952Smartinh btree_txn_abort(txn);
8325d465952Smartinh
8335d465952Smartinh return BT_SUCCESS;
8345d465952Smartinh }
8355d465952Smartinh
8365d465952Smartinh static int
btree_write_header(struct btree * bt,int fd)83737ee45a1Smartinh btree_write_header(struct btree *bt, int fd)
83837ee45a1Smartinh {
83937ee45a1Smartinh struct stat sb;
84037ee45a1Smartinh struct bt_head *h;
84137ee45a1Smartinh struct page *p;
84237ee45a1Smartinh ssize_t rc;
84337ee45a1Smartinh unsigned int psize;
84437ee45a1Smartinh
84537ee45a1Smartinh DPRINTF("writing header page");
84637ee45a1Smartinh assert(bt != NULL);
84737ee45a1Smartinh
848*3572a0e3Ssthen /*
849*3572a0e3Ssthen * Ask stat for 'optimal blocksize for I/O', but cap to fit in indx_t.
85037ee45a1Smartinh */
85137ee45a1Smartinh if (fstat(fd, &sb) == 0)
852*3572a0e3Ssthen psize = MINIMUM(32*1024, sb.st_blksize);
85337ee45a1Smartinh else
85437ee45a1Smartinh psize = PAGESIZE;
85537ee45a1Smartinh
85637ee45a1Smartinh if ((p = calloc(1, psize)) == NULL)
85737ee45a1Smartinh return -1;
85837ee45a1Smartinh p->flags = P_HEAD;
85937ee45a1Smartinh
86037ee45a1Smartinh h = METADATA(p);
86137ee45a1Smartinh h->magic = BT_MAGIC;
86237ee45a1Smartinh h->version = BT_VERSION;
86337ee45a1Smartinh h->psize = psize;
86437ee45a1Smartinh bcopy(h, &bt->head, sizeof(*h));
86537ee45a1Smartinh
86637ee45a1Smartinh rc = write(fd, p, bt->head.psize);
86737ee45a1Smartinh free(p);
86836e226d9Smartinh if (rc != (ssize_t)bt->head.psize) {
86937ee45a1Smartinh if (rc > 0)
87037ee45a1Smartinh DPRINTF("short write, filesystem full?");
87137ee45a1Smartinh return BT_FAIL;
87237ee45a1Smartinh }
87337ee45a1Smartinh
87437ee45a1Smartinh return BT_SUCCESS;
87537ee45a1Smartinh }
87637ee45a1Smartinh
87737ee45a1Smartinh static int
btree_read_header(struct btree * bt)87837ee45a1Smartinh btree_read_header(struct btree *bt)
87937ee45a1Smartinh {
88037ee45a1Smartinh char page[PAGESIZE];
88137ee45a1Smartinh struct page *p;
88237ee45a1Smartinh struct bt_head *h;
88337ee45a1Smartinh int rc;
88437ee45a1Smartinh
88537ee45a1Smartinh assert(bt != NULL);
88637ee45a1Smartinh
88737ee45a1Smartinh /* We don't know the page size yet, so use a minimum value.
88837ee45a1Smartinh */
88937ee45a1Smartinh
89037ee45a1Smartinh if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) {
89137ee45a1Smartinh errno = ENOENT;
89237ee45a1Smartinh return -1;
89337ee45a1Smartinh } else if (rc != PAGESIZE) {
89437ee45a1Smartinh if (rc > 0)
89537ee45a1Smartinh errno = EINVAL;
89637ee45a1Smartinh DPRINTF("read: %s", strerror(errno));
89737ee45a1Smartinh return -1;
89837ee45a1Smartinh }
89937ee45a1Smartinh
90037ee45a1Smartinh p = (struct page *)page;
90137ee45a1Smartinh
90237ee45a1Smartinh if (!F_ISSET(p->flags, P_HEAD)) {
90337ee45a1Smartinh DPRINTF("page %d not a header page", p->pgno);
90437ee45a1Smartinh errno = EINVAL;
90537ee45a1Smartinh return -1;
90637ee45a1Smartinh }
90737ee45a1Smartinh
90837ee45a1Smartinh h = METADATA(p);
90937ee45a1Smartinh if (h->magic != BT_MAGIC) {
91037ee45a1Smartinh DPRINTF("header has invalid magic");
91137ee45a1Smartinh errno = EINVAL;
91237ee45a1Smartinh return -1;
91337ee45a1Smartinh }
91437ee45a1Smartinh
91537ee45a1Smartinh if (h->version != BT_VERSION) {
91637ee45a1Smartinh DPRINTF("database is version %u, expected version %u",
91737ee45a1Smartinh bt->head.version, BT_VERSION);
91837ee45a1Smartinh errno = EINVAL;
91937ee45a1Smartinh return -1;
92037ee45a1Smartinh }
92137ee45a1Smartinh
92237ee45a1Smartinh bcopy(h, &bt->head, sizeof(*h));
92337ee45a1Smartinh return 0;
92437ee45a1Smartinh }
92537ee45a1Smartinh
92637ee45a1Smartinh static int
btree_write_meta(struct btree * bt,pgno_t root,unsigned int flags)927e14f8e24Smartinh btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags)
9285d465952Smartinh {
92937ee45a1Smartinh struct mpage *mp;
93037ee45a1Smartinh struct bt_meta *meta;
9315d465952Smartinh ssize_t rc;
9325d465952Smartinh
9335d465952Smartinh DPRINTF("writing meta page for root page %u", root);
9345d465952Smartinh
9355d465952Smartinh assert(bt != NULL);
9365d465952Smartinh assert(bt->txn != NULL);
9375d465952Smartinh
93837ee45a1Smartinh if ((mp = btree_new_page(bt, P_META)) == NULL)
93937ee45a1Smartinh return -1;
9405d465952Smartinh
94137ee45a1Smartinh bt->meta.prev_meta = bt->meta.root;
94237ee45a1Smartinh bt->meta.root = root;
94337ee45a1Smartinh bt->meta.flags = flags;
94437ee45a1Smartinh bt->meta.created_at = time(0);
94537ee45a1Smartinh bt->meta.revisions++;
94637ee45a1Smartinh SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash);
9475d465952Smartinh
94837ee45a1Smartinh /* Copy the meta data changes to the new meta page. */
94937ee45a1Smartinh meta = METADATA(mp->page);
95037ee45a1Smartinh bcopy(&bt->meta, meta, sizeof(*meta));
95137ee45a1Smartinh
95236e226d9Smartinh rc = write(bt->fd, mp->page, bt->head.psize);
953b09f546cSmartinh mp->dirty = 0;
954b09f546cSmartinh SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);
95536e226d9Smartinh if (rc != (ssize_t)bt->head.psize) {
9565d465952Smartinh if (rc > 0)
9575d465952Smartinh DPRINTF("short write, filesystem full?");
9585d465952Smartinh return BT_FAIL;
9595d465952Smartinh }
9605d465952Smartinh
9615d465952Smartinh if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
9625d465952Smartinh DPRINTF("failed to update file size: %s", strerror(errno));
9635d465952Smartinh bt->size = 0;
9645d465952Smartinh }
9655d465952Smartinh
9665d465952Smartinh return BT_SUCCESS;
9675d465952Smartinh }
9685d465952Smartinh
9695d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise.
9705d465952Smartinh */
9715d465952Smartinh static int
btree_is_meta_page(struct page * p)9725d465952Smartinh btree_is_meta_page(struct page *p)
9735d465952Smartinh {
9745d465952Smartinh struct bt_meta *m;
9755d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH];
9765d465952Smartinh
9775d465952Smartinh m = METADATA(p);
9785d465952Smartinh if (!F_ISSET(p->flags, P_META)) {
9795d465952Smartinh DPRINTF("page %d not a meta page", p->pgno);
98037ee45a1Smartinh errno = EINVAL;
9815d465952Smartinh return 0;
9825d465952Smartinh }
9835d465952Smartinh
9845d465952Smartinh if (m->root >= p->pgno && m->root != P_INVALID) {
9855d465952Smartinh DPRINTF("page %d points to an invalid root page", p->pgno);
98637ee45a1Smartinh errno = EINVAL;
9875d465952Smartinh return 0;
9885d465952Smartinh }
9895d465952Smartinh
9905d465952Smartinh SHA1((unsigned char *)m, METAHASHLEN, hash);
9915d465952Smartinh if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) {
9925d465952Smartinh DPRINTF("page %d has an invalid digest", p->pgno);
99337ee45a1Smartinh errno = EINVAL;
9945d465952Smartinh return 0;
9955d465952Smartinh }
9965d465952Smartinh
9975d465952Smartinh return 1;
9985d465952Smartinh }
9995d465952Smartinh
10005d465952Smartinh static int
btree_read_meta(struct btree * bt,pgno_t * p_next)10015d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next)
10025d465952Smartinh {
100337ee45a1Smartinh struct mpage *mp;
100437ee45a1Smartinh struct bt_meta *meta;
10055d465952Smartinh pgno_t meta_pgno, next_pgno;
10065d465952Smartinh off_t size;
10075d465952Smartinh
10085d465952Smartinh assert(bt != NULL);
10095d465952Smartinh
10105d465952Smartinh if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
10115d465952Smartinh goto fail;
10125d465952Smartinh
10135d465952Smartinh DPRINTF("btree_read_meta: size = %llu", size);
10145d465952Smartinh
10155d465952Smartinh if (size < bt->size) {
10165d465952Smartinh DPRINTF("file has shrunk!");
10175d465952Smartinh errno = EIO;
10185d465952Smartinh goto fail;
10195d465952Smartinh }
10205d465952Smartinh
102137ee45a1Smartinh if (size == bt->head.psize) { /* there is only the header */
10225d465952Smartinh if (p_next != NULL)
102337ee45a1Smartinh *p_next = 1;
10240279e526Smartinh return BT_SUCCESS; /* new file */
10255d465952Smartinh }
10265d465952Smartinh
102737ee45a1Smartinh next_pgno = size / bt->head.psize;
10285d465952Smartinh if (next_pgno == 0) {
10295d465952Smartinh DPRINTF("corrupt file");
10305d465952Smartinh errno = EIO;
10315d465952Smartinh goto fail;
10325d465952Smartinh }
10335d465952Smartinh
10345d465952Smartinh meta_pgno = next_pgno - 1;
10355d465952Smartinh
103637ee45a1Smartinh if (size % bt->head.psize != 0) {
10375d465952Smartinh DPRINTF("filesize not a multiple of the page size!");
10385d465952Smartinh bt->flags |= BT_FIXPADDING;
10395d465952Smartinh next_pgno++;
10405d465952Smartinh }
10415d465952Smartinh
10425d465952Smartinh if (p_next != NULL)
10435d465952Smartinh *p_next = next_pgno;
10445d465952Smartinh
10455d465952Smartinh if (size == bt->size) {
10465d465952Smartinh DPRINTF("size unchanged, keeping current meta page");
104737ee45a1Smartinh if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
104862436105Smartinh DPRINTF("file is dead");
10490279e526Smartinh errno = ESTALE;
10500279e526Smartinh return BT_FAIL;
105162436105Smartinh } else
10525d465952Smartinh return BT_SUCCESS;
10535d465952Smartinh }
10545d465952Smartinh bt->size = size;
10555d465952Smartinh
10565d465952Smartinh while (meta_pgno > 0) {
105737ee45a1Smartinh if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)
105837ee45a1Smartinh break;
105937ee45a1Smartinh if (btree_is_meta_page(mp->page)) {
106037ee45a1Smartinh meta = METADATA(mp->page);
106137ee45a1Smartinh DPRINTF("flags = 0x%x", meta->flags);
106237ee45a1Smartinh if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
1063e14f8e24Smartinh DPRINTF("file is dead");
10640279e526Smartinh errno = ESTALE;
10650279e526Smartinh return BT_FAIL;
106637ee45a1Smartinh } else {
106737ee45a1Smartinh /* Make copy of last meta page. */
106837ee45a1Smartinh bcopy(meta, &bt->meta, sizeof(bt->meta));
10695d465952Smartinh return BT_SUCCESS;
10705d465952Smartinh }
107137ee45a1Smartinh }
10725d465952Smartinh --meta_pgno; /* scan backwards to first valid meta page */
10735d465952Smartinh }
10745d465952Smartinh
10755d465952Smartinh errno = EIO;
10765d465952Smartinh fail:
10775d465952Smartinh if (p_next != NULL)
10785d465952Smartinh *p_next = P_INVALID;
10795d465952Smartinh return BT_FAIL;
10805d465952Smartinh }
10815d465952Smartinh
10825d465952Smartinh struct btree *
btree_open_fd(int fd,unsigned int flags)1083176c8cbcSmartinh btree_open_fd(int fd, unsigned int flags)
10845d465952Smartinh {
10855d465952Smartinh struct btree *bt;
108637ee45a1Smartinh int fl;
10875d465952Smartinh
108883094010Skrw fl = fcntl(fd, F_GETFL);
10895d465952Smartinh if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1)
10905d465952Smartinh return NULL;
10915d465952Smartinh
10925d465952Smartinh if ((bt = calloc(1, sizeof(*bt))) == NULL)
10935d465952Smartinh return NULL;
10945d465952Smartinh bt->fd = fd;
10955d465952Smartinh bt->flags = flags;
10965d465952Smartinh bt->flags &= ~BT_FIXPADDING;
1097408be415Smartinh bt->ref = 1;
109837ee45a1Smartinh bt->meta.root = P_INVALID;
10995d465952Smartinh
11005d465952Smartinh if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL)
11015d465952Smartinh goto fail;
11025d465952Smartinh bt->stat.max_cache = BT_MAXCACHE_DEF;
11035d465952Smartinh RB_INIT(bt->page_cache);
11045d465952Smartinh
11055d465952Smartinh if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL)
11065d465952Smartinh goto fail;
11075d465952Smartinh TAILQ_INIT(bt->lru_queue);
11085d465952Smartinh
110937ee45a1Smartinh if (btree_read_header(bt) != 0) {
111037ee45a1Smartinh if (errno != ENOENT)
11115d465952Smartinh goto fail;
111237ee45a1Smartinh DPRINTF("new database");
111337ee45a1Smartinh btree_write_header(bt, bt->fd);
111437ee45a1Smartinh }
11155d465952Smartinh
11160279e526Smartinh if (btree_read_meta(bt, NULL) != 0)
11175d465952Smartinh goto fail;
11185d465952Smartinh
11195d465952Smartinh DPRINTF("opened database version %u, pagesize %u",
112037ee45a1Smartinh bt->head.version, bt->head.psize);
112137ee45a1Smartinh DPRINTF("timestamp: %s", ctime(&bt->meta.created_at));
112237ee45a1Smartinh DPRINTF("depth: %u", bt->meta.depth);
112337ee45a1Smartinh DPRINTF("entries: %llu", bt->meta.entries);
112437ee45a1Smartinh DPRINTF("revisions: %u", bt->meta.revisions);
112537ee45a1Smartinh DPRINTF("branch pages: %u", bt->meta.branch_pages);
112637ee45a1Smartinh DPRINTF("leaf pages: %u", bt->meta.leaf_pages);
112737ee45a1Smartinh DPRINTF("overflow pages: %u", bt->meta.overflow_pages);
112837ee45a1Smartinh DPRINTF("root: %u", bt->meta.root);
112937ee45a1Smartinh DPRINTF("previous meta page: %u", bt->meta.prev_meta);
11305d465952Smartinh
11315d465952Smartinh return bt;
11325d465952Smartinh
11335d465952Smartinh fail:
11345d465952Smartinh free(bt->lru_queue);
11355d465952Smartinh free(bt->page_cache);
11365d465952Smartinh free(bt);
11375d465952Smartinh return NULL;
11385d465952Smartinh }
11395d465952Smartinh
11405d465952Smartinh struct btree *
btree_open(const char * path,unsigned int flags,mode_t mode)1141176c8cbcSmartinh btree_open(const char *path, unsigned int flags, mode_t mode)
11425d465952Smartinh {
11435d465952Smartinh int fd, oflags;
11445d465952Smartinh struct btree *bt;
11455d465952Smartinh
11465d465952Smartinh if (F_ISSET(flags, BT_RDONLY))
11475d465952Smartinh oflags = O_RDONLY;
11485d465952Smartinh else
11495d465952Smartinh oflags = O_RDWR | O_CREAT | O_APPEND;
11505d465952Smartinh
11515d465952Smartinh if ((fd = open(path, oflags, mode)) == -1)
11525d465952Smartinh return NULL;
11535d465952Smartinh
11545d465952Smartinh if ((bt = btree_open_fd(fd, flags)) == NULL)
11555d465952Smartinh close(fd);
11565d465952Smartinh else {
11575d465952Smartinh bt->path = strdup(path);
11585d465952Smartinh DPRINTF("opened btree %p", bt);
11595d465952Smartinh }
11605d465952Smartinh
11615d465952Smartinh return bt;
11625d465952Smartinh }
11635d465952Smartinh
1164408be415Smartinh static void
btree_ref(struct btree * bt)1165408be415Smartinh btree_ref(struct btree *bt)
1166408be415Smartinh {
1167408be415Smartinh bt->ref++;
1168408be415Smartinh DPRINTF("ref is now %d on btree %p", bt->ref, bt);
1169408be415Smartinh }
1170408be415Smartinh
11715d465952Smartinh void
btree_close(struct btree * bt)11725d465952Smartinh btree_close(struct btree *bt)
11735d465952Smartinh {
1174e14f8e24Smartinh if (bt == NULL)
1175e14f8e24Smartinh return;
11765d465952Smartinh
1177e14f8e24Smartinh if (--bt->ref == 0) {
11785d465952Smartinh DPRINTF("ref is zero, closing btree %p", bt);
11795d465952Smartinh close(bt->fd);
1180e14f8e24Smartinh mpage_flush(bt);
1181e83131ceSjmatthew free(bt->lru_queue);
1182e83131ceSjmatthew free(bt->path);
11835d465952Smartinh free(bt->page_cache);
11845d465952Smartinh free(bt);
1185e14f8e24Smartinh } else
1186408be415Smartinh DPRINTF("ref is now %d on btree %p", bt->ref, bt);
11875d465952Smartinh }
11885d465952Smartinh
11895d465952Smartinh /* Search for key within a leaf page, using binary search.
11905d465952Smartinh * Returns the smallest entry larger or equal to the key.
11915d465952Smartinh * If exactp is non-null, stores whether the found entry was an exact match
11925d465952Smartinh * in *exactp (1 or 0).
11935d465952Smartinh * If kip is non-null, stores the index of the found entry in *kip.
11945d465952Smartinh * If no entry larger of equal to the key is found, returns NULL.
11955d465952Smartinh */
11965d465952Smartinh static struct node *
btree_search_node(struct btree * bt,struct mpage * mp,struct btval * key,int * exactp,unsigned int * kip)11975d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
11985d465952Smartinh int *exactp, unsigned int *kip)
11995d465952Smartinh {
12005d465952Smartinh unsigned int i = 0;
12015d465952Smartinh int low, high;
12025d465952Smartinh int rc = 0;
12035d465952Smartinh struct node *node;
12045d465952Smartinh struct btval nodekey;
12055d465952Smartinh
12065d465952Smartinh DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]",
12075d465952Smartinh NUMKEYS(mp),
12085d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch",
12095d465952Smartinh mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);
12105d465952Smartinh
12115d465952Smartinh assert(NUMKEYS(mp) > 0);
12125d465952Smartinh
121337f4b933Smmcc memset(&nodekey, 0, sizeof(nodekey));
12145d465952Smartinh
12155d465952Smartinh low = IS_LEAF(mp) ? 0 : 1;
12165d465952Smartinh high = NUMKEYS(mp) - 1;
12175d465952Smartinh while (low <= high) {
12185d465952Smartinh i = (low + high) >> 1;
12195d465952Smartinh node = NODEPTR(mp, i);
12205d465952Smartinh
12215d465952Smartinh nodekey.size = node->ksize;
12225d465952Smartinh nodekey.data = NODEKEY(node);
12235d465952Smartinh
12245d465952Smartinh if (bt->cmp)
12255d465952Smartinh rc = bt->cmp(key, &nodekey);
12265d465952Smartinh else
12275d465952Smartinh rc = bt_cmp(bt, key, &nodekey, &mp->prefix);
12285d465952Smartinh
12295d465952Smartinh if (IS_LEAF(mp))
123048318e0bSderaadt DPRINTF("found leaf index %u [%.*s], rc = %d",
12315d465952Smartinh i, (int)nodekey.size, (char *)nodekey.data, rc);
12325d465952Smartinh else
123348318e0bSderaadt DPRINTF("found branch index %u [%.*s -> %u], rc = %d",
12345d465952Smartinh i, (int)node->ksize, (char *)NODEKEY(node),
12355d465952Smartinh node->n_pgno, rc);
12365d465952Smartinh
12375d465952Smartinh if (rc == 0)
12385d465952Smartinh break;
12395d465952Smartinh if (rc > 0)
12405d465952Smartinh low = i + 1;
12415d465952Smartinh else
12425d465952Smartinh high = i - 1;
12435d465952Smartinh }
12445d465952Smartinh
12455d465952Smartinh if (rc > 0) { /* Found entry is less than the key. */
12465d465952Smartinh i++; /* Skip to get the smallest entry larger than key. */
12475d465952Smartinh if (i >= NUMKEYS(mp))
12485d465952Smartinh /* There is no entry larger or equal to the key. */
12495d465952Smartinh return NULL;
12505d465952Smartinh }
12515d465952Smartinh if (exactp)
12525d465952Smartinh *exactp = (rc == 0);
12535d465952Smartinh if (kip) /* Store the key index if requested. */
12545d465952Smartinh *kip = i;
12555d465952Smartinh
12565d465952Smartinh return NODEPTR(mp, i);
12575d465952Smartinh }
12585d465952Smartinh
12595d465952Smartinh static void
cursor_pop_page(struct cursor * cursor)12605d465952Smartinh cursor_pop_page(struct cursor *cursor)
12615d465952Smartinh {
12625d465952Smartinh struct ppage *top;
12635d465952Smartinh
12645d465952Smartinh top = CURSOR_TOP(cursor);
12655d465952Smartinh CURSOR_POP(cursor);
12665d465952Smartinh top->mpage->ref--;
12675d465952Smartinh
12685d465952Smartinh DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);
12695d465952Smartinh
12705d465952Smartinh free(top);
12715d465952Smartinh }
12725d465952Smartinh
12735d465952Smartinh static struct ppage *
cursor_push_page(struct cursor * cursor,struct mpage * mp)12745d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp)
12755d465952Smartinh {
12765d465952Smartinh struct ppage *ppage;
12775d465952Smartinh
12785d465952Smartinh DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);
12795d465952Smartinh
12805d465952Smartinh if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
12815d465952Smartinh return NULL;
12825d465952Smartinh ppage->mpage = mp;
12835d465952Smartinh mp->ref++;
12845d465952Smartinh CURSOR_PUSH(cursor, ppage);
12855d465952Smartinh return ppage;
12865d465952Smartinh }
12875d465952Smartinh
128837ee45a1Smartinh static struct mpage *
btree_get_mpage(struct btree * bt,pgno_t pgno)128937ee45a1Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno)
12905d465952Smartinh {
12915d465952Smartinh struct mpage *mp;
12925d465952Smartinh
12935d465952Smartinh mp = mpage_lookup(bt, pgno);
12945d465952Smartinh if (mp == NULL) {
12955d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL)
129637ee45a1Smartinh return NULL;
129737ee45a1Smartinh if ((mp->page = malloc(bt->head.psize)) == NULL) {
12985d465952Smartinh free(mp);
129937ee45a1Smartinh return NULL;
130037ee45a1Smartinh }
13011c05783eSmartinh if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) {
130237ee45a1Smartinh mpage_free(mp);
130337ee45a1Smartinh return NULL;
13045d465952Smartinh }
13055d465952Smartinh mp->pgno = pgno;
13065d465952Smartinh mpage_add(bt, mp);
13075d465952Smartinh } else
13085d465952Smartinh DPRINTF("returning page %u from cache", pgno);
13095d465952Smartinh
131037ee45a1Smartinh return mp;
13115d465952Smartinh }
13125d465952Smartinh
13135d465952Smartinh static void
concat_prefix(struct btree * bt,char * s1,size_t n1,char * s2,size_t n2,char * cs,size_t * cn)13145d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
13155d465952Smartinh char *cs, size_t *cn)
13165d465952Smartinh {
13175d465952Smartinh assert(*cn >= n1 + n2);
13185d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
13195d465952Smartinh bcopy(s2, cs, n2);
13205d465952Smartinh bcopy(s1, cs + n2, n1);
13215d465952Smartinh } else {
13225d465952Smartinh bcopy(s1, cs, n1);
13235d465952Smartinh bcopy(s2, cs + n1, n2);
13245d465952Smartinh }
13255d465952Smartinh *cn = n1 + n2;
13265d465952Smartinh }
13275d465952Smartinh
13285d465952Smartinh static void
find_common_prefix(struct btree * bt,struct mpage * mp)13295d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp)
13305d465952Smartinh {
13315d465952Smartinh indx_t lbound = 0, ubound = 0;
13325d465952Smartinh struct mpage *lp, *up;
13335d465952Smartinh struct btkey lprefix, uprefix;
13345d465952Smartinh
13355d465952Smartinh mp->prefix.len = 0;
13365d465952Smartinh if (bt->cmp != NULL)
13375d465952Smartinh return;
13385d465952Smartinh
13395d465952Smartinh lp = mp;
13405d465952Smartinh while (lp->parent != NULL) {
13415d465952Smartinh if (lp->parent_index > 0) {
13425d465952Smartinh lbound = lp->parent_index;
13435d465952Smartinh break;
13445d465952Smartinh }
13455d465952Smartinh lp = lp->parent;
13465d465952Smartinh }
13475d465952Smartinh
13485d465952Smartinh up = mp;
13495d465952Smartinh while (up->parent != NULL) {
13505d465952Smartinh if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
13515d465952Smartinh ubound = up->parent_index + 1;
13525d465952Smartinh break;
13535d465952Smartinh }
13545d465952Smartinh up = up->parent;
13555d465952Smartinh }
13565d465952Smartinh
13575d465952Smartinh if (lp->parent != NULL && up->parent != NULL) {
13585d465952Smartinh expand_prefix(bt, lp->parent, lbound, &lprefix);
13595d465952Smartinh expand_prefix(bt, up->parent, ubound, &uprefix);
13605d465952Smartinh common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
13615d465952Smartinh }
13625d465952Smartinh else if (mp->parent)
13635d465952Smartinh bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix));
13645d465952Smartinh
13655d465952Smartinh DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
13665d465952Smartinh (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
13675d465952Smartinh }
13685d465952Smartinh
13695d465952Smartinh static int
btree_search_page_root(struct btree * bt,struct mpage * root,struct btval * key,struct cursor * cursor,int modify,struct mpage ** mpp)13705d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
13715d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp)
13725d465952Smartinh {
13735d465952Smartinh struct mpage *mp, *parent;
13745d465952Smartinh
13755d465952Smartinh if (cursor && cursor_push_page(cursor, root) == NULL)
13765d465952Smartinh return BT_FAIL;
13775d465952Smartinh
13785d465952Smartinh mp = root;
13795d465952Smartinh while (IS_BRANCH(mp)) {
13805d465952Smartinh unsigned int i = 0;
13815d465952Smartinh struct node *node;
13825d465952Smartinh
13835d465952Smartinh DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
13845d465952Smartinh assert(NUMKEYS(mp) > 1);
13851c05783eSmartinh DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0)));
13865d465952Smartinh
13875d465952Smartinh if (key == NULL) /* Initialize cursor to first page. */
13885d465952Smartinh i = 0;
13895d465952Smartinh else {
13905d465952Smartinh int exact;
13915d465952Smartinh node = btree_search_node(bt, mp, key, &exact, &i);
13925d465952Smartinh if (node == NULL)
13935d465952Smartinh i = NUMKEYS(mp) - 1;
13945d465952Smartinh else if (!exact) {
13955d465952Smartinh assert(i > 0);
13965d465952Smartinh i--;
13975d465952Smartinh }
13985d465952Smartinh }
13995d465952Smartinh
14005d465952Smartinh if (key)
14015d465952Smartinh DPRINTF("following index %u for key %.*s",
14025d465952Smartinh i, (int)key->size, (char *)key->data);
14035d465952Smartinh assert(i >= 0 && i < NUMKEYS(mp));
14045d465952Smartinh node = NODEPTR(mp, i);
14055d465952Smartinh
14065d465952Smartinh if (cursor)
14075d465952Smartinh CURSOR_TOP(cursor)->ki = i;
14085d465952Smartinh
14095d465952Smartinh parent = mp;
141037ee45a1Smartinh if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
14115d465952Smartinh return BT_FAIL;
14125d465952Smartinh mp->parent = parent;
14135d465952Smartinh mp->parent_index = i;
14145d465952Smartinh find_common_prefix(bt, mp);
14155d465952Smartinh
14165d465952Smartinh if (cursor && cursor_push_page(cursor, mp) == NULL)
14175d465952Smartinh return BT_FAIL;
14185d465952Smartinh
14197f8a466aSmartinh if (modify && (mp = mpage_touch(bt, mp)) == NULL)
14207f8a466aSmartinh return BT_FAIL;
14215d465952Smartinh }
14225d465952Smartinh
14235d465952Smartinh if (!IS_LEAF(mp)) {
14245d465952Smartinh DPRINTF("internal error, index points to a %02X page!?",
142537ee45a1Smartinh mp->page->flags);
14265d465952Smartinh return BT_FAIL;
14275d465952Smartinh }
14285d465952Smartinh
14295d465952Smartinh DPRINTF("found leaf page %u for key %.*s", mp->pgno,
14305d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL);
14315d465952Smartinh
14325d465952Smartinh *mpp = mp;
14335d465952Smartinh return BT_SUCCESS;
14345d465952Smartinh }
14355d465952Smartinh
14365d465952Smartinh /* Search for the page a given key should be in.
14375d465952Smartinh * Stores a pointer to the found page in *mpp.
14385d465952Smartinh * If key is NULL, search for the lowest page (used by btree_cursor_first).
14395d465952Smartinh * If cursor is non-null, pushes parent pages on the cursor stack.
14405d465952Smartinh * If modify is true, visited pages are updated with new page numbers.
14415d465952Smartinh */
14425d465952Smartinh static int
btree_search_page(struct btree * bt,struct btree_txn * txn,struct btval * key,struct cursor * cursor,int modify,struct mpage ** mpp)14435d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
14445d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp)
14455d465952Smartinh {
14465d465952Smartinh int rc;
14475d465952Smartinh pgno_t root;
14485d465952Smartinh struct mpage *mp;
14495d465952Smartinh
14505d465952Smartinh /* Can't modify pages outside a transaction. */
14515d465952Smartinh if (txn == NULL && modify) {
14525d465952Smartinh errno = EINVAL;
14535d465952Smartinh return BT_FAIL;
14545d465952Smartinh }
14555d465952Smartinh
14565d465952Smartinh /* Choose which root page to start with. If a transaction is given
14575d465952Smartinh * use the root page from the transaction, otherwise read the last
14585d465952Smartinh * committed root page.
14595d465952Smartinh */
14605d465952Smartinh if (txn == NULL) {
14615d465952Smartinh if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)
14625d465952Smartinh return rc;
146337ee45a1Smartinh root = bt->meta.root;
14645d465952Smartinh } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
14655d465952Smartinh DPRINTF("transaction has failed, must abort");
14660279e526Smartinh errno = EINVAL;
14675d465952Smartinh return BT_FAIL;
14685d465952Smartinh } else
14695d465952Smartinh root = txn->root;
14705d465952Smartinh
14710279e526Smartinh if (root == P_INVALID) { /* Tree is empty. */
147237ee45a1Smartinh DPRINTF("tree is empty");
14730279e526Smartinh errno = ENOENT;
14740279e526Smartinh return BT_FAIL;
14750279e526Smartinh }
14765d465952Smartinh
147737ee45a1Smartinh if ((mp = btree_get_mpage(bt, root)) == NULL)
147837ee45a1Smartinh return BT_FAIL;
147937ee45a1Smartinh
148037ee45a1Smartinh DPRINTF("root page has flags 0x%X", mp->page->flags);
14815d465952Smartinh
14825d465952Smartinh assert(mp->parent == NULL);
14835d465952Smartinh assert(mp->prefix.len == 0);
14845d465952Smartinh
14855d465952Smartinh if (modify && !mp->dirty) {
14867f8a466aSmartinh if ((mp = mpage_touch(bt, mp)) == NULL)
14877f8a466aSmartinh return BT_FAIL;
14885d465952Smartinh txn->root = mp->pgno;
14895d465952Smartinh }
14905d465952Smartinh
14915d465952Smartinh return btree_search_page_root(bt, mp, key, cursor, modify, mpp);
14925d465952Smartinh }
14935d465952Smartinh
14945d465952Smartinh static int
btree_read_data(struct btree * bt,struct mpage * mp,struct node * leaf,struct btval * data)14955d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
14965d465952Smartinh struct btval *data)
14975d465952Smartinh {
149837ee45a1Smartinh struct mpage *omp; /* overflow mpage */
14995d465952Smartinh size_t psz;
15005d465952Smartinh size_t max;
15015d465952Smartinh size_t sz = 0;
15025d465952Smartinh pgno_t pgno;
15035d465952Smartinh
150437f4b933Smmcc memset(data, 0, sizeof(*data));
150537ee45a1Smartinh max = bt->head.psize - PAGEHDRSZ;
15065d465952Smartinh
15075d465952Smartinh if (!F_ISSET(leaf->flags, F_BIGDATA)) {
15085d465952Smartinh data->size = leaf->n_dsize;
15095d465952Smartinh if (data->size > 0) {
15105d465952Smartinh if (mp == NULL) {
15115d465952Smartinh if ((data->data = malloc(data->size)) == NULL)
15125d465952Smartinh return BT_FAIL;
15135d465952Smartinh bcopy(NODEDATA(leaf), data->data, data->size);
15145d465952Smartinh data->free_data = 1;
15155d465952Smartinh data->mp = NULL;
15165d465952Smartinh } else {
15175d465952Smartinh data->data = NODEDATA(leaf);
15185d465952Smartinh data->free_data = 0;
15195d465952Smartinh data->mp = mp;
15205d465952Smartinh mp->ref++;
15215d465952Smartinh }
15225d465952Smartinh }
15235d465952Smartinh return BT_SUCCESS;
15245d465952Smartinh }
15255d465952Smartinh
15265d465952Smartinh /* Read overflow data.
15275d465952Smartinh */
15285d465952Smartinh DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
15295d465952Smartinh if ((data->data = malloc(leaf->n_dsize)) == NULL)
15305d465952Smartinh return BT_FAIL;
15315d465952Smartinh data->size = leaf->n_dsize;
15325d465952Smartinh data->free_data = 1;
15335d465952Smartinh data->mp = NULL;
153445ac9d85Smartinh bcopy(NODEDATA(leaf), &pgno, sizeof(pgno));
15355d465952Smartinh for (sz = 0; sz < data->size; ) {
153637ee45a1Smartinh if ((omp = btree_get_mpage(bt, pgno)) == NULL ||
153737ee45a1Smartinh !F_ISSET(omp->page->flags, P_OVERFLOW)) {
15382c6c9175Smartinh DPRINTF("read overflow page %u failed", pgno);
15395d465952Smartinh free(data->data);
154037ee45a1Smartinh mpage_free(omp);
15415d465952Smartinh return BT_FAIL;
15425d465952Smartinh }
15435d465952Smartinh psz = data->size - sz;
15445d465952Smartinh if (psz > max)
15455d465952Smartinh psz = max;
154637ee45a1Smartinh bcopy(omp->page->ptrs, (char *)data->data + sz, psz);
15475d465952Smartinh sz += psz;
154837ee45a1Smartinh pgno = omp->page->p_next_pgno;
15495d465952Smartinh }
15505d465952Smartinh
15515d465952Smartinh return BT_SUCCESS;
15525d465952Smartinh }
15535d465952Smartinh
15545d465952Smartinh int
btree_txn_get(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data)15555d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn,
15565d465952Smartinh struct btval *key, struct btval *data)
15575d465952Smartinh {
15585d465952Smartinh int rc, exact;
15595d465952Smartinh struct node *leaf;
15605d465952Smartinh struct mpage *mp;
15615d465952Smartinh
15625d465952Smartinh assert(key);
15635d465952Smartinh assert(data);
15645d465952Smartinh DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);
15655d465952Smartinh
156659916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) {
156759916c99Smartinh errno = EINVAL;
156859916c99Smartinh return BT_FAIL;
156959916c99Smartinh }
157059916c99Smartinh
157159916c99Smartinh if (bt == NULL) {
157259916c99Smartinh if (txn == NULL) {
157359916c99Smartinh errno = EINVAL;
157459916c99Smartinh return BT_FAIL;
157559916c99Smartinh }
157659916c99Smartinh bt = txn->bt;
157759916c99Smartinh }
157859916c99Smartinh
15795d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) {
15805d465952Smartinh errno = EINVAL;
15815d465952Smartinh return BT_FAIL;
15825d465952Smartinh }
15835d465952Smartinh
15845d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)
15855d465952Smartinh return rc;
15865d465952Smartinh
15875d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, NULL);
15885d465952Smartinh if (leaf && exact)
15895d465952Smartinh rc = btree_read_data(bt, mp, leaf, data);
15900279e526Smartinh else {
15910279e526Smartinh errno = ENOENT;
15920279e526Smartinh rc = BT_FAIL;
15930279e526Smartinh }
15945d465952Smartinh
15955d465952Smartinh mpage_prune(bt);
15965d465952Smartinh return rc;
15975d465952Smartinh }
15985d465952Smartinh
15995d465952Smartinh static int
btree_sibling(struct cursor * cursor,int move_right)16005d465952Smartinh btree_sibling(struct cursor *cursor, int move_right)
16015d465952Smartinh {
16025d465952Smartinh int rc;
16035d465952Smartinh struct node *indx;
16045d465952Smartinh struct ppage *parent, *top;
16055d465952Smartinh struct mpage *mp;
16065d465952Smartinh
16075d465952Smartinh top = CURSOR_TOP(cursor);
16080279e526Smartinh if ((parent = SLIST_NEXT(top, entry)) == NULL) {
16090279e526Smartinh errno = ENOENT;
16100279e526Smartinh return BT_FAIL; /* root has no siblings */
16110279e526Smartinh }
16125d465952Smartinh
16135d465952Smartinh DPRINTF("parent page is page %u, index %u",
16145d465952Smartinh parent->mpage->pgno, parent->ki);
16155d465952Smartinh
16165d465952Smartinh cursor_pop_page(cursor);
16175d465952Smartinh if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
16185d465952Smartinh : (parent->ki == 0)) {
16195d465952Smartinh DPRINTF("no more keys left, moving to %s sibling",
16205d465952Smartinh move_right ? "right" : "left");
16215d465952Smartinh if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)
16225d465952Smartinh return rc;
16235d465952Smartinh parent = CURSOR_TOP(cursor);
16245d465952Smartinh } else {
16255d465952Smartinh if (move_right)
16265d465952Smartinh parent->ki++;
16275d465952Smartinh else
16285d465952Smartinh parent->ki--;
16295d465952Smartinh DPRINTF("just moving to %s index key %u",
16305d465952Smartinh move_right ? "right" : "left", parent->ki);
16315d465952Smartinh }
16325d465952Smartinh assert(IS_BRANCH(parent->mpage));
16335d465952Smartinh
16345d465952Smartinh indx = NODEPTR(parent->mpage, parent->ki);
163537ee45a1Smartinh if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL)
16365d465952Smartinh return BT_FAIL;
16375d465952Smartinh mp->parent = parent->mpage;
16385d465952Smartinh mp->parent_index = parent->ki;
16395d465952Smartinh
16405d465952Smartinh cursor_push_page(cursor, mp);
16415d465952Smartinh find_common_prefix(cursor->bt, mp);
16425d465952Smartinh
16435d465952Smartinh return BT_SUCCESS;
16445d465952Smartinh }
16455d465952Smartinh
16465d465952Smartinh static int
bt_set_key(struct btree * bt,struct mpage * mp,struct node * node,struct btval * key)16475d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node,
16485d465952Smartinh struct btval *key)
16495d465952Smartinh {
16505d465952Smartinh if (key == NULL)
16515d465952Smartinh return 0;
16525d465952Smartinh
16535d465952Smartinh if (mp->prefix.len > 0) {
16545d465952Smartinh key->size = node->ksize + mp->prefix.len;
16555d465952Smartinh key->data = malloc(key->size);
16565d465952Smartinh if (key->data == NULL)
16575d465952Smartinh return -1;
16585d465952Smartinh concat_prefix(bt,
16595d465952Smartinh mp->prefix.str, mp->prefix.len,
16605d465952Smartinh NODEKEY(node), node->ksize,
16615d465952Smartinh key->data, &key->size);
16625d465952Smartinh key->free_data = 1;
16635d465952Smartinh } else {
16645d465952Smartinh key->size = node->ksize;
16655d465952Smartinh key->data = NODEKEY(node);
16665d465952Smartinh key->free_data = 0;
16675d465952Smartinh key->mp = mp;
16685d465952Smartinh mp->ref++;
16695d465952Smartinh }
16705d465952Smartinh
16715d465952Smartinh return 0;
16725d465952Smartinh }
16735d465952Smartinh
16745d465952Smartinh static int
btree_cursor_next(struct cursor * cursor,struct btval * key,struct btval * data)16755d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data)
16765d465952Smartinh {
16775d465952Smartinh struct ppage *top;
16785d465952Smartinh struct mpage *mp;
16795d465952Smartinh struct node *leaf;
16805d465952Smartinh
16810279e526Smartinh if (cursor->eof) {
16820279e526Smartinh errno = ENOENT;
16830279e526Smartinh return BT_FAIL;
16840279e526Smartinh }
16855d465952Smartinh
16865d465952Smartinh assert(cursor->initialized);
16875d465952Smartinh
16885d465952Smartinh top = CURSOR_TOP(cursor);
16895d465952Smartinh mp = top->mpage;
16905d465952Smartinh
16915d465952Smartinh DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor);
16925d465952Smartinh
16935d465952Smartinh if (top->ki + 1 >= NUMKEYS(mp)) {
16945d465952Smartinh DPRINTF("=====> move to next sibling page");
16955d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS) {
16965d465952Smartinh cursor->eof = 1;
16970279e526Smartinh return BT_FAIL;
16985d465952Smartinh }
16995d465952Smartinh top = CURSOR_TOP(cursor);
17005d465952Smartinh mp = top->mpage;
17015d465952Smartinh DPRINTF("next page is %u, key index %u", mp->pgno, top->ki);
17025d465952Smartinh } else
17035d465952Smartinh top->ki++;
17045d465952Smartinh
17055d465952Smartinh DPRINTF("==> cursor points to page %u with %lu keys, key index %u",
17065d465952Smartinh mp->pgno, NUMKEYS(mp), top->ki);
17075d465952Smartinh
17085d465952Smartinh assert(IS_LEAF(mp));
17095d465952Smartinh leaf = NODEPTR(mp, top->ki);
17105d465952Smartinh
17115d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17125d465952Smartinh return BT_FAIL;
17135d465952Smartinh
17145d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17155d465952Smartinh return BT_FAIL;
17165d465952Smartinh
17175d465952Smartinh return BT_SUCCESS;
17185d465952Smartinh }
17195d465952Smartinh
17205d465952Smartinh static int
btree_cursor_set(struct cursor * cursor,struct btval * key,struct btval * data,int * exactp)17215cea9f8fSmartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data,
17225cea9f8fSmartinh int *exactp)
17235d465952Smartinh {
17245d465952Smartinh int rc;
17255d465952Smartinh struct node *leaf;
17265d465952Smartinh struct mpage *mp;
17275d465952Smartinh struct ppage *top;
17285d465952Smartinh
17295d465952Smartinh assert(cursor);
17305d465952Smartinh assert(key);
17315d465952Smartinh assert(key->size > 0);
17325d465952Smartinh
17335d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);
17345d465952Smartinh if (rc != BT_SUCCESS)
17355d465952Smartinh return rc;
17365d465952Smartinh assert(IS_LEAF(mp));
17375d465952Smartinh
17385d465952Smartinh top = CURSOR_TOP(cursor);
17395cea9f8fSmartinh leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki);
17405cea9f8fSmartinh if (exactp != NULL && !*exactp) {
17415cea9f8fSmartinh /* BT_CURSOR_EXACT specified and not an exact match. */
17425cea9f8fSmartinh errno = ENOENT;
17435cea9f8fSmartinh return BT_FAIL;
17445cea9f8fSmartinh }
17455cea9f8fSmartinh
17465d465952Smartinh if (leaf == NULL) {
17475d465952Smartinh DPRINTF("===> inexact leaf not found, goto sibling");
17485d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS)
17490279e526Smartinh return BT_FAIL; /* no entries matched */
17505d465952Smartinh top = CURSOR_TOP(cursor);
17515d465952Smartinh top->ki = 0;
17525d465952Smartinh mp = top->mpage;
17535d465952Smartinh assert(IS_LEAF(mp));
17545d465952Smartinh leaf = NODEPTR(mp, 0);
17555d465952Smartinh }
17565d465952Smartinh
17575d465952Smartinh cursor->initialized = 1;
17585d465952Smartinh cursor->eof = 0;
17595d465952Smartinh
17605d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17615d465952Smartinh return BT_FAIL;
17625d465952Smartinh
17635d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17645d465952Smartinh return BT_FAIL;
17655d465952Smartinh DPRINTF("==> cursor placed on key %.*s",
17665d465952Smartinh (int)key->size, (char *)key->data);
17675d465952Smartinh
17685d465952Smartinh return BT_SUCCESS;
17695d465952Smartinh }
17705d465952Smartinh
17715d465952Smartinh static int
btree_cursor_first(struct cursor * cursor,struct btval * key,struct btval * data)17725d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
17735d465952Smartinh {
17745d465952Smartinh int rc;
17755d465952Smartinh struct mpage *mp;
17765d465952Smartinh struct node *leaf;
17775d465952Smartinh
17785d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
17795d465952Smartinh if (rc != BT_SUCCESS)
17805d465952Smartinh return rc;
17815d465952Smartinh assert(IS_LEAF(mp));
17825d465952Smartinh
17835d465952Smartinh leaf = NODEPTR(mp, 0);
17845d465952Smartinh cursor->initialized = 1;
17855d465952Smartinh cursor->eof = 0;
17865d465952Smartinh
17875d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
17885d465952Smartinh return BT_FAIL;
17895d465952Smartinh
17905d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
17915d465952Smartinh return BT_FAIL;
17925d465952Smartinh
17935d465952Smartinh return BT_SUCCESS;
17945d465952Smartinh }
17955d465952Smartinh
17965d465952Smartinh int
btree_cursor_get(struct cursor * cursor,struct btval * key,struct btval * data,enum cursor_op op)17975d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
17985d465952Smartinh enum cursor_op op)
17995d465952Smartinh {
18005d465952Smartinh int rc;
18015cea9f8fSmartinh int exact = 0;
18025d465952Smartinh
18035d465952Smartinh assert(cursor);
18045d465952Smartinh
18055d465952Smartinh switch (op) {
18065d465952Smartinh case BT_CURSOR:
18075cea9f8fSmartinh case BT_CURSOR_EXACT:
18085d465952Smartinh while (CURSOR_TOP(cursor) != NULL)
18095d465952Smartinh cursor_pop_page(cursor);
18105d465952Smartinh if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
18115d465952Smartinh errno = EINVAL;
18125d465952Smartinh rc = BT_FAIL;
18135cea9f8fSmartinh } else if (op == BT_CURSOR_EXACT)
18145cea9f8fSmartinh rc = btree_cursor_set(cursor, key, data, &exact);
18155cea9f8fSmartinh else
18165cea9f8fSmartinh rc = btree_cursor_set(cursor, key, data, NULL);
18175d465952Smartinh break;
18185d465952Smartinh case BT_NEXT:
18195d465952Smartinh if (!cursor->initialized)
18205d465952Smartinh rc = btree_cursor_first(cursor, key, data);
18215d465952Smartinh else
18225d465952Smartinh rc = btree_cursor_next(cursor, key, data);
18235d465952Smartinh break;
18245d465952Smartinh case BT_FIRST:
18255d465952Smartinh while (CURSOR_TOP(cursor) != NULL)
18265d465952Smartinh cursor_pop_page(cursor);
18275d465952Smartinh rc = btree_cursor_first(cursor, key, data);
18285d465952Smartinh break;
18295d465952Smartinh default:
18305d465952Smartinh DPRINTF("unhandled/unimplemented cursor operation %u", op);
18315d465952Smartinh rc = BT_FAIL;
18325d465952Smartinh break;
18335d465952Smartinh }
18345d465952Smartinh
18355d465952Smartinh mpage_prune(cursor->bt);
18365d465952Smartinh
18375d465952Smartinh return rc;
18385d465952Smartinh }
18395d465952Smartinh
18405d465952Smartinh static struct mpage *
btree_new_page(struct btree * bt,uint32_t flags)18415d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags)
18425d465952Smartinh {
18435d465952Smartinh struct mpage *mp;
18445d465952Smartinh
18455d465952Smartinh assert(bt != NULL);
18465d465952Smartinh assert(bt->txn != NULL);
18475d465952Smartinh
184837ee45a1Smartinh DPRINTF("allocating new mpage %u, page size %u",
184937ee45a1Smartinh bt->txn->next_pgno, bt->head.psize);
18505d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL)
18515d465952Smartinh return NULL;
185237ee45a1Smartinh if ((mp->page = malloc(bt->head.psize)) == NULL) {
185337ee45a1Smartinh free(mp);
185437ee45a1Smartinh return NULL;
185537ee45a1Smartinh }
185637ee45a1Smartinh mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
185737ee45a1Smartinh mp->page->flags = flags;
185837ee45a1Smartinh mp->page->lower = PAGEHDRSZ;
185937ee45a1Smartinh mp->page->upper = bt->head.psize;
18605d465952Smartinh
18615d465952Smartinh if (IS_BRANCH(mp))
186237ee45a1Smartinh bt->meta.branch_pages++;
18635d465952Smartinh else if (IS_LEAF(mp))
186437ee45a1Smartinh bt->meta.leaf_pages++;
18655d465952Smartinh else if (IS_OVERFLOW(mp))
186637ee45a1Smartinh bt->meta.overflow_pages++;
18675d465952Smartinh
18685d465952Smartinh mpage_add(bt, mp);
18695d465952Smartinh mpage_dirty(bt, mp);
18705d465952Smartinh
18715d465952Smartinh return mp;
18725d465952Smartinh }
18735d465952Smartinh
18745d465952Smartinh static size_t
bt_leaf_size(struct btree * bt,struct btval * key,struct btval * data)187537ee45a1Smartinh bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data)
18765d465952Smartinh {
18775d465952Smartinh size_t sz;
18785d465952Smartinh
18795d465952Smartinh sz = LEAFSIZE(key, data);
188037ee45a1Smartinh if (data->size >= bt->head.psize / BT_MINKEYS) {
18815d465952Smartinh /* put on overflow page */
18825d465952Smartinh sz -= data->size - sizeof(pgno_t);
18835d465952Smartinh }
18845d465952Smartinh
18855d465952Smartinh return sz + sizeof(indx_t);
18865d465952Smartinh }
18875d465952Smartinh
18885d465952Smartinh static size_t
bt_branch_size(struct btree * bt,struct btval * key)188937ee45a1Smartinh bt_branch_size(struct btree *bt, struct btval *key)
18905d465952Smartinh {
18915d465952Smartinh size_t sz;
18925d465952Smartinh
18935d465952Smartinh sz = INDXSIZE(key);
189437ee45a1Smartinh if (sz >= bt->head.psize / BT_MINKEYS) {
18955d465952Smartinh /* put on overflow page */
18965d465952Smartinh /* not implemented */
18975d465952Smartinh /* sz -= key->size - sizeof(pgno_t); */
18985d465952Smartinh }
18995d465952Smartinh
19005d465952Smartinh return sz + sizeof(indx_t);
19015d465952Smartinh }
19025d465952Smartinh
19035d465952Smartinh static int
btree_write_overflow_data(struct btree * bt,struct page * p,struct btval * data)19045d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
19055d465952Smartinh {
19065d465952Smartinh size_t done = 0;
19075d465952Smartinh size_t sz;
19085d465952Smartinh size_t max;
19095d465952Smartinh pgno_t *linkp; /* linked page stored here */
19105d465952Smartinh struct mpage *next = NULL;
19115d465952Smartinh
191237ee45a1Smartinh max = bt->head.psize - PAGEHDRSZ;
19135d465952Smartinh
19145d465952Smartinh while (done < data->size) {
19152c6c9175Smartinh if (next != NULL)
19162c6c9175Smartinh p = next->page;
19175d465952Smartinh linkp = &p->p_next_pgno;
19185d465952Smartinh if (data->size - done > max) {
19195d465952Smartinh /* need another overflow page */
19205d465952Smartinh if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)
19215d465952Smartinh return BT_FAIL;
19225d465952Smartinh *linkp = next->pgno;
19235d465952Smartinh DPRINTF("linking overflow page %u", next->pgno);
19245d465952Smartinh } else
19255d465952Smartinh *linkp = 0; /* indicates end of list */
19265d465952Smartinh sz = data->size - done;
19275d465952Smartinh if (sz > max)
19285d465952Smartinh sz = max;
19295d465952Smartinh DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno);
19305d465952Smartinh bcopy((char *)data->data + done, p->ptrs, sz);
19315d465952Smartinh done += sz;
19325d465952Smartinh }
19335d465952Smartinh
19345d465952Smartinh return BT_SUCCESS;
19355d465952Smartinh }
19365d465952Smartinh
19375d465952Smartinh /* Key prefix should already be stripped.
19385d465952Smartinh */
19395d465952Smartinh static int
btree_add_node(struct btree * bt,struct mpage * mp,indx_t indx,struct btval * key,struct btval * data,pgno_t pgno,uint8_t flags)19405d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
19415d465952Smartinh struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
19425d465952Smartinh {
19435d465952Smartinh unsigned int i;
19445d465952Smartinh size_t node_size = NODESIZE;
19455d465952Smartinh indx_t ofs;
19465d465952Smartinh struct node *node;
19475d465952Smartinh struct page *p;
19485d465952Smartinh struct mpage *ofp = NULL; /* overflow page */
19495d465952Smartinh
195037ee45a1Smartinh p = mp->page;
19515d465952Smartinh assert(p->upper >= p->lower);
19525d465952Smartinh
195348318e0bSderaadt DPRINTF("add node [%.*s] to %s page %u at index %d, key size %zu",
19545d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL,
19555d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch",
195637ee45a1Smartinh mp->pgno, indx, key ? key->size : 0);
19575d465952Smartinh
19585d465952Smartinh if (key != NULL)
19595d465952Smartinh node_size += key->size;
19605d465952Smartinh
19615d465952Smartinh if (IS_LEAF(mp)) {
19625d465952Smartinh assert(data);
19635d465952Smartinh node_size += data->size;
19645d465952Smartinh if (F_ISSET(flags, F_BIGDATA)) {
19655d465952Smartinh /* Data already on overflow page. */
19665d465952Smartinh node_size -= data->size - sizeof(pgno_t);
196737ee45a1Smartinh } else if (data->size >= bt->head.psize / BT_MINKEYS) {
19685d465952Smartinh /* Put data on overflow page. */
19695d465952Smartinh DPRINTF("data size is %zu, put on overflow page",
19705d465952Smartinh data->size);
19715d465952Smartinh node_size -= data->size - sizeof(pgno_t);
19725d465952Smartinh if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
19735d465952Smartinh return BT_FAIL;
19745d465952Smartinh DPRINTF("allocated overflow page %u", ofp->pgno);
19755d465952Smartinh flags |= F_BIGDATA;
19765d465952Smartinh }
19775d465952Smartinh }
19785d465952Smartinh
19795d465952Smartinh if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
19805d465952Smartinh DPRINTF("not enough room in page %u, got %lu ptrs",
19815d465952Smartinh mp->pgno, NUMKEYS(mp));
19825d465952Smartinh DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
19835d465952Smartinh p->upper - p->lower);
198437ee45a1Smartinh DPRINTF("node size = %zu", node_size);
19855d465952Smartinh return BT_FAIL;
19865d465952Smartinh }
19875d465952Smartinh
19885d465952Smartinh /* Move higher pointers up one slot. */
19895d465952Smartinh for (i = NUMKEYS(mp); i > indx; i--)
19905d465952Smartinh p->ptrs[i] = p->ptrs[i - 1];
19915d465952Smartinh
19925d465952Smartinh /* Adjust free space offsets. */
19935d465952Smartinh ofs = p->upper - node_size;
19945d465952Smartinh assert(ofs >= p->lower + sizeof(indx_t));
19955d465952Smartinh p->ptrs[indx] = ofs;
19965d465952Smartinh p->upper = ofs;
19975d465952Smartinh p->lower += sizeof(indx_t);
19985d465952Smartinh
19995d465952Smartinh /* Write the node data. */
20005d465952Smartinh node = NODEPTR(mp, indx);
20015d465952Smartinh node->ksize = (key == NULL) ? 0 : key->size;
20025d465952Smartinh node->flags = flags;
20035d465952Smartinh if (IS_LEAF(mp))
20045d465952Smartinh node->n_dsize = data->size;
20055d465952Smartinh else
20065d465952Smartinh node->n_pgno = pgno;
20075d465952Smartinh
20085d465952Smartinh if (key)
20095d465952Smartinh bcopy(key->data, NODEKEY(node), key->size);
20105d465952Smartinh
20115d465952Smartinh if (IS_LEAF(mp)) {
20122c6c9175Smartinh assert(key);
20135d465952Smartinh if (ofp == NULL) {
20145d465952Smartinh if (F_ISSET(flags, F_BIGDATA))
20155d465952Smartinh bcopy(data->data, node->data + key->size,
20165d465952Smartinh sizeof(pgno_t));
20175d465952Smartinh else
20185d465952Smartinh bcopy(data->data, node->data + key->size,
20195d465952Smartinh data->size);
20205d465952Smartinh } else {
20215d465952Smartinh bcopy(&ofp->pgno, node->data + key->size,
20225d465952Smartinh sizeof(pgno_t));
202337ee45a1Smartinh if (btree_write_overflow_data(bt, ofp->page,
20245d465952Smartinh data) == BT_FAIL)
20255d465952Smartinh return BT_FAIL;
20265d465952Smartinh }
20275d465952Smartinh }
20285d465952Smartinh
20295d465952Smartinh return BT_SUCCESS;
20305d465952Smartinh }
20315d465952Smartinh
20325d465952Smartinh static void
btree_del_node(struct btree * bt,struct mpage * mp,indx_t indx)20335d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx)
20345d465952Smartinh {
20355d465952Smartinh unsigned int sz;
20365d465952Smartinh indx_t i, j, numkeys, ptr;
20375d465952Smartinh struct node *node;
20385d465952Smartinh char *base;
20395d465952Smartinh
20405d465952Smartinh DPRINTF("delete node %u on %s page %u", indx,
20415d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno);
20425d465952Smartinh assert(indx < NUMKEYS(mp));
20435d465952Smartinh
20445d465952Smartinh node = NODEPTR(mp, indx);
20455d465952Smartinh sz = NODESIZE + node->ksize;
20465d465952Smartinh if (IS_LEAF(mp)) {
20475d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA))
20485d465952Smartinh sz += sizeof(pgno_t);
20495d465952Smartinh else
20505d465952Smartinh sz += NODEDSZ(node);
20515d465952Smartinh }
20525d465952Smartinh
205337ee45a1Smartinh ptr = mp->page->ptrs[indx];
20545d465952Smartinh numkeys = NUMKEYS(mp);
20555d465952Smartinh for (i = j = 0; i < numkeys; i++) {
20565d465952Smartinh if (i != indx) {
205737ee45a1Smartinh mp->page->ptrs[j] = mp->page->ptrs[i];
205837ee45a1Smartinh if (mp->page->ptrs[i] < ptr)
205937ee45a1Smartinh mp->page->ptrs[j] += sz;
20605d465952Smartinh j++;
20615d465952Smartinh }
20625d465952Smartinh }
20635d465952Smartinh
206437ee45a1Smartinh base = (char *)mp->page + mp->page->upper;
206537ee45a1Smartinh bcopy(base, base + sz, ptr - mp->page->upper);
20665d465952Smartinh
206737ee45a1Smartinh mp->page->lower -= sizeof(indx_t);
206837ee45a1Smartinh mp->page->upper += sz;
20695d465952Smartinh }
20705d465952Smartinh
20715d465952Smartinh struct cursor *
btree_txn_cursor_open(struct btree * bt,struct btree_txn * txn)20725d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn)
20735d465952Smartinh {
20745d465952Smartinh struct cursor *cursor;
20755d465952Smartinh
207659916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) {
207759916c99Smartinh errno = EINVAL;
207859916c99Smartinh return NULL;
207959916c99Smartinh }
208059916c99Smartinh
208159916c99Smartinh if (bt == NULL) {
208259916c99Smartinh if (txn == NULL) {
208359916c99Smartinh errno = EINVAL;
208459916c99Smartinh return NULL;
208559916c99Smartinh }
208659916c99Smartinh bt = txn->bt;
208759916c99Smartinh }
208859916c99Smartinh
20895d465952Smartinh if ((cursor = calloc(1, sizeof(*cursor))) != NULL) {
20905d465952Smartinh SLIST_INIT(&cursor->stack);
20915d465952Smartinh cursor->bt = bt;
20925d465952Smartinh cursor->txn = txn;
2093408be415Smartinh btree_ref(bt);
20945d465952Smartinh }
20955d465952Smartinh
20965d465952Smartinh return cursor;
20975d465952Smartinh }
20985d465952Smartinh
20995d465952Smartinh void
btree_cursor_close(struct cursor * cursor)21005d465952Smartinh btree_cursor_close(struct cursor *cursor)
21015d465952Smartinh {
21025d465952Smartinh if (cursor != NULL) {
21035d465952Smartinh while (!CURSOR_EMPTY(cursor))
21045d465952Smartinh cursor_pop_page(cursor);
21055d465952Smartinh
21065d465952Smartinh btree_close(cursor->bt);
21075d465952Smartinh free(cursor);
21085d465952Smartinh }
21095d465952Smartinh }
21105d465952Smartinh
21115d465952Smartinh static int
btree_update_key(struct btree * bt,struct mpage * mp,indx_t indx,struct btval * key)21125d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx,
21135d465952Smartinh struct btval *key)
21145d465952Smartinh {
21155d465952Smartinh indx_t ptr, i, numkeys;
21165d465952Smartinh int delta;
21175d465952Smartinh size_t len;
21185d465952Smartinh struct node *node;
21195d465952Smartinh char *base;
21205d465952Smartinh
21215d465952Smartinh node = NODEPTR(mp, indx);
212237ee45a1Smartinh ptr = mp->page->ptrs[indx];
21235d465952Smartinh DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u",
21245d465952Smartinh indx, ptr,
21255d465952Smartinh (int)node->ksize, (char *)NODEKEY(node),
21265d465952Smartinh (int)key->size, (char *)key->data,
21275d465952Smartinh mp->pgno);
21285d465952Smartinh
21295d465952Smartinh if (key->size != node->ksize) {
21305d465952Smartinh delta = key->size - node->ksize;
21315d465952Smartinh if (delta > 0 && SIZELEFT(mp) < delta) {
21325d465952Smartinh DPRINTF("OUCH! Not enough room, delta = %d", delta);
21335d465952Smartinh return BT_FAIL;
21345d465952Smartinh }
21355d465952Smartinh
21365d465952Smartinh numkeys = NUMKEYS(mp);
21375d465952Smartinh for (i = 0; i < numkeys; i++) {
213837ee45a1Smartinh if (mp->page->ptrs[i] <= ptr)
213937ee45a1Smartinh mp->page->ptrs[i] -= delta;
21405d465952Smartinh }
21415d465952Smartinh
214237ee45a1Smartinh base = (char *)mp->page + mp->page->upper;
214337ee45a1Smartinh len = ptr - mp->page->upper + NODESIZE;
21445d465952Smartinh bcopy(base, base - delta, len);
214537ee45a1Smartinh mp->page->upper -= delta;
21465d465952Smartinh
21475d465952Smartinh node = NODEPTR(mp, indx);
21485d465952Smartinh node->ksize = key->size;
21495d465952Smartinh }
21505d465952Smartinh
21515d465952Smartinh bcopy(key->data, NODEKEY(node), key->size);
21525d465952Smartinh
21535d465952Smartinh return BT_SUCCESS;
21545d465952Smartinh }
21555d465952Smartinh
21565d465952Smartinh static int
btree_adjust_prefix(struct btree * bt,struct mpage * src,int delta)21575d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta)
21585d465952Smartinh {
21595d465952Smartinh indx_t i;
21605d465952Smartinh struct node *node;
21615d465952Smartinh struct btkey tmpkey;
21625d465952Smartinh struct btval key;
21635d465952Smartinh
21645d465952Smartinh DPRINTF("adjusting prefix lengths on page %u with delta %d",
21655d465952Smartinh src->pgno, delta);
21665d465952Smartinh assert(delta != 0);
21675d465952Smartinh
21685d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) {
21695d465952Smartinh node = NODEPTR(src, i);
21705d465952Smartinh tmpkey.len = node->ksize - delta;
21715d465952Smartinh if (delta > 0) {
21725d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY))
21735d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, tmpkey.len);
21745d465952Smartinh else
21755d465952Smartinh bcopy((char *)NODEKEY(node) + delta, tmpkey.str,
21765d465952Smartinh tmpkey.len);
21775d465952Smartinh } else {
21785d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
21795d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, node->ksize);
21805d465952Smartinh bcopy(src->prefix.str, tmpkey.str + node->ksize,
21815d465952Smartinh -delta);
21825d465952Smartinh } else {
21835d465952Smartinh bcopy(src->prefix.str + src->prefix.len + delta,
21845d465952Smartinh tmpkey.str, -delta);
21855d465952Smartinh bcopy(NODEKEY(node), tmpkey.str - delta,
21865d465952Smartinh node->ksize);
21875d465952Smartinh }
21885d465952Smartinh }
21895d465952Smartinh key.size = tmpkey.len;
21905d465952Smartinh key.data = tmpkey.str;
21915d465952Smartinh if (btree_update_key(bt, src, i, &key) != BT_SUCCESS)
21925d465952Smartinh return BT_FAIL;
21935d465952Smartinh }
21945d465952Smartinh
21955d465952Smartinh return BT_SUCCESS;
21965d465952Smartinh }
21975d465952Smartinh
21985d465952Smartinh /* Move a node from src to dst.
21995d465952Smartinh */
22005d465952Smartinh static int
btree_move_node(struct btree * bt,struct mpage * src,indx_t srcindx,struct mpage * dst,indx_t dstindx)22015d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx,
22025d465952Smartinh struct mpage *dst, indx_t dstindx)
22035d465952Smartinh {
22045d465952Smartinh int rc;
22055d465952Smartinh unsigned int pfxlen, mp_pfxlen = 0;
22061c05783eSmartinh struct node *srcnode;
220737ee45a1Smartinh struct mpage *mp = NULL;
22085d465952Smartinh struct btkey tmpkey, srckey;
22095d465952Smartinh struct btval key, data;
22105d465952Smartinh
22115d465952Smartinh assert(src->parent);
22125d465952Smartinh assert(dst->parent);
22135d465952Smartinh
22145d465952Smartinh srcnode = NODEPTR(src, srcindx);
22155d465952Smartinh DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u",
22165d465952Smartinh IS_LEAF(src) ? "leaf" : "branch",
22175d465952Smartinh srcindx,
22185d465952Smartinh (int)srcnode->ksize, (char *)NODEKEY(srcnode),
22195d465952Smartinh src->pgno,
22205d465952Smartinh dstindx, dst->pgno);
22215d465952Smartinh
22229074208fSmartinh find_common_prefix(bt, src);
22239074208fSmartinh
22245d465952Smartinh if (IS_BRANCH(src)) {
22255d465952Smartinh /* Need to check if the page the moved node points to
22265d465952Smartinh * changes prefix.
22275d465952Smartinh */
222837ee45a1Smartinh if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL)
222937ee45a1Smartinh return BT_FAIL;
22305d465952Smartinh mp->parent = src;
22315d465952Smartinh mp->parent_index = srcindx;
22325d465952Smartinh find_common_prefix(bt, mp);
22335d465952Smartinh mp_pfxlen = mp->prefix.len;
22345d465952Smartinh }
22355d465952Smartinh
22365d465952Smartinh /* Mark src and dst as dirty. */
22377f8a466aSmartinh if ((src = mpage_touch(bt, src)) == NULL ||
22387f8a466aSmartinh (dst = mpage_touch(bt, dst)) == NULL)
22397f8a466aSmartinh return BT_FAIL;
22405d465952Smartinh
22415d465952Smartinh find_common_prefix(bt, dst);
22425d465952Smartinh
22435d465952Smartinh /* Check if src node has destination page prefix. Otherwise the
22445d465952Smartinh * destination page must expand its prefix on all its nodes.
22455d465952Smartinh */
22465d465952Smartinh srckey.len = srcnode->ksize;
22475d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srckey.len);
22485d465952Smartinh common_prefix(bt, &srckey, &dst->prefix, &tmpkey);
22495d465952Smartinh if (tmpkey.len != dst->prefix.len) {
22505d465952Smartinh if (btree_adjust_prefix(bt, dst,
22515d465952Smartinh tmpkey.len - dst->prefix.len) != BT_SUCCESS)
22525d465952Smartinh return BT_FAIL;
22535d465952Smartinh bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey));
22545d465952Smartinh }
22555d465952Smartinh
22565d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) {
22575d465952Smartinh struct mpage *low;
22585d465952Smartinh
22595d465952Smartinh /* must find the lowest key below src
22605d465952Smartinh */
22615d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0,
22625d465952Smartinh &low) == BT_SUCCESS);
22635d465952Smartinh expand_prefix(bt, low, 0, &srckey);
22645d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u",
22655d465952Smartinh (int)srckey.len, srckey.str, low->pgno);
22665d465952Smartinh } else {
22675d465952Smartinh srckey.len = srcnode->ksize;
22685d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize);
22695d465952Smartinh }
22705d465952Smartinh find_common_prefix(bt, src);
22715d465952Smartinh
22725d465952Smartinh /* expand the prefix */
22735d465952Smartinh tmpkey.len = sizeof(tmpkey.str);
22745d465952Smartinh concat_prefix(bt, src->prefix.str, src->prefix.len,
22755d465952Smartinh srckey.str, srckey.len, tmpkey.str, &tmpkey.len);
22765d465952Smartinh
22775d465952Smartinh /* Add the node to the destination page. Adjust prefix for
22785d465952Smartinh * destination page.
22795d465952Smartinh */
22805d465952Smartinh key.size = tmpkey.len;
22815d465952Smartinh key.data = tmpkey.str;
22825d465952Smartinh remove_prefix(bt, &key, dst->prefix.len);
22835d465952Smartinh data.size = NODEDSZ(srcnode);
22845d465952Smartinh data.data = NODEDATA(srcnode);
22855d465952Smartinh rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode),
22865d465952Smartinh srcnode->flags);
22875d465952Smartinh if (rc != BT_SUCCESS)
22885d465952Smartinh return rc;
22895d465952Smartinh
22905d465952Smartinh /* Delete the node from the source page.
22915d465952Smartinh */
22925d465952Smartinh btree_del_node(bt, src, srcindx);
22935d465952Smartinh
22945d465952Smartinh /* Update the parent separators.
22955d465952Smartinh */
22965d465952Smartinh if (srcindx == 0 && src->parent_index != 0) {
22975d465952Smartinh expand_prefix(bt, src, 0, &tmpkey);
22985d465952Smartinh key.size = tmpkey.len;
22995d465952Smartinh key.data = tmpkey.str;
23005d465952Smartinh remove_prefix(bt, &key, src->parent->prefix.len);
23015d465952Smartinh
23025d465952Smartinh DPRINTF("update separator for source page %u to [%.*s]",
23035d465952Smartinh src->pgno, (int)key.size, (char *)key.data);
23045d465952Smartinh if (btree_update_key(bt, src->parent, src->parent_index,
23055d465952Smartinh &key) != BT_SUCCESS)
23065d465952Smartinh return BT_FAIL;
23075d465952Smartinh }
23085d465952Smartinh
23095d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) {
23105d465952Smartinh struct btval nullkey;
23115d465952Smartinh nullkey.size = 0;
23125d465952Smartinh assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS);
23135d465952Smartinh }
23145d465952Smartinh
23155d465952Smartinh if (dstindx == 0 && dst->parent_index != 0) {
23165d465952Smartinh expand_prefix(bt, dst, 0, &tmpkey);
23175d465952Smartinh key.size = tmpkey.len;
23185d465952Smartinh key.data = tmpkey.str;
23195d465952Smartinh remove_prefix(bt, &key, dst->parent->prefix.len);
23205d465952Smartinh
23215d465952Smartinh DPRINTF("update separator for destination page %u to [%.*s]",
23225d465952Smartinh dst->pgno, (int)key.size, (char *)key.data);
23235d465952Smartinh if (btree_update_key(bt, dst->parent, dst->parent_index,
23245d465952Smartinh &key) != BT_SUCCESS)
23255d465952Smartinh return BT_FAIL;
23265d465952Smartinh }
23275d465952Smartinh
23285d465952Smartinh if (dstindx == 0 && IS_BRANCH(dst)) {
23295d465952Smartinh struct btval nullkey;
23305d465952Smartinh nullkey.size = 0;
23315d465952Smartinh assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS);
23325d465952Smartinh }
23335d465952Smartinh
23345d465952Smartinh /* We can get a new page prefix here!
23355d465952Smartinh * Must update keys in all nodes of this page!
23365d465952Smartinh */
23375d465952Smartinh pfxlen = src->prefix.len;
23385d465952Smartinh find_common_prefix(bt, src);
23395d465952Smartinh if (src->prefix.len != pfxlen) {
23405d465952Smartinh if (btree_adjust_prefix(bt, src,
23415d465952Smartinh src->prefix.len - pfxlen) != BT_SUCCESS)
23425d465952Smartinh return BT_FAIL;
23435d465952Smartinh }
23445d465952Smartinh
23455d465952Smartinh pfxlen = dst->prefix.len;
23465d465952Smartinh find_common_prefix(bt, dst);
23475d465952Smartinh if (dst->prefix.len != pfxlen) {
23485d465952Smartinh if (btree_adjust_prefix(bt, dst,
23495d465952Smartinh dst->prefix.len - pfxlen) != BT_SUCCESS)
23505d465952Smartinh return BT_FAIL;
23515d465952Smartinh }
23525d465952Smartinh
23535d465952Smartinh if (IS_BRANCH(dst)) {
23542c6c9175Smartinh assert(mp);
23555d465952Smartinh mp->parent = dst;
23565d465952Smartinh mp->parent_index = dstindx;
23575d465952Smartinh find_common_prefix(bt, mp);
23585d465952Smartinh if (mp->prefix.len != mp_pfxlen) {
23595d465952Smartinh DPRINTF("moved branch node has changed prefix");
23607f8a466aSmartinh if ((mp = mpage_touch(bt, mp)) == NULL)
23617f8a466aSmartinh return BT_FAIL;
23625d465952Smartinh if (btree_adjust_prefix(bt, mp,
23635d465952Smartinh mp->prefix.len - mp_pfxlen) != BT_SUCCESS)
23645d465952Smartinh return BT_FAIL;
23655d465952Smartinh }
23665d465952Smartinh }
23675d465952Smartinh
23685d465952Smartinh return BT_SUCCESS;
23695d465952Smartinh }
23705d465952Smartinh
23715d465952Smartinh static int
btree_merge(struct btree * bt,struct mpage * src,struct mpage * dst)23725d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst)
23735d465952Smartinh {
23745d465952Smartinh int rc;
23755d465952Smartinh indx_t i;
2376a33d639aSmartinh unsigned int pfxlen;
23775d465952Smartinh struct node *srcnode;
23785d465952Smartinh struct btkey tmpkey, dstpfx;
23795d465952Smartinh struct btval key, data;
23805d465952Smartinh
23815d465952Smartinh DPRINTF("merging page %u and %u", src->pgno, dst->pgno);
23825d465952Smartinh
23835d465952Smartinh assert(src->parent); /* can't merge root page */
23845d465952Smartinh assert(dst->parent);
23855d465952Smartinh assert(bt->txn != NULL);
23865d465952Smartinh
23875d465952Smartinh /* Mark src and dst as dirty. */
23887f8a466aSmartinh if ((src = mpage_touch(bt, src)) == NULL ||
23897f8a466aSmartinh (dst = mpage_touch(bt, dst)) == NULL)
23907f8a466aSmartinh return BT_FAIL;
23915d465952Smartinh
23925d465952Smartinh find_common_prefix(bt, src);
23935d465952Smartinh find_common_prefix(bt, dst);
23945d465952Smartinh
23955d465952Smartinh /* Check if source nodes has destination page prefix. Otherwise
23965d465952Smartinh * the destination page must expand its prefix on all its nodes.
23975d465952Smartinh */
23985d465952Smartinh common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx);
23995d465952Smartinh if (dstpfx.len != dst->prefix.len) {
24005d465952Smartinh if (btree_adjust_prefix(bt, dst,
24015d465952Smartinh dstpfx.len - dst->prefix.len) != BT_SUCCESS)
24025d465952Smartinh return BT_FAIL;
24035d465952Smartinh bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx));
24045d465952Smartinh }
24055d465952Smartinh
24065d465952Smartinh /* Move all nodes from src to dst.
24075d465952Smartinh */
24085d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) {
24095d465952Smartinh srcnode = NODEPTR(src, i);
24105d465952Smartinh
24115d465952Smartinh /* If branch node 0 (implicit key), find the real key.
24125d465952Smartinh */
24135d465952Smartinh if (i == 0 && IS_BRANCH(src)) {
24145d465952Smartinh struct mpage *low;
24155d465952Smartinh
24165d465952Smartinh /* must find the lowest key below src
24175d465952Smartinh */
24185d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0,
24195d465952Smartinh &low) == BT_SUCCESS);
24205d465952Smartinh expand_prefix(bt, low, 0, &tmpkey);
24215d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u",
24225d465952Smartinh (int)tmpkey.len, tmpkey.str, low->pgno);
24235d465952Smartinh } else {
24245d465952Smartinh expand_prefix(bt, src, i, &tmpkey);
24255d465952Smartinh }
24265d465952Smartinh
24275d465952Smartinh key.size = tmpkey.len;
24285d465952Smartinh key.data = tmpkey.str;
24295d465952Smartinh
24305d465952Smartinh remove_prefix(bt, &key, dst->prefix.len);
24315d465952Smartinh data.size = NODEDSZ(srcnode);
24325d465952Smartinh data.data = NODEDATA(srcnode);
24335d465952Smartinh rc = btree_add_node(bt, dst, NUMKEYS(dst), &key,
24345d465952Smartinh &data, NODEPGNO(srcnode), srcnode->flags);
24355d465952Smartinh if (rc != BT_SUCCESS)
24365d465952Smartinh return rc;
24375d465952Smartinh }
24385d465952Smartinh
24395d465952Smartinh DPRINTF("dst page %u now has %lu keys (%.1f%% filled)",
2440944ad5d4Smartinh dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10);
24415d465952Smartinh
24425d465952Smartinh /* Unlink the src page from parent.
24435d465952Smartinh */
24445d465952Smartinh btree_del_node(bt, src->parent, src->parent_index);
24455d465952Smartinh if (src->parent_index == 0) {
24465d465952Smartinh key.size = 0;
24475d465952Smartinh if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS)
24485d465952Smartinh return BT_FAIL;
24495d465952Smartinh
2450a33d639aSmartinh pfxlen = src->prefix.len;
24515d465952Smartinh find_common_prefix(bt, src);
24525d465952Smartinh assert (src->prefix.len == pfxlen);
24535d465952Smartinh }
24545d465952Smartinh
24555d465952Smartinh if (IS_LEAF(src))
245637ee45a1Smartinh bt->meta.leaf_pages--;
24575d465952Smartinh else
245837ee45a1Smartinh bt->meta.branch_pages--;
24595d465952Smartinh
24605d465952Smartinh return btree_rebalance(bt, src->parent);
24615d465952Smartinh }
24625d465952Smartinh
2463944ad5d4Smartinh #define FILL_THRESHOLD 250
24645d465952Smartinh
24655d465952Smartinh static int
btree_rebalance(struct btree * bt,struct mpage * mp)24665d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp)
24675d465952Smartinh {
246837ee45a1Smartinh struct node *node;
24695d465952Smartinh struct mpage *parent;
247037ee45a1Smartinh struct mpage *root;
24715d465952Smartinh struct mpage *neighbor = NULL;
247237ee45a1Smartinh indx_t si = 0, di = 0;
24735d465952Smartinh
24745d465952Smartinh assert(bt != NULL);
24755d465952Smartinh assert(bt->txn != NULL);
24765d465952Smartinh assert(mp != NULL);
24775d465952Smartinh
24785d465952Smartinh DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)",
24795d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch",
2480944ad5d4Smartinh mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10);
24815d465952Smartinh
248237ee45a1Smartinh if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) {
24835d465952Smartinh DPRINTF("no need to rebalance page %u, above fill threshold",
24845d465952Smartinh mp->pgno);
24855d465952Smartinh return BT_SUCCESS;
24865d465952Smartinh }
24875d465952Smartinh
24885d465952Smartinh parent = mp->parent;
24895d465952Smartinh
24905d465952Smartinh if (parent == NULL) {
24915d465952Smartinh if (NUMKEYS(mp) == 0) {
24925d465952Smartinh DPRINTF("tree is completely empty");
24935d465952Smartinh bt->txn->root = P_INVALID;
249437ee45a1Smartinh bt->meta.depth--;
249537ee45a1Smartinh bt->meta.leaf_pages--;
24965d465952Smartinh } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
24975d465952Smartinh DPRINTF("collapsing root page!");
24985d465952Smartinh bt->txn->root = NODEPGNO(NODEPTR(mp, 0));
249937ee45a1Smartinh if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL)
250037ee45a1Smartinh return BT_FAIL;
250137ee45a1Smartinh root->parent = NULL;
250237ee45a1Smartinh bt->meta.depth--;
250337ee45a1Smartinh bt->meta.branch_pages--;
25045d465952Smartinh } else
25055d465952Smartinh DPRINTF("root page doesn't need rebalancing");
25065d465952Smartinh return BT_SUCCESS;
25075d465952Smartinh }
25085d465952Smartinh
25095d465952Smartinh /* The parent (branch page) must have at least 2 pointers,
25105d465952Smartinh * otherwise the tree is invalid.
25115d465952Smartinh */
25125d465952Smartinh assert(NUMKEYS(parent) > 1);
25135d465952Smartinh
25145d465952Smartinh /* Leaf page fill factor is below the threshold.
25155d465952Smartinh * Try to move keys from left or right neighbor, or
25165d465952Smartinh * merge with a neighbor page.
25175d465952Smartinh */
25185d465952Smartinh
25195d465952Smartinh /* Find neighbors.
25205d465952Smartinh */
25215d465952Smartinh if (mp->parent_index == 0) {
25225d465952Smartinh /* We're the leftmost leaf in our parent.
25235d465952Smartinh */
25245d465952Smartinh DPRINTF("reading right neighbor");
252537ee45a1Smartinh node = NODEPTR(parent, mp->parent_index + 1);
252637ee45a1Smartinh if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25275d465952Smartinh return BT_FAIL;
25285d465952Smartinh neighbor->parent_index = mp->parent_index + 1;
25295d465952Smartinh si = 0;
25305d465952Smartinh di = NUMKEYS(mp);
25315d465952Smartinh } else {
25325d465952Smartinh /* There is at least one neighbor to the left.
25335d465952Smartinh */
25345d465952Smartinh DPRINTF("reading left neighbor");
253537ee45a1Smartinh node = NODEPTR(parent, mp->parent_index - 1);
253637ee45a1Smartinh if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
25375d465952Smartinh return BT_FAIL;
25385d465952Smartinh neighbor->parent_index = mp->parent_index - 1;
25395d465952Smartinh si = NUMKEYS(neighbor) - 1;
25405d465952Smartinh di = 0;
25415d465952Smartinh }
25425d465952Smartinh neighbor->parent = parent;
25435d465952Smartinh
25445d465952Smartinh DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
2545944ad5d4Smartinh neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10);
25465d465952Smartinh
25475d465952Smartinh /* If the neighbor page is above threshold and has at least two
25485d465952Smartinh * keys, move one key from it.
25495d465952Smartinh *
25505d465952Smartinh * Otherwise we should try to merge them, but that might not be
25515d465952Smartinh * possible, even if both are below threshold, as prefix expansion
25525d465952Smartinh * might make keys larger. FIXME: detect this
25535d465952Smartinh */
2554a73962c3Smartinh if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2)
25555d465952Smartinh return btree_move_node(bt, neighbor, si, mp, di);
25565d465952Smartinh else { /* FIXME: if (has_enough_room()) */
25575d465952Smartinh if (mp->parent_index == 0)
25585d465952Smartinh return btree_merge(bt, neighbor, mp);
25595d465952Smartinh else
25605d465952Smartinh return btree_merge(bt, mp, neighbor);
25615d465952Smartinh }
25625d465952Smartinh }
25635d465952Smartinh
25645d465952Smartinh int
btree_txn_del(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data)25655d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn,
25665d465952Smartinh struct btval *key, struct btval *data)
25675d465952Smartinh {
25685d465952Smartinh int rc, exact, close_txn = 0;
25695d465952Smartinh unsigned int ki;
25705d465952Smartinh struct node *leaf;
25715d465952Smartinh struct mpage *mp;
25725d465952Smartinh
25735d465952Smartinh DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);
25745d465952Smartinh
25755d465952Smartinh assert(key != NULL);
25765d465952Smartinh
257759916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) {
257859916c99Smartinh errno = EINVAL;
257959916c99Smartinh return BT_FAIL;
258059916c99Smartinh }
258159916c99Smartinh
25827e04cf1eSmartinh if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
25837e04cf1eSmartinh errno = EINVAL;
25847e04cf1eSmartinh return BT_FAIL;
25857e04cf1eSmartinh }
25867e04cf1eSmartinh
258759916c99Smartinh if (bt == NULL) {
258859916c99Smartinh if (txn == NULL) {
258959916c99Smartinh errno = EINVAL;
259059916c99Smartinh return BT_FAIL;
259159916c99Smartinh }
259259916c99Smartinh bt = txn->bt;
259359916c99Smartinh }
259459916c99Smartinh
25955d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) {
25965d465952Smartinh errno = EINVAL;
25975d465952Smartinh return BT_FAIL;
25985d465952Smartinh }
25995d465952Smartinh
26005d465952Smartinh if (txn == NULL) {
26015d465952Smartinh close_txn = 1;
26025d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL)
26035d465952Smartinh return BT_FAIL;
26045d465952Smartinh }
26055d465952Smartinh
26065d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)
26075d465952Smartinh goto done;
26085d465952Smartinh
26095d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki);
26105d465952Smartinh if (leaf == NULL || !exact) {
26110279e526Smartinh errno = ENOENT;
26120279e526Smartinh rc = BT_FAIL;
26135d465952Smartinh goto done;
26145d465952Smartinh }
26155d465952Smartinh
26165d465952Smartinh if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
26175d465952Smartinh goto done;
26185d465952Smartinh
26195d465952Smartinh btree_del_node(bt, mp, ki);
262037ee45a1Smartinh bt->meta.entries--;
26215d465952Smartinh rc = btree_rebalance(bt, mp);
26225d465952Smartinh if (rc != BT_SUCCESS)
26235d465952Smartinh txn->flags |= BT_TXN_ERROR;
26245d465952Smartinh
26255d465952Smartinh done:
26265d465952Smartinh if (close_txn) {
26275d465952Smartinh if (rc == BT_SUCCESS)
26285d465952Smartinh rc = btree_txn_commit(txn);
26295d465952Smartinh else
26305d465952Smartinh btree_txn_abort(txn);
26315d465952Smartinh }
26325d465952Smartinh mpage_prune(bt);
26335d465952Smartinh return rc;
26345d465952Smartinh }
26355d465952Smartinh
26365d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that
26375d465952Smartinh * still makes it uniquely distinguishable from <min>.
26385d465952Smartinh *
26395d465952Smartinh * <min> is guaranteed to be sorted less than <sep>
26405d465952Smartinh *
26415d465952Smartinh * On return, <sep> is modified to the minimum length.
26425d465952Smartinh */
26435d465952Smartinh static void
bt_reduce_separator(struct btree * bt,struct node * min,struct btval * sep)26445d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
26455d465952Smartinh {
26465d465952Smartinh size_t n = 0;
26475d465952Smartinh char *p1;
26485d465952Smartinh char *p2;
26495d465952Smartinh
26505d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
26515d465952Smartinh
26525d465952Smartinh assert(sep->size > 0);
26535d465952Smartinh
26545d465952Smartinh p1 = (char *)NODEKEY(min) + min->ksize - 1;
26555d465952Smartinh p2 = (char *)sep->data + sep->size - 1;
26565d465952Smartinh
26575d465952Smartinh while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
26585d465952Smartinh assert(p2 > (char *)sep->data);
26595d465952Smartinh p1--;
26605d465952Smartinh p2--;
26615d465952Smartinh n++;
26625d465952Smartinh }
26635d465952Smartinh
26645d465952Smartinh sep->data = p2;
26655d465952Smartinh sep->size = n + 1;
26665d465952Smartinh } else {
26675d465952Smartinh
26685d465952Smartinh assert(min->ksize > 0);
26695d465952Smartinh assert(sep->size > 0);
26705d465952Smartinh
26715d465952Smartinh p1 = (char *)NODEKEY(min);
26725d465952Smartinh p2 = (char *)sep->data;
26735d465952Smartinh
26745d465952Smartinh while (*p1 == *p2) {
26755d465952Smartinh p1++;
26765d465952Smartinh p2++;
26775d465952Smartinh n++;
26785d465952Smartinh if (n == min->ksize || n == sep->size)
26795d465952Smartinh break;
26805d465952Smartinh }
26815d465952Smartinh
26825d465952Smartinh sep->size = n + 1;
26835d465952Smartinh }
26845d465952Smartinh
26855d465952Smartinh DPRINTF("reduced separator to [%.*s] > [%.*s]",
26865d465952Smartinh (int)sep->size, (char *)sep->data,
26875d465952Smartinh (int)min->ksize, (char *)NODEKEY(min));
26885d465952Smartinh }
26895d465952Smartinh
26905d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
26915d465952Smartinh * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
26925d465952Smartinh * *newindxp with the actual values after split, ie if *mpp and *newindxp
26935d465952Smartinh * refer to a node in the new right sibling page.
26945d465952Smartinh */
26955d465952Smartinh static int
btree_split(struct btree * bt,struct mpage ** mpp,unsigned int * newindxp,struct btval * newkey,struct btval * newdata,pgno_t newpgno)26965d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
26975d465952Smartinh struct btval *newkey, struct btval *newdata, pgno_t newpgno)
26985d465952Smartinh {
26995d465952Smartinh uint8_t flags;
27005d465952Smartinh int rc = BT_SUCCESS, ins_new = 0;
27015d465952Smartinh indx_t newindx;
27025d465952Smartinh pgno_t pgno = 0;
27035d465952Smartinh size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
27045d465952Smartinh unsigned int i, j, split_indx;
27055d465952Smartinh struct node *node;
27065d465952Smartinh struct mpage *pright, *p, *mp;
27075d465952Smartinh struct btval sepkey, rkey, rdata;
27085d465952Smartinh struct btkey tmpkey;
270937ee45a1Smartinh struct page *copy;
27105d465952Smartinh
27115d465952Smartinh assert(bt != NULL);
27125d465952Smartinh assert(bt->txn != NULL);
27135d465952Smartinh
27145d465952Smartinh mp = *mpp;
27155d465952Smartinh newindx = *newindxp;
27165d465952Smartinh
271748318e0bSderaadt DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %d",
27185d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
27195d465952Smartinh (int)newkey->size, (char *)newkey->data, *newindxp);
27205d465952Smartinh DPRINTF("page %u has prefix [%.*s]", mp->pgno,
27215d465952Smartinh (int)mp->prefix.len, (char *)mp->prefix.str);
27225d465952Smartinh orig_pfx_len = mp->prefix.len;
27235d465952Smartinh
27245d465952Smartinh if (mp->parent == NULL) {
27255d465952Smartinh if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
27265d465952Smartinh return BT_FAIL;
27275d465952Smartinh mp->parent_index = 0;
27285d465952Smartinh bt->txn->root = mp->parent->pgno;
27295d465952Smartinh DPRINTF("root split! new root = %u", mp->parent->pgno);
273037ee45a1Smartinh bt->meta.depth++;
27315d465952Smartinh
27325d465952Smartinh /* Add left (implicit) pointer. */
27335d465952Smartinh if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
27345d465952Smartinh mp->pgno, 0) != BT_SUCCESS)
27355d465952Smartinh return BT_FAIL;
27365d465952Smartinh } else {
27375d465952Smartinh DPRINTF("parent branch page is %u", mp->parent->pgno);
27385d465952Smartinh }
27395d465952Smartinh
27405d465952Smartinh /* Create a right sibling. */
274137ee45a1Smartinh if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)
27425d465952Smartinh return BT_FAIL;
27435d465952Smartinh pright->parent = mp->parent;
27445d465952Smartinh pright->parent_index = mp->parent_index + 1;
27455d465952Smartinh DPRINTF("new right sibling: page %u", pright->pgno);
27465d465952Smartinh
27475d465952Smartinh /* Move half of the keys to the right sibling. */
274837ee45a1Smartinh if ((copy = malloc(bt->head.psize)) == NULL)
274937ee45a1Smartinh return BT_FAIL;
275037ee45a1Smartinh bcopy(mp->page, copy, bt->head.psize);
27515d465952Smartinh assert(mp->ref == 0); /* XXX */
275237f4b933Smmcc memset(&mp->page->ptrs, 0, bt->head.psize - PAGEHDRSZ);
275337ee45a1Smartinh mp->page->lower = PAGEHDRSZ;
275437ee45a1Smartinh mp->page->upper = bt->head.psize;
27555d465952Smartinh
275637ee45a1Smartinh split_indx = NUMKEYSP(copy) / 2 + 1;
27575d465952Smartinh
27585d465952Smartinh /* First find the separating key between the split pages.
27595d465952Smartinh */
276037f4b933Smmcc memset(&sepkey, 0, sizeof(sepkey));
27615d465952Smartinh if (newindx == split_indx) {
27625d465952Smartinh sepkey.size = newkey->size;
27635d465952Smartinh sepkey.data = newkey->data;
27645d465952Smartinh remove_prefix(bt, &sepkey, mp->prefix.len);
27655d465952Smartinh } else {
276637ee45a1Smartinh node = NODEPTRP(copy, split_indx);
27675d465952Smartinh sepkey.size = node->ksize;
27685d465952Smartinh sepkey.data = NODEKEY(node);
27695d465952Smartinh }
27705d465952Smartinh
27715d465952Smartinh if (IS_LEAF(mp) && bt->cmp == NULL) {
27725d465952Smartinh /* Find the smallest separator. */
27735d465952Smartinh /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
277437ee45a1Smartinh node = NODEPTRP(copy, split_indx - 1);
27755d465952Smartinh bt_reduce_separator(bt, node, &sepkey);
27765d465952Smartinh }
27775d465952Smartinh
27785d465952Smartinh /* Fix separator wrt parent prefix. */
27795d465952Smartinh if (bt->cmp == NULL) {
27805d465952Smartinh tmpkey.len = sizeof(tmpkey.str);
27815d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len,
27825d465952Smartinh sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);
27835d465952Smartinh sepkey.data = tmpkey.str;
27845d465952Smartinh sepkey.size = tmpkey.len;
27855d465952Smartinh }
27865d465952Smartinh
27875d465952Smartinh DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);
27885d465952Smartinh
27895d465952Smartinh /* Copy separator key to the parent.
27905d465952Smartinh */
279137ee45a1Smartinh if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) {
27925d465952Smartinh rc = btree_split(bt, &pright->parent, &pright->parent_index,
27935d465952Smartinh &sepkey, NULL, pright->pgno);
27945d465952Smartinh
27955d465952Smartinh /* Right page might now have changed parent.
27965d465952Smartinh * Check if left page also changed parent.
27975d465952Smartinh */
27985d465952Smartinh if (pright->parent != mp->parent &&
27995d465952Smartinh mp->parent_index >= NUMKEYS(mp->parent)) {
28005d465952Smartinh mp->parent = pright->parent;
28015d465952Smartinh mp->parent_index = pright->parent_index - 1;
28025d465952Smartinh }
28035d465952Smartinh } else {
28045d465952Smartinh remove_prefix(bt, &sepkey, pright->parent->prefix.len);
28055d465952Smartinh rc = btree_add_node(bt, pright->parent, pright->parent_index,
28065d465952Smartinh &sepkey, NULL, pright->pgno, 0);
28075d465952Smartinh }
280837ee45a1Smartinh if (rc != BT_SUCCESS) {
280937ee45a1Smartinh free(copy);
28105d465952Smartinh return BT_FAIL;
281137ee45a1Smartinh }
28125d465952Smartinh
28135d465952Smartinh /* Update prefix for right and left page, if the parent was split.
28145d465952Smartinh */
28155d465952Smartinh find_common_prefix(bt, pright);
28165d465952Smartinh assert(orig_pfx_len <= pright->prefix.len);
28175d465952Smartinh right_pfx_diff = pright->prefix.len - orig_pfx_len;
28185d465952Smartinh
28195d465952Smartinh find_common_prefix(bt, mp);
28205d465952Smartinh assert(orig_pfx_len <= mp->prefix.len);
28215d465952Smartinh left_pfx_diff = mp->prefix.len - orig_pfx_len;
28225d465952Smartinh
282337ee45a1Smartinh for (i = j = 0; i <= NUMKEYSP(copy); j++) {
28245d465952Smartinh if (i < split_indx) {
28255d465952Smartinh /* Re-insert in left sibling. */
28265d465952Smartinh p = mp;
28275d465952Smartinh pfx_diff = left_pfx_diff;
28285d465952Smartinh } else {
28295d465952Smartinh /* Insert in right sibling. */
28305d465952Smartinh if (i == split_indx)
28315d465952Smartinh /* Reset insert index for right sibling. */
28325d465952Smartinh j = (i == newindx && ins_new);
28335d465952Smartinh p = pright;
28345d465952Smartinh pfx_diff = right_pfx_diff;
28355d465952Smartinh }
28365d465952Smartinh
28375d465952Smartinh if (i == newindx && !ins_new) {
28385d465952Smartinh /* Insert the original entry that caused the split. */
28395d465952Smartinh rkey.data = newkey->data;
28405d465952Smartinh rkey.size = newkey->size;
28415d465952Smartinh if (IS_LEAF(mp)) {
28425d465952Smartinh rdata.data = newdata->data;
28435d465952Smartinh rdata.size = newdata->size;
28445d465952Smartinh } else
28455d465952Smartinh pgno = newpgno;
28465d465952Smartinh flags = 0;
28475d465952Smartinh pfx_diff = p->prefix.len;
28485d465952Smartinh
28495d465952Smartinh ins_new = 1;
28505d465952Smartinh
28515d465952Smartinh /* Update page and index for the new key. */
28525d465952Smartinh *newindxp = j;
28535d465952Smartinh *mpp = p;
285437ee45a1Smartinh } else if (i == NUMKEYSP(copy)) {
28555d465952Smartinh break;
28565d465952Smartinh } else {
285737ee45a1Smartinh node = NODEPTRP(copy, i);
28585d465952Smartinh rkey.data = NODEKEY(node);
28595d465952Smartinh rkey.size = node->ksize;
28605d465952Smartinh if (IS_LEAF(mp)) {
28615d465952Smartinh rdata.data = NODEDATA(node);
28625d465952Smartinh rdata.size = node->n_dsize;
28635d465952Smartinh } else
28645d465952Smartinh pgno = node->n_pgno;
28655d465952Smartinh flags = node->flags;
28665d465952Smartinh
28675d465952Smartinh i++;
28685d465952Smartinh }
28695d465952Smartinh
28705d465952Smartinh if (!IS_LEAF(mp) && j == 0) {
28715d465952Smartinh /* First branch index doesn't need key data. */
28725d465952Smartinh rkey.size = 0;
28735d465952Smartinh } else
28745d465952Smartinh remove_prefix(bt, &rkey, pfx_diff);
28755d465952Smartinh
28765d465952Smartinh rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
28775d465952Smartinh }
28785d465952Smartinh
287937ee45a1Smartinh free(copy);
28805d465952Smartinh return rc;
28815d465952Smartinh }
28825d465952Smartinh
28835d465952Smartinh int
btree_txn_put(struct btree * bt,struct btree_txn * txn,struct btval * key,struct btval * data,unsigned int flags)28845d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn,
28855d465952Smartinh struct btval *key, struct btval *data, unsigned int flags)
28865d465952Smartinh {
28875d465952Smartinh int rc = BT_SUCCESS, exact, close_txn = 0;
28885d465952Smartinh unsigned int ki;
28895d465952Smartinh struct node *leaf;
28905d465952Smartinh struct mpage *mp;
28915d465952Smartinh struct btval xkey;
28925d465952Smartinh
28935d465952Smartinh assert(key != NULL);
28945d465952Smartinh assert(data != NULL);
28955d465952Smartinh
289659916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) {
289759916c99Smartinh errno = EINVAL;
289859916c99Smartinh return BT_FAIL;
289959916c99Smartinh }
290059916c99Smartinh
29017e04cf1eSmartinh if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
29027e04cf1eSmartinh errno = EINVAL;
29037e04cf1eSmartinh return BT_FAIL;
29047e04cf1eSmartinh }
29057e04cf1eSmartinh
290659916c99Smartinh if (bt == NULL) {
290759916c99Smartinh if (txn == NULL) {
290859916c99Smartinh errno = EINVAL;
290959916c99Smartinh return BT_FAIL;
291059916c99Smartinh }
291159916c99Smartinh bt = txn->bt;
291259916c99Smartinh }
291359916c99Smartinh
29145d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) {
29155d465952Smartinh errno = EINVAL;
29165d465952Smartinh return BT_FAIL;
29175d465952Smartinh }
29185d465952Smartinh
29195d465952Smartinh DPRINTF("==> put key %.*s, size %zu, data size %zu",
29205d465952Smartinh (int)key->size, (char *)key->data, key->size, data->size);
29215d465952Smartinh
29225d465952Smartinh if (txn == NULL) {
29235d465952Smartinh close_txn = 1;
29245d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL)
29255d465952Smartinh return BT_FAIL;
29265d465952Smartinh }
29275d465952Smartinh
29285d465952Smartinh rc = btree_search_page(bt, txn, key, NULL, 1, &mp);
29295d465952Smartinh if (rc == BT_SUCCESS) {
29305d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki);
29315d465952Smartinh if (leaf && exact) {
29325d465952Smartinh if (F_ISSET(flags, BT_NOOVERWRITE)) {
29335d465952Smartinh DPRINTF("duplicate key %.*s",
29345d465952Smartinh (int)key->size, (char *)key->data);
29350279e526Smartinh errno = EEXIST;
29360279e526Smartinh rc = BT_FAIL;
29375d465952Smartinh goto done;
29385d465952Smartinh }
29395d465952Smartinh btree_del_node(bt, mp, ki);
29405d465952Smartinh }
29415d465952Smartinh if (leaf == NULL) { /* append if not found */
29425d465952Smartinh ki = NUMKEYS(mp);
294348318e0bSderaadt DPRINTF("appending key at index %d", ki);
29445d465952Smartinh }
29450279e526Smartinh } else if (errno == ENOENT) {
29465d465952Smartinh /* new file, just write a root leaf page */
29475d465952Smartinh DPRINTF("allocating new root leaf page");
29485d465952Smartinh if ((mp = btree_new_page(bt, P_LEAF)) == NULL) {
29495d465952Smartinh rc = BT_FAIL;
29505d465952Smartinh goto done;
29515d465952Smartinh }
29525d465952Smartinh txn->root = mp->pgno;
295337ee45a1Smartinh bt->meta.depth++;
29545d465952Smartinh ki = 0;
29555d465952Smartinh }
29565d465952Smartinh else
29575d465952Smartinh goto done;
29585d465952Smartinh
29595d465952Smartinh assert(IS_LEAF(mp));
296048318e0bSderaadt DPRINTF("there are %lu keys, should insert new key at index %u",
29615d465952Smartinh NUMKEYS(mp), ki);
29625d465952Smartinh
29635d465952Smartinh /* Copy the key pointer as it is modified by the prefix code. The
29645d465952Smartinh * caller might have malloc'ed the data.
29655d465952Smartinh */
29665d465952Smartinh xkey.data = key->data;
29675d465952Smartinh xkey.size = key->size;
29685d465952Smartinh
296937ee45a1Smartinh if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) {
29705d465952Smartinh rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID);
29715d465952Smartinh } else {
29725d465952Smartinh /* There is room already in this leaf page. */
29735d465952Smartinh remove_prefix(bt, &xkey, mp->prefix.len);
29745d465952Smartinh rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0);
29755d465952Smartinh }
29765d465952Smartinh
29775d465952Smartinh if (rc != BT_SUCCESS)
29785d465952Smartinh txn->flags |= BT_TXN_ERROR;
29795d465952Smartinh else
298037ee45a1Smartinh bt->meta.entries++;
29815d465952Smartinh
29825d465952Smartinh done:
29835d465952Smartinh if (close_txn) {
29845d465952Smartinh if (rc == BT_SUCCESS)
29855d465952Smartinh rc = btree_txn_commit(txn);
29865d465952Smartinh else
29875d465952Smartinh btree_txn_abort(txn);
29885d465952Smartinh }
29895d465952Smartinh mpage_prune(bt);
29905d465952Smartinh return rc;
29915d465952Smartinh }
29925d465952Smartinh
29935d465952Smartinh static pgno_t
btree_compact_tree(struct btree * bt,pgno_t pgno,struct btree * btc)299437ee45a1Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc)
29955d465952Smartinh {
299637ee45a1Smartinh ssize_t rc;
29975d465952Smartinh indx_t i;
2998023c5341Smartinh pgno_t *pnext, next;
29995d465952Smartinh struct node *node;
30005d465952Smartinh struct page *p;
30015d465952Smartinh struct mpage *mp;
30025d465952Smartinh
300337ee45a1Smartinh /* Get the page and make a copy of it.
300437ee45a1Smartinh */
300537ee45a1Smartinh if ((mp = btree_get_mpage(bt, pgno)) == NULL)
30065d465952Smartinh return P_INVALID;
300737ee45a1Smartinh if ((p = malloc(bt->head.psize)) == NULL)
300837ee45a1Smartinh return P_INVALID;
300937ee45a1Smartinh bcopy(mp->page, p, bt->head.psize);
30105d465952Smartinh
301137ee45a1Smartinh /* Go through all nodes in the (copied) page and update the
301237ee45a1Smartinh * page pointers.
301337ee45a1Smartinh */
30145d465952Smartinh if (F_ISSET(p->flags, P_BRANCH)) {
30155d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) {
30165d465952Smartinh node = NODEPTRP(p, i);
301737ee45a1Smartinh node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc);
301837ee45a1Smartinh if (node->n_pgno == P_INVALID) {
301937ee45a1Smartinh free(p);
30205d465952Smartinh return P_INVALID;
30215d465952Smartinh }
302237ee45a1Smartinh }
30235d465952Smartinh } else if (F_ISSET(p->flags, P_LEAF)) {
30245d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) {
30255d465952Smartinh node = NODEPTRP(p, i);
30265d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA)) {
3027023c5341Smartinh bcopy(NODEDATA(node), &next, sizeof(next));
3028023c5341Smartinh next = btree_compact_tree(bt, next, btc);
3029023c5341Smartinh if (next == P_INVALID) {
303037ee45a1Smartinh free(p);
30315d465952Smartinh return P_INVALID;
30325d465952Smartinh }
3033023c5341Smartinh bcopy(&next, NODEDATA(node), sizeof(next));
30345d465952Smartinh }
303537ee45a1Smartinh }
30365d465952Smartinh } else if (F_ISSET(p->flags, P_OVERFLOW)) {
30375d465952Smartinh pnext = &p->p_next_pgno;
30385d465952Smartinh if (*pnext > 0) {
303937ee45a1Smartinh *pnext = btree_compact_tree(bt, *pnext, btc);
304037ee45a1Smartinh if (*pnext == P_INVALID) {
304137ee45a1Smartinh free(p);
30425d465952Smartinh return P_INVALID;
30435d465952Smartinh }
304437ee45a1Smartinh }
30455d465952Smartinh } else
30465d465952Smartinh assert(0);
30475d465952Smartinh
304837ee45a1Smartinh pgno = p->pgno = btc->txn->next_pgno++;
304937ee45a1Smartinh rc = write(btc->fd, p, bt->head.psize);
305037ee45a1Smartinh free(p);
305136e226d9Smartinh if (rc != (ssize_t)bt->head.psize)
30525d465952Smartinh return P_INVALID;
30533357b322Smartinh mpage_prune(bt);
305437ee45a1Smartinh return pgno;
30555d465952Smartinh }
30565d465952Smartinh
30575d465952Smartinh int
btree_compact(struct btree * bt)30585d465952Smartinh btree_compact(struct btree *bt)
30595d465952Smartinh {
30605d465952Smartinh char *compact_path = NULL;
306137ee45a1Smartinh struct btree *btc;
306237ee45a1Smartinh struct btree_txn *txn, *txnc = NULL;
306337ee45a1Smartinh int fd;
306437ee45a1Smartinh pgno_t root;
30655d465952Smartinh
30665d465952Smartinh assert(bt != NULL);
30675d465952Smartinh
30680279e526Smartinh DPRINTF("compacting btree %p with path %s", bt, bt->path);
30690279e526Smartinh
30700279e526Smartinh if (bt->path == NULL) {
30710279e526Smartinh errno = EINVAL;
30720279e526Smartinh return BT_FAIL;
30730279e526Smartinh }
30740279e526Smartinh
30750279e526Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL)
30765d465952Smartinh return BT_FAIL;
30775d465952Smartinh
3078aa48e8d1Smillert if (asprintf(&compact_path, "%s.compact.XXXXXX", bt->path) == -1) {
3079aa48e8d1Smillert btree_txn_abort(txn);
3080aa48e8d1Smillert return BT_FAIL;
3081aa48e8d1Smillert }
30825d465952Smartinh fd = mkstemp(compact_path);
30835d465952Smartinh if (fd == -1) {
30845d465952Smartinh free(compact_path);
30850279e526Smartinh btree_txn_abort(txn);
30865d465952Smartinh return BT_FAIL;
30875d465952Smartinh }
30885d465952Smartinh
308937ee45a1Smartinh if ((btc = btree_open_fd(fd, 0)) == NULL)
309037ee45a1Smartinh goto failed;
309137f85b55Smartinh bcopy(&bt->meta, &btc->meta, sizeof(bt->meta));
30925bca2891Smartinh btc->meta.revisions = 0;
30935d465952Smartinh
309437ee45a1Smartinh if ((txnc = btree_txn_begin(btc, 0)) == NULL)
309537ee45a1Smartinh goto failed;
309637ee45a1Smartinh
309737ee45a1Smartinh if (bt->meta.root != P_INVALID) {
309837ee45a1Smartinh root = btree_compact_tree(bt, bt->meta.root, btc);
30995d465952Smartinh if (root == P_INVALID)
31005d465952Smartinh goto failed;
310137ee45a1Smartinh if (btree_write_meta(btc, root, 0) != BT_SUCCESS)
31025d465952Smartinh goto failed;
310337ee45a1Smartinh }
31045d465952Smartinh
31055d465952Smartinh fsync(fd);
31065d465952Smartinh
31075d465952Smartinh DPRINTF("renaming %s to %s", compact_path, bt->path);
31085d465952Smartinh if (rename(compact_path, bt->path) != 0)
31095d465952Smartinh goto failed;
31105d465952Smartinh
3111e14f8e24Smartinh /* Write a "tombstone" meta page so other processes can pick up
3112e14f8e24Smartinh * the change and re-open the file.
3113e14f8e24Smartinh */
3114e14f8e24Smartinh if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS)
3115e14f8e24Smartinh goto failed;
31165d465952Smartinh
3117e14f8e24Smartinh btree_txn_abort(txn);
311837ee45a1Smartinh btree_txn_abort(txnc);
31195d465952Smartinh free(compact_path);
312037ee45a1Smartinh btree_close(btc);
31213357b322Smartinh mpage_prune(bt);
312237ee45a1Smartinh return 0;
31235d465952Smartinh
31245d465952Smartinh failed:
31255d465952Smartinh btree_txn_abort(txn);
312637ee45a1Smartinh btree_txn_abort(txnc);
31275d465952Smartinh unlink(compact_path);
31285d465952Smartinh free(compact_path);
312937ee45a1Smartinh btree_close(btc);
31303357b322Smartinh mpage_prune(bt);
31315d465952Smartinh return BT_FAIL;
31325d465952Smartinh }
31335d465952Smartinh
31345d465952Smartinh /* Reverts the last change. Truncates the file at the last root page.
31355d465952Smartinh */
31365d465952Smartinh int
btree_revert(struct btree * bt)31375d465952Smartinh btree_revert(struct btree *bt)
31385d465952Smartinh {
313937ee45a1Smartinh if (btree_read_meta(bt, NULL) != 0)
314037ee45a1Smartinh return -1;
31415d465952Smartinh
314237ee45a1Smartinh DPRINTF("truncating file at page %u", bt->meta.root);
314337ee45a1Smartinh return ftruncate(bt->fd, bt->head.psize * bt->meta.root);
31445d465952Smartinh }
31455d465952Smartinh
31465d465952Smartinh void
btree_set_cache_size(struct btree * bt,unsigned int cache_size)31475d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size)
31485d465952Smartinh {
31495d465952Smartinh bt->stat.max_cache = cache_size;
31505d465952Smartinh }
31515d465952Smartinh
31525d465952Smartinh unsigned int
btree_get_flags(struct btree * bt)31535d465952Smartinh btree_get_flags(struct btree *bt)
31545d465952Smartinh {
31555d465952Smartinh return (bt->flags & ~BT_FIXPADDING);
31565d465952Smartinh }
31575d465952Smartinh
31585d465952Smartinh const char *
btree_get_path(struct btree * bt)31595d465952Smartinh btree_get_path(struct btree *bt)
31605d465952Smartinh {
31615d465952Smartinh return bt->path;
31625d465952Smartinh }
31635d465952Smartinh
31645d465952Smartinh const struct btree_stat *
btree_stat(struct btree * bt)31655d465952Smartinh btree_stat(struct btree *bt)
31665d465952Smartinh {
31674074e1b0Smartinh if (bt == NULL)
31684074e1b0Smartinh return NULL;
31694074e1b0Smartinh
317037ee45a1Smartinh bt->stat.branch_pages = bt->meta.branch_pages;
317137ee45a1Smartinh bt->stat.leaf_pages = bt->meta.leaf_pages;
317237ee45a1Smartinh bt->stat.overflow_pages = bt->meta.overflow_pages;
317337ee45a1Smartinh bt->stat.revisions = bt->meta.revisions;
317437ee45a1Smartinh bt->stat.depth = bt->meta.depth;
317537ee45a1Smartinh bt->stat.entries = bt->meta.entries;
317637ee45a1Smartinh bt->stat.psize = bt->head.psize;
317737ee45a1Smartinh bt->stat.created_at = bt->meta.created_at;
31785d465952Smartinh
31795d465952Smartinh return &bt->stat;
31805d465952Smartinh }
31815d465952Smartinh
3182