1*36e226d9Smartinh /* $OpenBSD: btree.c,v 1.15 2010/06/29 04:27:15 martinh Exp $ */ 25d465952Smartinh 35d465952Smartinh /* 45d465952Smartinh * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 55d465952Smartinh * 65d465952Smartinh * Permission to use, copy, modify, and distribute this software for any 75d465952Smartinh * purpose with or without fee is hereby granted, provided that the above 85d465952Smartinh * copyright notice and this permission notice appear in all copies. 95d465952Smartinh * 105d465952Smartinh * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 115d465952Smartinh * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 125d465952Smartinh * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 135d465952Smartinh * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 145d465952Smartinh * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 155d465952Smartinh * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 165d465952Smartinh * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 175d465952Smartinh */ 185d465952Smartinh 195d465952Smartinh #include <sys/types.h> 205d465952Smartinh #include <sys/tree.h> 2137ee45a1Smartinh #include <sys/stat.h> 225d465952Smartinh #include <sys/queue.h> 235d465952Smartinh #include <sys/param.h> 245d465952Smartinh #include <sys/uio.h> 255d465952Smartinh 265d465952Smartinh #include <assert.h> 275d465952Smartinh #include <err.h> 285d465952Smartinh #include <errno.h> 295d465952Smartinh #include <fcntl.h> 305d465952Smartinh #include <stddef.h> 315d465952Smartinh #include <stdint.h> 325d465952Smartinh #include <stdio.h> 335d465952Smartinh #include <stdlib.h> 345d465952Smartinh #include <string.h> 355d465952Smartinh #include <time.h> 365d465952Smartinh #include <unistd.h> 375d465952Smartinh 385d465952Smartinh #include "btree.h" 395d465952Smartinh 405d465952Smartinh /* #define DEBUG */ 415d465952Smartinh 425d465952Smartinh #ifdef DEBUG 435d465952Smartinh # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ 445d465952Smartinh fprintf(stderr, __VA_ARGS__); \ 455d465952Smartinh fprintf(stderr, "\n"); } while(0) 465d465952Smartinh #else 475d465952Smartinh # define DPRINTF(...) 485d465952Smartinh #endif 495d465952Smartinh 505d465952Smartinh #define PAGESIZE 4096 515d465952Smartinh #define BT_MINKEYS 4 525d465952Smartinh #define BT_MAGIC 0xB3DBB3DB 5337ee45a1Smartinh #define BT_VERSION 4 545d465952Smartinh #define MAXKEYSIZE 255 555d465952Smartinh 565d465952Smartinh #define P_INVALID 0xFFFFFFFF 575d465952Smartinh 585d465952Smartinh #define F_ISSET(w, f) (((w) & (f)) == (f)) 595d465952Smartinh 60176c8cbcSmartinh typedef uint32_t pgno_t; 61176c8cbcSmartinh typedef uint16_t indx_t; 62176c8cbcSmartinh 635d465952Smartinh /* There are four page types: meta, index, leaf and overflow. 645d465952Smartinh * They all share the same page header. 655d465952Smartinh */ 665d465952Smartinh struct page { /* represents an on-disk page */ 6737ee45a1Smartinh pgno_t pgno; /* page number */ 685d465952Smartinh #define P_BRANCH 0x01 /* branch page */ 695d465952Smartinh #define P_LEAF 0x02 /* leaf page */ 705d465952Smartinh #define P_OVERFLOW 0x04 /* overflow page */ 715d465952Smartinh #define P_META 0x08 /* meta page */ 7237ee45a1Smartinh #define P_HEAD 0x10 /* header page */ 735d465952Smartinh uint32_t flags; 745d465952Smartinh #define lower b.fb.fb_lower 755d465952Smartinh #define upper b.fb.fb_upper 765d465952Smartinh #define p_next_pgno b.pb_next_pgno 775d465952Smartinh union page_bounds { 785d465952Smartinh struct { 795d465952Smartinh indx_t fb_lower; /* lower bound of free space */ 805d465952Smartinh indx_t fb_upper; /* upper bound of free space */ 815d465952Smartinh } fb; 825d465952Smartinh pgno_t pb_next_pgno; /* overflow page linked list */ 835d465952Smartinh } b; 8437ee45a1Smartinh indx_t ptrs[1]; /* dynamic size */ 85bf245c3bSmartinh } __packed; 865d465952Smartinh 8737ee45a1Smartinh #define PAGEHDRSZ offsetof(struct page, ptrs) 885d465952Smartinh 8937ee45a1Smartinh #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1) 9037ee45a1Smartinh #define NUMKEYS(mp) (((mp)->page->lower - PAGEHDRSZ) >> 1) 9137ee45a1Smartinh #define SIZELEFT(mp) (indx_t)((mp)->page->upper - (mp)->page->lower) 9237ee45a1Smartinh #define PAGEFILL(bt, mp) ((float)((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \ 9337ee45a1Smartinh ((bt)->head.psize - PAGEHDRSZ)) 9437ee45a1Smartinh #define IS_LEAF(mp) F_ISSET((mp)->page->flags, P_LEAF) 9537ee45a1Smartinh #define IS_BRANCH(mp) F_ISSET((mp)->page->flags, P_BRANCH) 9637ee45a1Smartinh #define IS_OVERFLOW(mp) F_ISSET((mp)->page->flags, P_OVERFLOW) 9737ee45a1Smartinh 9837ee45a1Smartinh struct bt_head { /* header page content */ 9937ee45a1Smartinh uint32_t magic; 1005d465952Smartinh uint32_t version; 1015d465952Smartinh uint32_t flags; 1025d465952Smartinh uint32_t psize; /* page size */ 10337ee45a1Smartinh } __packed; 10437ee45a1Smartinh 10537ee45a1Smartinh struct bt_meta { /* meta (footer) page content */ 10637ee45a1Smartinh #define BT_TOMBSTONE 0x01 /* file is replaced */ 10737ee45a1Smartinh uint32_t flags; 1085d465952Smartinh pgno_t root; /* page number of root page */ 1095d465952Smartinh pgno_t prev_meta; /* previous meta page number */ 1105d465952Smartinh time_t created_at; 1115d465952Smartinh uint32_t branch_pages; 1125d465952Smartinh uint32_t leaf_pages; 1135d465952Smartinh uint32_t overflow_pages; 1145d465952Smartinh uint32_t revisions; 1155d465952Smartinh uint32_t depth; 1165d465952Smartinh uint64_t entries; 1175d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH]; 118bf245c3bSmartinh } __packed; 1195d465952Smartinh 1205d465952Smartinh struct btkey { 1215d465952Smartinh size_t len; 1225d465952Smartinh char str[MAXKEYSIZE]; 1235d465952Smartinh }; 1245d465952Smartinh 1255d465952Smartinh struct mpage { /* an in-memory cached page */ 1265d465952Smartinh RB_ENTRY(mpage) entry; /* page cache entry */ 1275d465952Smartinh SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */ 1285d465952Smartinh TAILQ_ENTRY(mpage) lru_next; /* LRU queue */ 1295d465952Smartinh struct mpage *parent; /* NULL if root */ 1305d465952Smartinh unsigned int parent_index; /* keep track of node index */ 1315d465952Smartinh struct btkey prefix; 13237ee45a1Smartinh struct page *page; 1335d465952Smartinh pgno_t pgno; /* copy of page->pgno */ 1345d465952Smartinh short ref; /* increased by cursors */ 1355d465952Smartinh short dirty; /* 1 if on dirty queue */ 1365d465952Smartinh }; 1375d465952Smartinh RB_HEAD(page_cache, mpage); 1385d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage); 1395d465952Smartinh TAILQ_HEAD(lru_queue, mpage); 1405d465952Smartinh 1415d465952Smartinh static int mpage_cmp(struct mpage *a, struct mpage *b); 1425d465952Smartinh static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno); 1435d465952Smartinh static void mpage_add(struct btree *bt, struct mpage *mp); 14437ee45a1Smartinh static void mpage_free(struct mpage *mp); 1455d465952Smartinh static void mpage_del(struct btree *bt, struct mpage *mp); 146e14f8e24Smartinh static void mpage_flush(struct btree *bt); 14737ee45a1Smartinh static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp); 1485d465952Smartinh static void mpage_prune(struct btree *bt); 1495d465952Smartinh static void mpage_dirty(struct btree *bt, struct mpage *mp); 1505d465952Smartinh static void mpage_touch(struct btree *bt, struct mpage *mp); 1515d465952Smartinh 1525d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp); 1535d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp); 1545d465952Smartinh 1555d465952Smartinh struct ppage { /* ordered list of pages */ 1565d465952Smartinh SLIST_ENTRY(ppage) entry; 1575d465952Smartinh struct mpage *mpage; 1585d465952Smartinh unsigned int ki; /* cursor index on page */ 1595d465952Smartinh }; 1605d465952Smartinh SLIST_HEAD(page_stack, ppage); 1615d465952Smartinh 1625d465952Smartinh #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack) 1635d465952Smartinh #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack) 1645d465952Smartinh #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry) 1655d465952Smartinh #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry) 1665d465952Smartinh 1675d465952Smartinh struct cursor { 1685d465952Smartinh struct btree *bt; 1695d465952Smartinh struct btree_txn *txn; 1705d465952Smartinh struct page_stack stack; /* stack of parent pages */ 1715d465952Smartinh short initialized; /* 1 if initialized */ 1725d465952Smartinh short eof; /* 1 if end is reached */ 1735d465952Smartinh }; 1745d465952Smartinh 1755d465952Smartinh #define METAHASHLEN offsetof(struct bt_meta, hash) 17637ee45a1Smartinh #define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) 1775d465952Smartinh 1785d465952Smartinh struct node { 1795d465952Smartinh #define n_pgno p.np_pgno 1805d465952Smartinh #define n_dsize p.np_dsize 1815d465952Smartinh union { 1825d465952Smartinh pgno_t np_pgno; /* child page number */ 1835d465952Smartinh uint32_t np_dsize; /* leaf data size */ 1845d465952Smartinh } p; 1855d465952Smartinh uint16_t ksize; /* key size */ 1865d465952Smartinh #define F_BIGDATA 0x01 /* data put on overflow page */ 1875d465952Smartinh uint8_t flags; 188a33d639aSmartinh char data[1]; 189bf245c3bSmartinh } __packed; 1905d465952Smartinh 1915d465952Smartinh struct btree_txn { 1925d465952Smartinh pgno_t root; /* current / new root page */ 1935d465952Smartinh pgno_t next_pgno; /* next unallocated page */ 1945d465952Smartinh struct btree *bt; /* btree is ref'd */ 1955d465952Smartinh struct dirty_queue *dirty_queue; /* modified pages */ 1965d465952Smartinh #define BT_TXN_RDONLY 0x01 /* read-only transaction */ 1975d465952Smartinh #define BT_TXN_ERROR 0x02 /* an error has occurred */ 1985d465952Smartinh unsigned int flags; 1995d465952Smartinh }; 2005d465952Smartinh 2015d465952Smartinh struct btree { 2025d465952Smartinh int fd; 2035d465952Smartinh char *path; 2045d465952Smartinh #define BT_FIXPADDING 0x01 /* internal */ 2055d465952Smartinh unsigned int flags; 2065d465952Smartinh bt_cmp_func cmp; /* user compare function */ 20737ee45a1Smartinh struct bt_head head; 20837ee45a1Smartinh struct bt_meta meta; 2095d465952Smartinh struct page_cache *page_cache; 2105d465952Smartinh struct lru_queue *lru_queue; 2115d465952Smartinh struct btree_txn *txn; /* current write transaction */ 212e14f8e24Smartinh int ref; /* increased by cursors & txn */ 2135d465952Smartinh struct btree_stat stat; 2145d465952Smartinh off_t size; /* current file size */ 2155d465952Smartinh }; 2165d465952Smartinh 2175d465952Smartinh #define NODESIZE offsetof(struct node, data) 2185d465952Smartinh 2195d465952Smartinh #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->size)) 2205d465952Smartinh #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size) 2215d465952Smartinh #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i])) 22237ee45a1Smartinh #define NODEPTR(mp, i) NODEPTRP((mp)->page, i) 2235d465952Smartinh #define NODEKEY(node) (void *)((node)->data) 2245d465952Smartinh #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize) 2255d465952Smartinh #define NODEPGNO(node) ((node)->p.np_pgno) 2265d465952Smartinh #define NODEDSZ(node) ((node)->p.np_dsize) 2275d465952Smartinh 2285d465952Smartinh #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */ 2295d465952Smartinh #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ 2305d465952Smartinh 2315d465952Smartinh static int btree_read_page(struct btree *bt, pgno_t pgno, 2325d465952Smartinh struct page *page); 23337ee45a1Smartinh static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno); 2345d465952Smartinh static int btree_search_page_root(struct btree *bt, 2355d465952Smartinh struct mpage *root, struct btval *key, 2365d465952Smartinh struct cursor *cursor, int modify, 2375d465952Smartinh struct mpage **mpp); 2385d465952Smartinh static int btree_search_page(struct btree *bt, 2395d465952Smartinh struct btree_txn *txn, struct btval *key, 2405d465952Smartinh struct cursor *cursor, int modify, 2415d465952Smartinh struct mpage **mpp); 2425d465952Smartinh 24337ee45a1Smartinh static int btree_write_header(struct btree *bt, int fd); 24437ee45a1Smartinh static int btree_read_header(struct btree *bt); 2455d465952Smartinh static int btree_is_meta_page(struct page *p); 2465d465952Smartinh static int btree_read_meta(struct btree *bt, pgno_t *p_next); 247e14f8e24Smartinh static int btree_write_meta(struct btree *bt, pgno_t root, 248e14f8e24Smartinh unsigned int flags); 249408be415Smartinh static void btree_ref(struct btree *bt); 2505d465952Smartinh 2515d465952Smartinh static struct node *btree_search_node(struct btree *bt, struct mpage *mp, 2525d465952Smartinh struct btval *key, int *exactp, unsigned int *kip); 2535d465952Smartinh static int btree_add_node(struct btree *bt, struct mpage *mp, 2545d465952Smartinh indx_t indx, struct btval *key, struct btval *data, 2555d465952Smartinh pgno_t pgno, uint8_t flags); 2565d465952Smartinh static void btree_del_node(struct btree *bt, struct mpage *mp, 2575d465952Smartinh indx_t indx); 2585d465952Smartinh static int btree_read_data(struct btree *bt, struct mpage *mp, 2595d465952Smartinh struct node *leaf, struct btval *data); 2605d465952Smartinh 2615d465952Smartinh static int btree_rebalance(struct btree *bt, struct mpage *mp); 2625d465952Smartinh static int btree_update_key(struct btree *bt, struct mpage *mp, 2635d465952Smartinh indx_t indx, struct btval *key); 2645d465952Smartinh static int btree_adjust_prefix(struct btree *bt, 2655d465952Smartinh struct mpage *src, int delta); 2665d465952Smartinh static int btree_move_node(struct btree *bt, struct mpage *src, 2675d465952Smartinh indx_t srcindx, struct mpage *dst, indx_t dstindx); 2685d465952Smartinh static int btree_merge(struct btree *bt, struct mpage *src, 2695d465952Smartinh struct mpage *dst); 2705d465952Smartinh static int btree_split(struct btree *bt, struct mpage **mpp, 2715d465952Smartinh unsigned int *newindxp, struct btval *newkey, 2725d465952Smartinh struct btval *newdata, pgno_t newpgno); 2735d465952Smartinh static struct mpage *btree_new_page(struct btree *bt, uint32_t flags); 2745d465952Smartinh static int btree_write_overflow_data(struct btree *bt, 2755d465952Smartinh struct page *p, struct btval *data); 2765d465952Smartinh 2775d465952Smartinh static void cursor_pop_page(struct cursor *cursor); 2785d465952Smartinh static struct ppage *cursor_push_page(struct cursor *cursor, 2795d465952Smartinh struct mpage *mp); 2805d465952Smartinh 2815d465952Smartinh static int bt_set_key(struct btree *bt, struct mpage *mp, 2825d465952Smartinh struct node *node, struct btval *key); 2835d465952Smartinh static int btree_sibling(struct cursor *cursor, int move_right); 2845d465952Smartinh static int btree_cursor_next(struct cursor *cursor, 2855d465952Smartinh struct btval *key, struct btval *data); 2865d465952Smartinh static int btree_cursor_set(struct cursor *cursor, 2875d465952Smartinh struct btval *key, struct btval *data); 2885d465952Smartinh static int btree_cursor_first(struct cursor *cursor, 2895d465952Smartinh struct btval *key, struct btval *data); 2905d465952Smartinh 2915d465952Smartinh static void bt_reduce_separator(struct btree *bt, struct node *min, 2925d465952Smartinh struct btval *sep); 2935d465952Smartinh static void remove_prefix(struct btree *bt, struct btval *key, 2945d465952Smartinh size_t pfxlen); 2955d465952Smartinh static void expand_prefix(struct btree *bt, struct mpage *mp, 2965d465952Smartinh indx_t indx, struct btkey *expkey); 2975d465952Smartinh static void concat_prefix(struct btree *bt, char *s1, size_t n1, 2985d465952Smartinh char *s2, size_t n2, char *cs, size_t *cn); 2995d465952Smartinh static void common_prefix(struct btree *bt, struct btkey *min, 3005d465952Smartinh struct btkey *max, struct btkey *pfx); 3015d465952Smartinh static void find_common_prefix(struct btree *bt, struct mpage *mp); 3025d465952Smartinh 30337ee45a1Smartinh static size_t bt_leaf_size(struct btree *bt, struct btval *key, 30437ee45a1Smartinh struct btval *data); 30537ee45a1Smartinh static size_t bt_branch_size(struct btree *bt, struct btval *key); 3065d465952Smartinh 3075d465952Smartinh static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno, 30837ee45a1Smartinh struct btree *btc); 3095d465952Smartinh 3105d465952Smartinh static int memncmp(const void *s1, size_t n1, 3115d465952Smartinh const void *s2, size_t n2); 3125d465952Smartinh static int memnrcmp(const void *s1, size_t n1, 3135d465952Smartinh const void *s2, size_t n2); 3145d465952Smartinh 3155d465952Smartinh static int 3165d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2) 3175d465952Smartinh { 3185d465952Smartinh if (n1 < n2) { 3195d465952Smartinh if (memcmp(s1, s2, n1) == 0) 3205d465952Smartinh return -1; 3215d465952Smartinh } 3225d465952Smartinh else if (n1 > n2) { 3235d465952Smartinh if (memcmp(s1, s2, n2) == 0) 3245d465952Smartinh return 1; 3255d465952Smartinh } 3265d465952Smartinh return memcmp(s1, s2, n1); 3275d465952Smartinh } 3285d465952Smartinh 3295d465952Smartinh static int 3305d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) 3315d465952Smartinh { 3325d465952Smartinh const unsigned char *p1; 3335d465952Smartinh const unsigned char *p2; 3345d465952Smartinh 3355d465952Smartinh if (n1 == 0) 3365d465952Smartinh return n2 == 0 ? 0 : -1; 3375d465952Smartinh 3385d465952Smartinh if (n2 == 0) 3395d465952Smartinh return n1 == 0 ? 0 : 1; 3405d465952Smartinh 3415d465952Smartinh p1 = (const unsigned char *)s1 + n1 - 1; 3425d465952Smartinh p2 = (const unsigned char *)s2 + n2 - 1; 3435d465952Smartinh 3445d465952Smartinh while (*p1 == *p2) { 3455d465952Smartinh if (p1 == s1) 3465d465952Smartinh return (p2 == s2) ? 0 : -1; 3475d465952Smartinh if (p2 == s2) 3485d465952Smartinh return (p1 == p2) ? 0 : 1; 3495d465952Smartinh p1--; 3505d465952Smartinh p2--; 3515d465952Smartinh } 3525d465952Smartinh return *p1 - *p2; 3535d465952Smartinh } 3545d465952Smartinh 3555d465952Smartinh int 3565d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b) 3575d465952Smartinh { 3585d465952Smartinh return bt->cmp(a, b); 3595d465952Smartinh } 3605d465952Smartinh 3615d465952Smartinh static void 3625d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max, 3635d465952Smartinh struct btkey *pfx) 3645d465952Smartinh { 3655d465952Smartinh size_t n = 0; 3665d465952Smartinh char *p1; 3675d465952Smartinh char *p2; 3685d465952Smartinh 3695d465952Smartinh if (min->len == 0 || max->len == 0) { 3705d465952Smartinh pfx->len = 0; 3715d465952Smartinh return; 3725d465952Smartinh } 3735d465952Smartinh 3745d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 3755d465952Smartinh p1 = min->str + min->len - 1; 3765d465952Smartinh p2 = max->str + max->len - 1; 3775d465952Smartinh 3785d465952Smartinh while (*p1 == *p2) { 3795d465952Smartinh if (p1 < min->str || p2 < max->str) 3805d465952Smartinh break; 3815d465952Smartinh p1--; 3825d465952Smartinh p2--; 3835d465952Smartinh n++; 3845d465952Smartinh } 3855d465952Smartinh 3865d465952Smartinh assert(n <= (int)sizeof(pfx->str)); 3875d465952Smartinh pfx->len = n; 3885d465952Smartinh bcopy(p2 + 1, pfx->str, n); 3895d465952Smartinh } else { 3905d465952Smartinh p1 = min->str; 3915d465952Smartinh p2 = max->str; 3925d465952Smartinh 3935d465952Smartinh while (*p1 == *p2) { 3945d465952Smartinh if (n == min->len || n == max->len) 3955d465952Smartinh break; 3965d465952Smartinh p1++; 3975d465952Smartinh p2++; 3985d465952Smartinh n++; 3995d465952Smartinh } 4005d465952Smartinh 4015d465952Smartinh assert(n <= (int)sizeof(pfx->str)); 4025d465952Smartinh pfx->len = n; 4035d465952Smartinh bcopy(max->str, pfx->str, n); 4045d465952Smartinh } 4055d465952Smartinh } 4065d465952Smartinh 4075d465952Smartinh static void 4085d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen) 4095d465952Smartinh { 4105d465952Smartinh if (pfxlen == 0 || bt->cmp != NULL) 4115d465952Smartinh return; 4125d465952Smartinh 4135d465952Smartinh DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, 4145d465952Smartinh (int)key->size, (char *)key->data); 4155d465952Smartinh assert(pfxlen <= key->size); 4165d465952Smartinh key->size -= pfxlen; 4175d465952Smartinh if (!F_ISSET(bt->flags, BT_REVERSEKEY)) 4185d465952Smartinh key->data = (char *)key->data + pfxlen; 4195d465952Smartinh } 4205d465952Smartinh 4215d465952Smartinh static void 4225d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx, 4235d465952Smartinh struct btkey *expkey) 4245d465952Smartinh { 4255d465952Smartinh struct node *node; 4265d465952Smartinh 4275d465952Smartinh node = NODEPTR(mp, indx); 4285d465952Smartinh expkey->len = sizeof(expkey->str); 4295d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len, 4305d465952Smartinh NODEKEY(node), node->ksize, expkey->str, &expkey->len); 4315d465952Smartinh } 4325d465952Smartinh 4335d465952Smartinh static int 4345d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2, 4355d465952Smartinh struct btkey *pfx) 4365d465952Smartinh { 4375d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) 4385d465952Smartinh return memnrcmp(key1->data, key1->size - pfx->len, 4395d465952Smartinh key2->data, key2->size); 4405d465952Smartinh else 4415d465952Smartinh return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len, 4425d465952Smartinh key2->data, key2->size); 4435d465952Smartinh } 4445d465952Smartinh 4455d465952Smartinh void 4465d465952Smartinh btval_reset(struct btval *btv) 4475d465952Smartinh { 4485d465952Smartinh if (btv) { 4495d465952Smartinh if (btv->mp) 4505d465952Smartinh btv->mp->ref--; 4515d465952Smartinh if (btv->free_data) 4525d465952Smartinh free(btv->data); 4535d465952Smartinh bzero(btv, sizeof(*btv)); 4545d465952Smartinh } 4555d465952Smartinh } 4565d465952Smartinh 4575d465952Smartinh static int 4585d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b) 4595d465952Smartinh { 4605d465952Smartinh if (a->pgno > b->pgno) 4615d465952Smartinh return 1; 4625d465952Smartinh if (a->pgno < b->pgno) 4635d465952Smartinh return -1; 4645d465952Smartinh return 0; 4655d465952Smartinh } 4665d465952Smartinh 4675d465952Smartinh static struct mpage * 4685d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno) 4695d465952Smartinh { 4705d465952Smartinh struct mpage find, *mp; 4715d465952Smartinh 4725d465952Smartinh find.pgno = pgno; 4735d465952Smartinh mp = RB_FIND(page_cache, bt->page_cache, &find); 4745d465952Smartinh if (mp) { 4755d465952Smartinh bt->stat.hits++; 4765d465952Smartinh /* Update LRU queue. Move page to the end. */ 4775d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 4785d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 4795d465952Smartinh } 4805d465952Smartinh return mp; 4815d465952Smartinh } 4825d465952Smartinh 4835d465952Smartinh static void 4845d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp) 4855d465952Smartinh { 4865d465952Smartinh assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL); 4875d465952Smartinh bt->stat.cache_size++; 4885d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 48937ee45a1Smartinh mpage_prune(bt); 49037ee45a1Smartinh } 49137ee45a1Smartinh 49237ee45a1Smartinh static void 49337ee45a1Smartinh mpage_free(struct mpage *mp) 49437ee45a1Smartinh { 49537ee45a1Smartinh if (mp != NULL) { 49637ee45a1Smartinh free(mp->page); 49737ee45a1Smartinh free(mp); 49837ee45a1Smartinh } 4995d465952Smartinh } 5005d465952Smartinh 5015d465952Smartinh static void 5025d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp) 5035d465952Smartinh { 5045d465952Smartinh assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp); 5055d465952Smartinh assert(bt->stat.cache_size > 0); 5065d465952Smartinh bt->stat.cache_size--; 5075d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 5085d465952Smartinh } 5095d465952Smartinh 510e14f8e24Smartinh static void 511e14f8e24Smartinh mpage_flush(struct btree *bt) 512e14f8e24Smartinh { 513e14f8e24Smartinh struct mpage *mp; 514e14f8e24Smartinh 515e14f8e24Smartinh while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { 516e14f8e24Smartinh mpage_del(bt, mp); 51737ee45a1Smartinh mpage_free(mp); 518e14f8e24Smartinh } 519e14f8e24Smartinh } 520e14f8e24Smartinh 5215d465952Smartinh static struct mpage * 52237ee45a1Smartinh mpage_copy(struct btree *bt, struct mpage *mp) 5235d465952Smartinh { 5245d465952Smartinh struct mpage *copy; 5255d465952Smartinh 5265d465952Smartinh if ((copy = calloc(1, sizeof(*copy))) == NULL) 5275d465952Smartinh return NULL; 52837ee45a1Smartinh if ((copy->page = malloc(bt->head.psize)) == NULL) { 52937ee45a1Smartinh free(copy); 53037ee45a1Smartinh return NULL; 53137ee45a1Smartinh } 53237ee45a1Smartinh bcopy(mp->page, copy->page, bt->head.psize); 5335d465952Smartinh bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix)); 5345d465952Smartinh copy->parent = mp->parent; 5355d465952Smartinh copy->parent_index = mp->parent_index; 5365d465952Smartinh copy->pgno = mp->pgno; 5375d465952Smartinh 5385d465952Smartinh return copy; 5395d465952Smartinh } 5405d465952Smartinh 5415d465952Smartinh /* Remove the least recently used memory pages until the cache size is 5425d465952Smartinh * within the configured bounds. Pages referenced by cursors or returned 5435d465952Smartinh * key/data are not pruned. 5445d465952Smartinh */ 5455d465952Smartinh static void 5465d465952Smartinh mpage_prune(struct btree *bt) 5475d465952Smartinh { 5485d465952Smartinh struct mpage *mp, *next; 5495d465952Smartinh 5505d465952Smartinh for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) { 5515d465952Smartinh if (bt->stat.cache_size <= bt->stat.max_cache) 5525d465952Smartinh break; 5535d465952Smartinh next = TAILQ_NEXT(mp, lru_next); 5545d465952Smartinh if (!mp->dirty && mp->ref <= 0) { 5555d465952Smartinh mpage_del(bt, mp); 55637ee45a1Smartinh mpage_free(mp); 5575d465952Smartinh } 5585d465952Smartinh } 5595d465952Smartinh } 5605d465952Smartinh 5615d465952Smartinh /* Mark a page as dirty and push it on the dirty queue. 5625d465952Smartinh */ 5635d465952Smartinh static void 5645d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp) 5655d465952Smartinh { 5665d465952Smartinh assert(bt != NULL); 5675d465952Smartinh assert(bt->txn != NULL); 5685d465952Smartinh 5695d465952Smartinh if (!mp->dirty) { 5705d465952Smartinh mp->dirty = 1; 5715d465952Smartinh SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next); 5725d465952Smartinh } 5735d465952Smartinh } 5745d465952Smartinh 5755d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno. 5765d465952Smartinh */ 5775d465952Smartinh static void 5785d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp) 5795d465952Smartinh { 5805d465952Smartinh assert(bt != NULL); 5815d465952Smartinh assert(bt->txn != NULL); 5825d465952Smartinh assert(mp != NULL); 5835d465952Smartinh 5845d465952Smartinh if (!mp->dirty) { 5855d465952Smartinh DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno); 5865d465952Smartinh if (mp->ref == 0) 5875d465952Smartinh mpage_del(bt, mp); 5885d465952Smartinh else 58937ee45a1Smartinh mp = mpage_copy(bt, mp); 59037ee45a1Smartinh mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 5915d465952Smartinh mpage_dirty(bt, mp); 5925d465952Smartinh mpage_add(bt, mp); 5935d465952Smartinh 5945d465952Smartinh /* Update the page number to new touched page. */ 5955d465952Smartinh if (mp->parent != NULL) 5965d465952Smartinh NODEPGNO(NODEPTR(mp->parent, 5975d465952Smartinh mp->parent_index)) = mp->pgno; 5985d465952Smartinh } 5995d465952Smartinh } 6005d465952Smartinh 6015d465952Smartinh static int 6025d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page) 6035d465952Smartinh { 6045d465952Smartinh ssize_t rc; 6055d465952Smartinh 6065d465952Smartinh DPRINTF("reading page %u", pgno); 6075d465952Smartinh bt->stat.reads++; 60837ee45a1Smartinh if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) { 6095d465952Smartinh DPRINTF("page %u doesn't exist", pgno); 6100279e526Smartinh errno = ENOENT; 6110279e526Smartinh return BT_FAIL; 612*36e226d9Smartinh } else if (rc != (ssize_t)bt->head.psize) { 61337ee45a1Smartinh if (rc > 0) 61437ee45a1Smartinh errno = EINVAL; 6155d465952Smartinh DPRINTF("read: %s", strerror(errno)); 6165d465952Smartinh return BT_FAIL; 6175d465952Smartinh } 6185d465952Smartinh 6195d465952Smartinh if (page->pgno != pgno) { 6205d465952Smartinh DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno); 6215d465952Smartinh errno = EINVAL; 6225d465952Smartinh return BT_FAIL; 6235d465952Smartinh } 6245d465952Smartinh 62537ee45a1Smartinh DPRINTF("page %u has flags 0x%X", pgno, page->flags); 62637ee45a1Smartinh 6275d465952Smartinh return BT_SUCCESS; 6285d465952Smartinh } 6295d465952Smartinh 6305d465952Smartinh int 6315d465952Smartinh btree_sync(struct btree *bt) 6325d465952Smartinh { 6335d465952Smartinh if (!F_ISSET(bt->flags, BT_NOSYNC)) 6345d465952Smartinh return fsync(bt->fd); 6355d465952Smartinh return 0; 6365d465952Smartinh } 6375d465952Smartinh 6385d465952Smartinh struct btree_txn * 6395d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly) 6405d465952Smartinh { 6415d465952Smartinh struct btree_txn *txn; 6425d465952Smartinh 6435d465952Smartinh if (!rdonly && bt->txn != NULL) { 6445d465952Smartinh DPRINTF("write transaction already begun"); 6450279e526Smartinh errno = EBUSY; 6465d465952Smartinh return NULL; 6475d465952Smartinh } 6485d465952Smartinh 6495d465952Smartinh if ((txn = calloc(1, sizeof(*txn))) == NULL) { 6505d465952Smartinh DPRINTF("calloc: %s", strerror(errno)); 6515d465952Smartinh return NULL; 6525d465952Smartinh } 6535d465952Smartinh 6545d465952Smartinh if (rdonly) { 6555d465952Smartinh txn->flags |= BT_TXN_RDONLY; 6565d465952Smartinh } else { 6575d465952Smartinh txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue)); 6585d465952Smartinh if (txn->dirty_queue == NULL) { 6595d465952Smartinh free(txn); 6605d465952Smartinh return NULL; 6615d465952Smartinh } 6625d465952Smartinh SIMPLEQ_INIT(txn->dirty_queue); 6635d465952Smartinh 6645d465952Smartinh DPRINTF("taking write lock on txn %p", txn); 6655d465952Smartinh if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) { 6660279e526Smartinh DPRINTF("flock: %s", strerror(errno)); 6670279e526Smartinh errno = EBUSY; 6685d465952Smartinh free(txn->dirty_queue); 6695d465952Smartinh free(txn); 6705d465952Smartinh return NULL; 6715d465952Smartinh } 6725d465952Smartinh bt->txn = txn; 6735d465952Smartinh } 6745d465952Smartinh 6755d465952Smartinh txn->bt = bt; 676408be415Smartinh btree_ref(bt); 6775d465952Smartinh 6780279e526Smartinh if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) { 6795d465952Smartinh btree_txn_abort(txn); 6805d465952Smartinh return NULL; 6815d465952Smartinh } 6825d465952Smartinh 68337ee45a1Smartinh txn->root = bt->meta.root; 6845d465952Smartinh DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root); 6855d465952Smartinh 6865d465952Smartinh return txn; 6875d465952Smartinh } 6885d465952Smartinh 6895d465952Smartinh void 6905d465952Smartinh btree_txn_abort(struct btree_txn *txn) 6915d465952Smartinh { 6925d465952Smartinh struct mpage *mp; 6935d465952Smartinh struct btree *bt; 6945d465952Smartinh 6955d465952Smartinh if (txn == NULL) 6965d465952Smartinh return; 6975d465952Smartinh 6985d465952Smartinh bt = txn->bt; 6995d465952Smartinh DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root); 7005d465952Smartinh 701f189d82aSmartinh if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) { 7025d465952Smartinh /* Discard all dirty pages. 7035d465952Smartinh */ 7045d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 7055d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue); 7065d465952Smartinh assert(mp->ref == 0); /* cursors should be closed */ 7075d465952Smartinh mpage_del(bt, mp); 7085d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 7095d465952Smartinh } 7105d465952Smartinh 7115d465952Smartinh DPRINTF("releasing write lock on txn %p", txn); 7125d465952Smartinh txn->bt->txn = NULL; 7135d465952Smartinh if (flock(txn->bt->fd, LOCK_UN) != 0) { 7145d465952Smartinh DPRINTF("failed to unlock fd %d: %s", 7155d465952Smartinh txn->bt->fd, strerror(errno)); 7165d465952Smartinh } 7175d465952Smartinh free(txn->dirty_queue); 7185d465952Smartinh } 7195d465952Smartinh 7205d465952Smartinh btree_close(txn->bt); 7215d465952Smartinh free(txn); 7225d465952Smartinh } 7235d465952Smartinh 7245d465952Smartinh int 7255d465952Smartinh btree_txn_commit(struct btree_txn *txn) 7265d465952Smartinh { 7275d465952Smartinh int n, done; 7285d465952Smartinh ssize_t rc; 7295d465952Smartinh off_t size; 7305d465952Smartinh struct mpage *mp; 7315d465952Smartinh struct btree *bt; 7325d465952Smartinh struct iovec iov[BT_COMMIT_PAGES]; 7335d465952Smartinh 7345d465952Smartinh assert(txn != NULL); 7355d465952Smartinh assert(txn->bt != NULL); 7365d465952Smartinh 7375d465952Smartinh bt = txn->bt; 7385d465952Smartinh 7395d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_RDONLY)) { 7405d465952Smartinh DPRINTF("attempt to commit read-only transaction"); 7415d465952Smartinh btree_txn_abort(txn); 7420279e526Smartinh errno = EPERM; 7435d465952Smartinh return BT_FAIL; 7445d465952Smartinh } 7455d465952Smartinh 7465d465952Smartinh if (txn != bt->txn) { 7475d465952Smartinh DPRINTF("attempt to commit unknown transaction"); 7485d465952Smartinh btree_txn_abort(txn); 7490279e526Smartinh errno = EINVAL; 7505d465952Smartinh return BT_FAIL; 7515d465952Smartinh } 7525d465952Smartinh 7535d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 7545d465952Smartinh DPRINTF("error flag is set, can't commit"); 7555d465952Smartinh btree_txn_abort(txn); 7560279e526Smartinh errno = EINVAL; 7575d465952Smartinh return BT_FAIL; 7585d465952Smartinh } 7595d465952Smartinh 7605d465952Smartinh if (SIMPLEQ_EMPTY(txn->dirty_queue)) 7615d465952Smartinh goto done; 7625d465952Smartinh 7635d465952Smartinh if (F_ISSET(bt->flags, BT_FIXPADDING)) { 7645d465952Smartinh size = lseek(bt->fd, 0, SEEK_END); 76537ee45a1Smartinh size += bt->head.psize - (size % bt->head.psize); 7665d465952Smartinh DPRINTF("extending to multiple of page size: %llu", size); 7675d465952Smartinh if (ftruncate(bt->fd, size) != 0) { 7685d465952Smartinh DPRINTF("ftruncate: %s", strerror(errno)); 7695d465952Smartinh btree_txn_abort(txn); 7705d465952Smartinh return BT_FAIL; 7715d465952Smartinh } 7725d465952Smartinh bt->flags &= ~BT_FIXPADDING; 7735d465952Smartinh } 7745d465952Smartinh 7755d465952Smartinh DPRINTF("committing transaction on btree %p, root page %u", 7765d465952Smartinh bt, txn->root); 7775d465952Smartinh 7785d465952Smartinh /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done. 7795d465952Smartinh */ 7805d465952Smartinh do { 7815d465952Smartinh n = 0; 7825d465952Smartinh done = 1; 7835d465952Smartinh SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) { 7845d465952Smartinh DPRINTF("commiting page %u", mp->pgno); 78537ee45a1Smartinh iov[n].iov_len = bt->head.psize; 78637ee45a1Smartinh iov[n].iov_base = mp->page; 7875d465952Smartinh if (++n >= BT_COMMIT_PAGES) { 7885d465952Smartinh done = 0; 7895d465952Smartinh break; 7905d465952Smartinh } 7915d465952Smartinh } 7925d465952Smartinh 7935d465952Smartinh if (n == 0) 7945d465952Smartinh break; 7955d465952Smartinh 7965d465952Smartinh DPRINTF("commiting %u dirty pages", n); 7975d465952Smartinh rc = writev(bt->fd, iov, n); 798*36e226d9Smartinh if (rc != (ssize_t)bt->head.psize*n) { 7995d465952Smartinh if (rc > 0) 8005d465952Smartinh DPRINTF("short write, filesystem full?"); 8015d465952Smartinh else 8025d465952Smartinh DPRINTF("writev: %s", strerror(errno)); 8035d465952Smartinh btree_txn_abort(txn); 8045d465952Smartinh return BT_FAIL; 8055d465952Smartinh } 8065d465952Smartinh 8075d465952Smartinh /* Remove the dirty flag from the written pages. 8085d465952Smartinh */ 8095d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 8105d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue); 8115d465952Smartinh mp->dirty = 0; 8125d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 8135d465952Smartinh if (--n == 0) 8145d465952Smartinh break; 8155d465952Smartinh } 8165d465952Smartinh } while (!done); 8175d465952Smartinh 8185d465952Smartinh if (btree_sync(bt) != 0 || 819e14f8e24Smartinh btree_write_meta(bt, txn->root, 0) != BT_SUCCESS || 8205d465952Smartinh btree_sync(bt) != 0) { 8215d465952Smartinh btree_txn_abort(txn); 8225d465952Smartinh return BT_FAIL; 8235d465952Smartinh } 8245d465952Smartinh 8255d465952Smartinh done: 8265d465952Smartinh mpage_prune(bt); 8275d465952Smartinh btree_txn_abort(txn); 8285d465952Smartinh 8295d465952Smartinh return BT_SUCCESS; 8305d465952Smartinh } 8315d465952Smartinh 8325d465952Smartinh static int 83337ee45a1Smartinh btree_write_header(struct btree *bt, int fd) 83437ee45a1Smartinh { 83537ee45a1Smartinh struct stat sb; 83637ee45a1Smartinh struct bt_head *h; 83737ee45a1Smartinh struct page *p; 83837ee45a1Smartinh ssize_t rc; 83937ee45a1Smartinh unsigned int psize; 84037ee45a1Smartinh 84137ee45a1Smartinh DPRINTF("writing header page"); 84237ee45a1Smartinh assert(bt != NULL); 84337ee45a1Smartinh 84437ee45a1Smartinh /* Ask stat for 'optimal blocksize for I/O'. 84537ee45a1Smartinh */ 84637ee45a1Smartinh if (fstat(fd, &sb) == 0) 84737ee45a1Smartinh psize = sb.st_blksize; 84837ee45a1Smartinh else 84937ee45a1Smartinh psize = PAGESIZE; 85037ee45a1Smartinh 85137ee45a1Smartinh if ((p = calloc(1, psize)) == NULL) 85237ee45a1Smartinh return -1; 85337ee45a1Smartinh p->flags = P_HEAD; 85437ee45a1Smartinh 85537ee45a1Smartinh h = METADATA(p); 85637ee45a1Smartinh h->magic = BT_MAGIC; 85737ee45a1Smartinh h->version = BT_VERSION; 85837ee45a1Smartinh h->psize = psize; 85937ee45a1Smartinh bcopy(h, &bt->head, sizeof(*h)); 86037ee45a1Smartinh 86137ee45a1Smartinh rc = write(fd, p, bt->head.psize); 86237ee45a1Smartinh free(p); 863*36e226d9Smartinh if (rc != (ssize_t)bt->head.psize) { 86437ee45a1Smartinh if (rc > 0) 86537ee45a1Smartinh DPRINTF("short write, filesystem full?"); 86637ee45a1Smartinh return BT_FAIL; 86737ee45a1Smartinh } 86837ee45a1Smartinh 86937ee45a1Smartinh return BT_SUCCESS; 87037ee45a1Smartinh } 87137ee45a1Smartinh 87237ee45a1Smartinh static int 87337ee45a1Smartinh btree_read_header(struct btree *bt) 87437ee45a1Smartinh { 87537ee45a1Smartinh char page[PAGESIZE]; 87637ee45a1Smartinh struct page *p; 87737ee45a1Smartinh struct bt_head *h; 87837ee45a1Smartinh int rc; 87937ee45a1Smartinh 88037ee45a1Smartinh assert(bt != NULL); 88137ee45a1Smartinh 88237ee45a1Smartinh /* We don't know the page size yet, so use a minimum value. 88337ee45a1Smartinh */ 88437ee45a1Smartinh 88537ee45a1Smartinh if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) { 88637ee45a1Smartinh errno = ENOENT; 88737ee45a1Smartinh return -1; 88837ee45a1Smartinh } else if (rc != PAGESIZE) { 88937ee45a1Smartinh if (rc > 0) 89037ee45a1Smartinh errno = EINVAL; 89137ee45a1Smartinh DPRINTF("read: %s", strerror(errno)); 89237ee45a1Smartinh return -1; 89337ee45a1Smartinh } 89437ee45a1Smartinh 89537ee45a1Smartinh p = (struct page *)page; 89637ee45a1Smartinh 89737ee45a1Smartinh if (!F_ISSET(p->flags, P_HEAD)) { 89837ee45a1Smartinh DPRINTF("page %d not a header page", p->pgno); 89937ee45a1Smartinh errno = EINVAL; 90037ee45a1Smartinh return -1; 90137ee45a1Smartinh } 90237ee45a1Smartinh 90337ee45a1Smartinh h = METADATA(p); 90437ee45a1Smartinh if (h->magic != BT_MAGIC) { 90537ee45a1Smartinh DPRINTF("header has invalid magic"); 90637ee45a1Smartinh errno = EINVAL; 90737ee45a1Smartinh return -1; 90837ee45a1Smartinh } 90937ee45a1Smartinh 91037ee45a1Smartinh if (h->version != BT_VERSION) { 91137ee45a1Smartinh DPRINTF("database is version %u, expected version %u", 91237ee45a1Smartinh bt->head.version, BT_VERSION); 91337ee45a1Smartinh errno = EINVAL; 91437ee45a1Smartinh return -1; 91537ee45a1Smartinh } 91637ee45a1Smartinh 91737ee45a1Smartinh bcopy(h, &bt->head, sizeof(*h)); 91837ee45a1Smartinh return 0; 91937ee45a1Smartinh } 92037ee45a1Smartinh 92137ee45a1Smartinh static int 922e14f8e24Smartinh btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags) 9235d465952Smartinh { 92437ee45a1Smartinh struct mpage *mp; 92537ee45a1Smartinh struct bt_meta *meta; 9265d465952Smartinh ssize_t rc; 9275d465952Smartinh 9285d465952Smartinh DPRINTF("writing meta page for root page %u", root); 9295d465952Smartinh 9305d465952Smartinh assert(bt != NULL); 9315d465952Smartinh assert(bt->txn != NULL); 9325d465952Smartinh 93337ee45a1Smartinh if ((mp = btree_new_page(bt, P_META)) == NULL) 93437ee45a1Smartinh return -1; 9355d465952Smartinh 93637ee45a1Smartinh bt->meta.prev_meta = bt->meta.root; 93737ee45a1Smartinh bt->meta.root = root; 93837ee45a1Smartinh bt->meta.flags = flags; 93937ee45a1Smartinh bt->meta.created_at = time(0); 94037ee45a1Smartinh bt->meta.revisions++; 94137ee45a1Smartinh SHA1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash); 9425d465952Smartinh 94337ee45a1Smartinh /* Copy the meta data changes to the new meta page. */ 94437ee45a1Smartinh meta = METADATA(mp->page); 94537ee45a1Smartinh bcopy(&bt->meta, meta, sizeof(*meta)); 94637ee45a1Smartinh 947*36e226d9Smartinh rc = write(bt->fd, mp->page, bt->head.psize); 948*36e226d9Smartinh if (rc != (ssize_t)bt->head.psize) { 9495d465952Smartinh if (rc > 0) 9505d465952Smartinh DPRINTF("short write, filesystem full?"); 9515d465952Smartinh return BT_FAIL; 9525d465952Smartinh } 9535d465952Smartinh 9545d465952Smartinh if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) { 9555d465952Smartinh DPRINTF("failed to update file size: %s", strerror(errno)); 9565d465952Smartinh bt->size = 0; 9575d465952Smartinh } 9585d465952Smartinh 9595d465952Smartinh return BT_SUCCESS; 9605d465952Smartinh } 9615d465952Smartinh 9625d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise. 9635d465952Smartinh */ 9645d465952Smartinh static int 9655d465952Smartinh btree_is_meta_page(struct page *p) 9665d465952Smartinh { 9675d465952Smartinh struct bt_meta *m; 9685d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH]; 9695d465952Smartinh 9705d465952Smartinh m = METADATA(p); 9715d465952Smartinh if (!F_ISSET(p->flags, P_META)) { 9725d465952Smartinh DPRINTF("page %d not a meta page", p->pgno); 97337ee45a1Smartinh errno = EINVAL; 9745d465952Smartinh return 0; 9755d465952Smartinh } 9765d465952Smartinh 9775d465952Smartinh if (m->root >= p->pgno && m->root != P_INVALID) { 9785d465952Smartinh DPRINTF("page %d points to an invalid root page", p->pgno); 97937ee45a1Smartinh errno = EINVAL; 9805d465952Smartinh return 0; 9815d465952Smartinh } 9825d465952Smartinh 9835d465952Smartinh SHA1((unsigned char *)m, METAHASHLEN, hash); 9845d465952Smartinh if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) { 9855d465952Smartinh DPRINTF("page %d has an invalid digest", p->pgno); 98637ee45a1Smartinh errno = EINVAL; 9875d465952Smartinh return 0; 9885d465952Smartinh } 9895d465952Smartinh 9905d465952Smartinh return 1; 9915d465952Smartinh } 9925d465952Smartinh 9935d465952Smartinh static int 9945d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next) 9955d465952Smartinh { 99637ee45a1Smartinh struct mpage *mp; 99737ee45a1Smartinh struct bt_meta *meta; 9985d465952Smartinh pgno_t meta_pgno, next_pgno; 9995d465952Smartinh off_t size; 10005d465952Smartinh 10015d465952Smartinh assert(bt != NULL); 10025d465952Smartinh 10035d465952Smartinh if ((size = lseek(bt->fd, 0, SEEK_END)) == -1) 10045d465952Smartinh goto fail; 10055d465952Smartinh 10065d465952Smartinh DPRINTF("btree_read_meta: size = %llu", size); 10075d465952Smartinh 10085d465952Smartinh if (size < bt->size) { 10095d465952Smartinh DPRINTF("file has shrunk!"); 10105d465952Smartinh errno = EIO; 10115d465952Smartinh goto fail; 10125d465952Smartinh } 10135d465952Smartinh 101437ee45a1Smartinh if (size == bt->head.psize) { /* there is only the header */ 10155d465952Smartinh if (p_next != NULL) 101637ee45a1Smartinh *p_next = 1; 10170279e526Smartinh return BT_SUCCESS; /* new file */ 10185d465952Smartinh } 10195d465952Smartinh 102037ee45a1Smartinh next_pgno = size / bt->head.psize; 10215d465952Smartinh if (next_pgno == 0) { 10225d465952Smartinh DPRINTF("corrupt file"); 10235d465952Smartinh errno = EIO; 10245d465952Smartinh goto fail; 10255d465952Smartinh } 10265d465952Smartinh 10275d465952Smartinh meta_pgno = next_pgno - 1; 10285d465952Smartinh 102937ee45a1Smartinh if (size % bt->head.psize != 0) { 10305d465952Smartinh DPRINTF("filesize not a multiple of the page size!"); 10315d465952Smartinh bt->flags |= BT_FIXPADDING; 10325d465952Smartinh next_pgno++; 10335d465952Smartinh } 10345d465952Smartinh 10355d465952Smartinh if (p_next != NULL) 10365d465952Smartinh *p_next = next_pgno; 10375d465952Smartinh 10385d465952Smartinh if (size == bt->size) { 10395d465952Smartinh DPRINTF("size unchanged, keeping current meta page"); 104037ee45a1Smartinh if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) { 104162436105Smartinh DPRINTF("file is dead"); 10420279e526Smartinh errno = ESTALE; 10430279e526Smartinh return BT_FAIL; 104462436105Smartinh } else 10455d465952Smartinh return BT_SUCCESS; 10465d465952Smartinh } 10475d465952Smartinh bt->size = size; 10485d465952Smartinh 10495d465952Smartinh while (meta_pgno > 0) { 105037ee45a1Smartinh if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL) 105137ee45a1Smartinh break; 105237ee45a1Smartinh if (btree_is_meta_page(mp->page)) { 105337ee45a1Smartinh meta = METADATA(mp->page); 105437ee45a1Smartinh DPRINTF("flags = 0x%x", meta->flags); 105537ee45a1Smartinh if (F_ISSET(meta->flags, BT_TOMBSTONE)) { 1056e14f8e24Smartinh DPRINTF("file is dead"); 10570279e526Smartinh errno = ESTALE; 10580279e526Smartinh return BT_FAIL; 105937ee45a1Smartinh } else { 106037ee45a1Smartinh /* Make copy of last meta page. */ 106137ee45a1Smartinh bcopy(meta, &bt->meta, sizeof(bt->meta)); 10625d465952Smartinh return BT_SUCCESS; 10635d465952Smartinh } 106437ee45a1Smartinh } 10655d465952Smartinh --meta_pgno; /* scan backwards to first valid meta page */ 10665d465952Smartinh } 10675d465952Smartinh 10685d465952Smartinh errno = EIO; 10695d465952Smartinh fail: 10705d465952Smartinh if (p_next != NULL) 10715d465952Smartinh *p_next = P_INVALID; 10725d465952Smartinh return BT_FAIL; 10735d465952Smartinh } 10745d465952Smartinh 10755d465952Smartinh struct btree * 1076176c8cbcSmartinh btree_open_fd(int fd, unsigned int flags) 10775d465952Smartinh { 10785d465952Smartinh struct btree *bt; 107937ee45a1Smartinh int fl; 10805d465952Smartinh 10815d465952Smartinh fl = fcntl(fd, F_GETFL, 0); 10825d465952Smartinh if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1) 10835d465952Smartinh return NULL; 10845d465952Smartinh 10855d465952Smartinh if ((bt = calloc(1, sizeof(*bt))) == NULL) 10865d465952Smartinh return NULL; 10875d465952Smartinh bt->fd = fd; 10885d465952Smartinh bt->flags = flags; 10895d465952Smartinh bt->flags &= ~BT_FIXPADDING; 1090408be415Smartinh bt->ref = 1; 109137ee45a1Smartinh bt->meta.root = P_INVALID; 10925d465952Smartinh 10935d465952Smartinh if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL) 10945d465952Smartinh goto fail; 10955d465952Smartinh bt->stat.max_cache = BT_MAXCACHE_DEF; 10965d465952Smartinh RB_INIT(bt->page_cache); 10975d465952Smartinh 10985d465952Smartinh if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL) 10995d465952Smartinh goto fail; 11005d465952Smartinh TAILQ_INIT(bt->lru_queue); 11015d465952Smartinh 110237ee45a1Smartinh if (btree_read_header(bt) != 0) { 110337ee45a1Smartinh if (errno != ENOENT) 11045d465952Smartinh goto fail; 110537ee45a1Smartinh DPRINTF("new database"); 110637ee45a1Smartinh btree_write_header(bt, bt->fd); 110737ee45a1Smartinh } 11085d465952Smartinh 11090279e526Smartinh if (btree_read_meta(bt, NULL) != 0) 11105d465952Smartinh goto fail; 11115d465952Smartinh 11125d465952Smartinh DPRINTF("opened database version %u, pagesize %u", 111337ee45a1Smartinh bt->head.version, bt->head.psize); 111437ee45a1Smartinh DPRINTF("timestamp: %s", ctime(&bt->meta.created_at)); 111537ee45a1Smartinh DPRINTF("depth: %u", bt->meta.depth); 111637ee45a1Smartinh DPRINTF("entries: %llu", bt->meta.entries); 111737ee45a1Smartinh DPRINTF("revisions: %u", bt->meta.revisions); 111837ee45a1Smartinh DPRINTF("branch pages: %u", bt->meta.branch_pages); 111937ee45a1Smartinh DPRINTF("leaf pages: %u", bt->meta.leaf_pages); 112037ee45a1Smartinh DPRINTF("overflow pages: %u", bt->meta.overflow_pages); 112137ee45a1Smartinh DPRINTF("root: %u", bt->meta.root); 112237ee45a1Smartinh DPRINTF("previous meta page: %u", bt->meta.prev_meta); 11235d465952Smartinh 11245d465952Smartinh return bt; 11255d465952Smartinh 11265d465952Smartinh fail: 11275d465952Smartinh free(bt->lru_queue); 11285d465952Smartinh free(bt->page_cache); 11295d465952Smartinh free(bt); 11305d465952Smartinh return NULL; 11315d465952Smartinh } 11325d465952Smartinh 11335d465952Smartinh struct btree * 1134176c8cbcSmartinh btree_open(const char *path, unsigned int flags, mode_t mode) 11355d465952Smartinh { 11365d465952Smartinh int fd, oflags; 11375d465952Smartinh struct btree *bt; 11385d465952Smartinh 11395d465952Smartinh if (F_ISSET(flags, BT_RDONLY)) 11405d465952Smartinh oflags = O_RDONLY; 11415d465952Smartinh else 11425d465952Smartinh oflags = O_RDWR | O_CREAT | O_APPEND; 11435d465952Smartinh 11445d465952Smartinh if ((fd = open(path, oflags, mode)) == -1) 11455d465952Smartinh return NULL; 11465d465952Smartinh 11475d465952Smartinh if ((bt = btree_open_fd(fd, flags)) == NULL) 11485d465952Smartinh close(fd); 11495d465952Smartinh else { 11505d465952Smartinh bt->path = strdup(path); 11515d465952Smartinh DPRINTF("opened btree %p", bt); 11525d465952Smartinh } 11535d465952Smartinh 11545d465952Smartinh return bt; 11555d465952Smartinh } 11565d465952Smartinh 1157408be415Smartinh static void 1158408be415Smartinh btree_ref(struct btree *bt) 1159408be415Smartinh { 1160408be415Smartinh bt->ref++; 1161408be415Smartinh DPRINTF("ref is now %d on btree %p", bt->ref, bt); 1162408be415Smartinh } 1163408be415Smartinh 11645d465952Smartinh void 11655d465952Smartinh btree_close(struct btree *bt) 11665d465952Smartinh { 1167e14f8e24Smartinh if (bt == NULL) 1168e14f8e24Smartinh return; 11695d465952Smartinh 1170e14f8e24Smartinh if (--bt->ref == 0) { 11715d465952Smartinh DPRINTF("ref is zero, closing btree %p", bt); 11725d465952Smartinh close(bt->fd); 1173e14f8e24Smartinh mpage_flush(bt); 11745d465952Smartinh free(bt->page_cache); 11755d465952Smartinh free(bt); 1176e14f8e24Smartinh } else 1177408be415Smartinh DPRINTF("ref is now %d on btree %p", bt->ref, bt); 11785d465952Smartinh } 11795d465952Smartinh 11805d465952Smartinh /* Search for key within a leaf page, using binary search. 11815d465952Smartinh * Returns the smallest entry larger or equal to the key. 11825d465952Smartinh * If exactp is non-null, stores whether the found entry was an exact match 11835d465952Smartinh * in *exactp (1 or 0). 11845d465952Smartinh * If kip is non-null, stores the index of the found entry in *kip. 11855d465952Smartinh * If no entry larger of equal to the key is found, returns NULL. 11865d465952Smartinh */ 11875d465952Smartinh static struct node * 11885d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key, 11895d465952Smartinh int *exactp, unsigned int *kip) 11905d465952Smartinh { 11915d465952Smartinh unsigned int i = 0; 11925d465952Smartinh int low, high; 11935d465952Smartinh int rc = 0; 11945d465952Smartinh struct node *node; 11955d465952Smartinh struct btval nodekey; 11965d465952Smartinh 11975d465952Smartinh DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]", 11985d465952Smartinh NUMKEYS(mp), 11995d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 12005d465952Smartinh mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str); 12015d465952Smartinh 12025d465952Smartinh assert(NUMKEYS(mp) > 0); 12035d465952Smartinh 12045d465952Smartinh bzero(&nodekey, sizeof(nodekey)); 12055d465952Smartinh 12065d465952Smartinh low = IS_LEAF(mp) ? 0 : 1; 12075d465952Smartinh high = NUMKEYS(mp) - 1; 12085d465952Smartinh while (low <= high) { 12095d465952Smartinh i = (low + high) >> 1; 12105d465952Smartinh node = NODEPTR(mp, i); 12115d465952Smartinh 12125d465952Smartinh nodekey.size = node->ksize; 12135d465952Smartinh nodekey.data = NODEKEY(node); 12145d465952Smartinh 12155d465952Smartinh if (bt->cmp) 12165d465952Smartinh rc = bt->cmp(key, &nodekey); 12175d465952Smartinh else 12185d465952Smartinh rc = bt_cmp(bt, key, &nodekey, &mp->prefix); 12195d465952Smartinh 12205d465952Smartinh if (IS_LEAF(mp)) 12215d465952Smartinh DPRINTF("found leaf index %u [%.*s], rc = %i", 12225d465952Smartinh i, (int)nodekey.size, (char *)nodekey.data, rc); 12235d465952Smartinh else 12245d465952Smartinh DPRINTF("found branch index %u [%.*s -> %u], rc = %i", 12255d465952Smartinh i, (int)node->ksize, (char *)NODEKEY(node), 12265d465952Smartinh node->n_pgno, rc); 12275d465952Smartinh 12285d465952Smartinh if (rc == 0) 12295d465952Smartinh break; 12305d465952Smartinh if (rc > 0) 12315d465952Smartinh low = i + 1; 12325d465952Smartinh else 12335d465952Smartinh high = i - 1; 12345d465952Smartinh } 12355d465952Smartinh 12365d465952Smartinh if (rc > 0) { /* Found entry is less than the key. */ 12375d465952Smartinh i++; /* Skip to get the smallest entry larger than key. */ 12385d465952Smartinh if (i >= NUMKEYS(mp)) 12395d465952Smartinh /* There is no entry larger or equal to the key. */ 12405d465952Smartinh return NULL; 12415d465952Smartinh } 12425d465952Smartinh if (exactp) 12435d465952Smartinh *exactp = (rc == 0); 12445d465952Smartinh if (kip) /* Store the key index if requested. */ 12455d465952Smartinh *kip = i; 12465d465952Smartinh 12475d465952Smartinh return NODEPTR(mp, i); 12485d465952Smartinh } 12495d465952Smartinh 12505d465952Smartinh static void 12515d465952Smartinh cursor_pop_page(struct cursor *cursor) 12525d465952Smartinh { 12535d465952Smartinh struct ppage *top; 12545d465952Smartinh 12555d465952Smartinh top = CURSOR_TOP(cursor); 12565d465952Smartinh CURSOR_POP(cursor); 12575d465952Smartinh top->mpage->ref--; 12585d465952Smartinh 12595d465952Smartinh DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor); 12605d465952Smartinh 12615d465952Smartinh free(top); 12625d465952Smartinh } 12635d465952Smartinh 12645d465952Smartinh static struct ppage * 12655d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp) 12665d465952Smartinh { 12675d465952Smartinh struct ppage *ppage; 12685d465952Smartinh 12695d465952Smartinh DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor); 12705d465952Smartinh 12715d465952Smartinh if ((ppage = calloc(1, sizeof(*ppage))) == NULL) 12725d465952Smartinh return NULL; 12735d465952Smartinh ppage->mpage = mp; 12745d465952Smartinh mp->ref++; 12755d465952Smartinh CURSOR_PUSH(cursor, ppage); 12765d465952Smartinh return ppage; 12775d465952Smartinh } 12785d465952Smartinh 127937ee45a1Smartinh static struct mpage * 128037ee45a1Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno) 12815d465952Smartinh { 12825d465952Smartinh int rc; 12835d465952Smartinh struct mpage *mp; 12845d465952Smartinh 12855d465952Smartinh mp = mpage_lookup(bt, pgno); 12865d465952Smartinh if (mp == NULL) { 12875d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL) 128837ee45a1Smartinh return NULL; 128937ee45a1Smartinh if ((mp->page = malloc(bt->head.psize)) == NULL) { 12905d465952Smartinh free(mp); 129137ee45a1Smartinh return NULL; 129237ee45a1Smartinh } 129337ee45a1Smartinh if ((rc = btree_read_page(bt, pgno, mp->page)) != BT_SUCCESS) { 129437ee45a1Smartinh mpage_free(mp); 129537ee45a1Smartinh return NULL; 12965d465952Smartinh } 12975d465952Smartinh mp->pgno = pgno; 12985d465952Smartinh mpage_add(bt, mp); 12995d465952Smartinh } else 13005d465952Smartinh DPRINTF("returning page %u from cache", pgno); 13015d465952Smartinh 130237ee45a1Smartinh return mp; 13035d465952Smartinh } 13045d465952Smartinh 13055d465952Smartinh static void 13065d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2, 13075d465952Smartinh char *cs, size_t *cn) 13085d465952Smartinh { 13095d465952Smartinh assert(*cn >= n1 + n2); 13105d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 13115d465952Smartinh bcopy(s2, cs, n2); 13125d465952Smartinh bcopy(s1, cs + n2, n1); 13135d465952Smartinh } else { 13145d465952Smartinh bcopy(s1, cs, n1); 13155d465952Smartinh bcopy(s2, cs + n1, n2); 13165d465952Smartinh } 13175d465952Smartinh *cn = n1 + n2; 13185d465952Smartinh } 13195d465952Smartinh 13205d465952Smartinh static void 13215d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp) 13225d465952Smartinh { 13235d465952Smartinh indx_t lbound = 0, ubound = 0; 13245d465952Smartinh struct mpage *lp, *up; 13255d465952Smartinh struct btkey lprefix, uprefix; 13265d465952Smartinh 13275d465952Smartinh mp->prefix.len = 0; 13285d465952Smartinh if (bt->cmp != NULL) 13295d465952Smartinh return; 13305d465952Smartinh 13315d465952Smartinh lp = mp; 13325d465952Smartinh while (lp->parent != NULL) { 13335d465952Smartinh if (lp->parent_index > 0) { 13345d465952Smartinh lbound = lp->parent_index; 13355d465952Smartinh break; 13365d465952Smartinh } 13375d465952Smartinh lp = lp->parent; 13385d465952Smartinh } 13395d465952Smartinh 13405d465952Smartinh up = mp; 13415d465952Smartinh while (up->parent != NULL) { 13425d465952Smartinh if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) { 13435d465952Smartinh ubound = up->parent_index + 1; 13445d465952Smartinh break; 13455d465952Smartinh } 13465d465952Smartinh up = up->parent; 13475d465952Smartinh } 13485d465952Smartinh 13495d465952Smartinh if (lp->parent != NULL && up->parent != NULL) { 13505d465952Smartinh expand_prefix(bt, lp->parent, lbound, &lprefix); 13515d465952Smartinh expand_prefix(bt, up->parent, ubound, &uprefix); 13525d465952Smartinh common_prefix(bt, &lprefix, &uprefix, &mp->prefix); 13535d465952Smartinh } 13545d465952Smartinh else if (mp->parent) 13555d465952Smartinh bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix)); 13565d465952Smartinh 13575d465952Smartinh DPRINTF("found common prefix [%.*s] (len %zu) for page %u", 13585d465952Smartinh (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno); 13595d465952Smartinh } 13605d465952Smartinh 13615d465952Smartinh static int 13625d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key, 13635d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp) 13645d465952Smartinh { 13655d465952Smartinh struct mpage *mp, *parent; 13665d465952Smartinh 13675d465952Smartinh if (cursor && cursor_push_page(cursor, root) == NULL) 13685d465952Smartinh return BT_FAIL; 13695d465952Smartinh 13705d465952Smartinh mp = root; 13715d465952Smartinh while (IS_BRANCH(mp)) { 13725d465952Smartinh unsigned int i = 0; 13735d465952Smartinh struct node *node; 13745d465952Smartinh 13755d465952Smartinh DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp)); 13765d465952Smartinh assert(NUMKEYS(mp) > 1); 13775d465952Smartinh node = NODEPTR(mp, 0); 13785d465952Smartinh DPRINTF("found index 0 to page %u", NODEPGNO(node)); 13795d465952Smartinh 13805d465952Smartinh if (key == NULL) /* Initialize cursor to first page. */ 13815d465952Smartinh i = 0; 13825d465952Smartinh else { 13835d465952Smartinh int exact; 13845d465952Smartinh node = btree_search_node(bt, mp, key, &exact, &i); 13855d465952Smartinh if (node == NULL) 13865d465952Smartinh i = NUMKEYS(mp) - 1; 13875d465952Smartinh else if (!exact) { 13885d465952Smartinh assert(i > 0); 13895d465952Smartinh i--; 13905d465952Smartinh } 13915d465952Smartinh } 13925d465952Smartinh 13935d465952Smartinh if (key) 13945d465952Smartinh DPRINTF("following index %u for key %.*s", 13955d465952Smartinh i, (int)key->size, (char *)key->data); 13965d465952Smartinh assert(i >= 0 && i < NUMKEYS(mp)); 13975d465952Smartinh node = NODEPTR(mp, i); 13985d465952Smartinh 13995d465952Smartinh if (cursor) 14005d465952Smartinh CURSOR_TOP(cursor)->ki = i; 14015d465952Smartinh 14025d465952Smartinh parent = mp; 140337ee45a1Smartinh if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 14045d465952Smartinh return BT_FAIL; 14055d465952Smartinh mp->parent = parent; 14065d465952Smartinh mp->parent_index = i; 14075d465952Smartinh find_common_prefix(bt, mp); 14085d465952Smartinh 14095d465952Smartinh if (cursor && cursor_push_page(cursor, mp) == NULL) 14105d465952Smartinh return BT_FAIL; 14115d465952Smartinh 14125d465952Smartinh if (modify) 14135d465952Smartinh mpage_touch(bt, mp); 14145d465952Smartinh } 14155d465952Smartinh 14165d465952Smartinh if (!IS_LEAF(mp)) { 14175d465952Smartinh DPRINTF("internal error, index points to a %02X page!?", 141837ee45a1Smartinh mp->page->flags); 14195d465952Smartinh return BT_FAIL; 14205d465952Smartinh } 14215d465952Smartinh 14225d465952Smartinh DPRINTF("found leaf page %u for key %.*s", mp->pgno, 14235d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL); 14245d465952Smartinh 14255d465952Smartinh *mpp = mp; 14265d465952Smartinh return BT_SUCCESS; 14275d465952Smartinh } 14285d465952Smartinh 14295d465952Smartinh /* Search for the page a given key should be in. 14305d465952Smartinh * Stores a pointer to the found page in *mpp. 14315d465952Smartinh * If key is NULL, search for the lowest page (used by btree_cursor_first). 14325d465952Smartinh * If cursor is non-null, pushes parent pages on the cursor stack. 14335d465952Smartinh * If modify is true, visited pages are updated with new page numbers. 14345d465952Smartinh */ 14355d465952Smartinh static int 14365d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key, 14375d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp) 14385d465952Smartinh { 14395d465952Smartinh int rc; 14405d465952Smartinh pgno_t root; 14415d465952Smartinh struct mpage *mp; 14425d465952Smartinh 14435d465952Smartinh /* Can't modify pages outside a transaction. */ 14445d465952Smartinh if (txn == NULL && modify) { 14455d465952Smartinh errno = EINVAL; 14465d465952Smartinh return BT_FAIL; 14475d465952Smartinh } 14485d465952Smartinh 14495d465952Smartinh /* Choose which root page to start with. If a transaction is given 14505d465952Smartinh * use the root page from the transaction, otherwise read the last 14515d465952Smartinh * committed root page. 14525d465952Smartinh */ 14535d465952Smartinh if (txn == NULL) { 14545d465952Smartinh if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS) 14555d465952Smartinh return rc; 145637ee45a1Smartinh root = bt->meta.root; 14575d465952Smartinh } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 14585d465952Smartinh DPRINTF("transaction has failed, must abort"); 14590279e526Smartinh errno = EINVAL; 14605d465952Smartinh return BT_FAIL; 14615d465952Smartinh } else 14625d465952Smartinh root = txn->root; 14635d465952Smartinh 14640279e526Smartinh if (root == P_INVALID) { /* Tree is empty. */ 146537ee45a1Smartinh DPRINTF("tree is empty"); 14660279e526Smartinh errno = ENOENT; 14670279e526Smartinh return BT_FAIL; 14680279e526Smartinh } 14695d465952Smartinh 147037ee45a1Smartinh if ((mp = btree_get_mpage(bt, root)) == NULL) 147137ee45a1Smartinh return BT_FAIL; 147237ee45a1Smartinh 147337ee45a1Smartinh DPRINTF("root page has flags 0x%X", mp->page->flags); 14745d465952Smartinh 14755d465952Smartinh assert(mp->parent == NULL); 14765d465952Smartinh assert(mp->prefix.len == 0); 14775d465952Smartinh 14785d465952Smartinh if (modify && !mp->dirty) { 14795d465952Smartinh mpage_touch(bt, mp); 14805d465952Smartinh txn->root = mp->pgno; 14815d465952Smartinh } 14825d465952Smartinh 14835d465952Smartinh return btree_search_page_root(bt, mp, key, cursor, modify, mpp); 14845d465952Smartinh } 14855d465952Smartinh 14865d465952Smartinh static int 14875d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf, 14885d465952Smartinh struct btval *data) 14895d465952Smartinh { 149037ee45a1Smartinh struct mpage *omp; /* overflow mpage */ 14915d465952Smartinh size_t psz; 14925d465952Smartinh size_t max; 14935d465952Smartinh size_t sz = 0; 14945d465952Smartinh pgno_t pgno; 14955d465952Smartinh 14965d465952Smartinh bzero(data, sizeof(*data)); 149737ee45a1Smartinh max = bt->head.psize - PAGEHDRSZ; 14985d465952Smartinh 14995d465952Smartinh if (!F_ISSET(leaf->flags, F_BIGDATA)) { 15005d465952Smartinh data->size = leaf->n_dsize; 15015d465952Smartinh if (data->size > 0) { 15025d465952Smartinh if (mp == NULL) { 15035d465952Smartinh if ((data->data = malloc(data->size)) == NULL) 15045d465952Smartinh return BT_FAIL; 15055d465952Smartinh bcopy(NODEDATA(leaf), data->data, data->size); 15065d465952Smartinh data->free_data = 1; 15075d465952Smartinh data->mp = NULL; 15085d465952Smartinh } else { 15095d465952Smartinh data->data = NODEDATA(leaf); 15105d465952Smartinh data->free_data = 0; 15115d465952Smartinh data->mp = mp; 15125d465952Smartinh mp->ref++; 15135d465952Smartinh } 15145d465952Smartinh } 15155d465952Smartinh return BT_SUCCESS; 15165d465952Smartinh } 15175d465952Smartinh 15185d465952Smartinh /* Read overflow data. 15195d465952Smartinh */ 15205d465952Smartinh DPRINTF("allocating %u byte for overflow data", leaf->n_dsize); 15215d465952Smartinh if ((data->data = malloc(leaf->n_dsize)) == NULL) 15225d465952Smartinh return BT_FAIL; 15235d465952Smartinh data->size = leaf->n_dsize; 15245d465952Smartinh data->free_data = 1; 15255d465952Smartinh data->mp = NULL; 15265d465952Smartinh pgno = *(pgno_t *)NODEDATA(leaf); /* XXX: alignment? */ 15275d465952Smartinh for (sz = 0; sz < data->size; ) { 152837ee45a1Smartinh if ((omp = btree_get_mpage(bt, pgno)) == NULL || 152937ee45a1Smartinh !F_ISSET(omp->page->flags, P_OVERFLOW)) { 153037ee45a1Smartinh DPRINTF("read overflow page failed (%02x)", omp->page->flags); 15315d465952Smartinh free(data->data); 153237ee45a1Smartinh mpage_free(omp); 15335d465952Smartinh return BT_FAIL; 15345d465952Smartinh } 15355d465952Smartinh psz = data->size - sz; 15365d465952Smartinh if (psz > max) 15375d465952Smartinh psz = max; 153837ee45a1Smartinh bcopy(omp->page->ptrs, (char *)data->data + sz, psz); 15395d465952Smartinh sz += psz; 154037ee45a1Smartinh pgno = omp->page->p_next_pgno; 15415d465952Smartinh } 15425d465952Smartinh 15435d465952Smartinh return BT_SUCCESS; 15445d465952Smartinh } 15455d465952Smartinh 15465d465952Smartinh int 15475d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn, 15485d465952Smartinh struct btval *key, struct btval *data) 15495d465952Smartinh { 15505d465952Smartinh int rc, exact; 15515d465952Smartinh struct node *leaf; 15525d465952Smartinh struct mpage *mp; 15535d465952Smartinh 15545d465952Smartinh assert(key); 15555d465952Smartinh assert(data); 15565d465952Smartinh DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data); 15575d465952Smartinh 155859916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) { 155959916c99Smartinh errno = EINVAL; 156059916c99Smartinh return BT_FAIL; 156159916c99Smartinh } 156259916c99Smartinh 156359916c99Smartinh if (bt == NULL) { 156459916c99Smartinh if (txn == NULL) { 156559916c99Smartinh errno = EINVAL; 156659916c99Smartinh return BT_FAIL; 156759916c99Smartinh } 156859916c99Smartinh bt = txn->bt; 156959916c99Smartinh } 157059916c99Smartinh 15715d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 15725d465952Smartinh errno = EINVAL; 15735d465952Smartinh return BT_FAIL; 15745d465952Smartinh } 15755d465952Smartinh 15765d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS) 15775d465952Smartinh return rc; 15785d465952Smartinh 15795d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, NULL); 15805d465952Smartinh if (leaf && exact) 15815d465952Smartinh rc = btree_read_data(bt, mp, leaf, data); 15820279e526Smartinh else { 15830279e526Smartinh errno = ENOENT; 15840279e526Smartinh rc = BT_FAIL; 15850279e526Smartinh } 15865d465952Smartinh 15875d465952Smartinh mpage_prune(bt); 15885d465952Smartinh return rc; 15895d465952Smartinh } 15905d465952Smartinh 15915d465952Smartinh static int 15925d465952Smartinh btree_sibling(struct cursor *cursor, int move_right) 15935d465952Smartinh { 15945d465952Smartinh int rc; 15955d465952Smartinh struct node *indx; 15965d465952Smartinh struct ppage *parent, *top; 15975d465952Smartinh struct mpage *mp; 15985d465952Smartinh 15995d465952Smartinh top = CURSOR_TOP(cursor); 16000279e526Smartinh if ((parent = SLIST_NEXT(top, entry)) == NULL) { 16010279e526Smartinh errno = ENOENT; 16020279e526Smartinh return BT_FAIL; /* root has no siblings */ 16030279e526Smartinh } 16045d465952Smartinh 16055d465952Smartinh DPRINTF("parent page is page %u, index %u", 16065d465952Smartinh parent->mpage->pgno, parent->ki); 16075d465952Smartinh 16085d465952Smartinh cursor_pop_page(cursor); 16095d465952Smartinh if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage)) 16105d465952Smartinh : (parent->ki == 0)) { 16115d465952Smartinh DPRINTF("no more keys left, moving to %s sibling", 16125d465952Smartinh move_right ? "right" : "left"); 16135d465952Smartinh if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS) 16145d465952Smartinh return rc; 16155d465952Smartinh parent = CURSOR_TOP(cursor); 16165d465952Smartinh } else { 16175d465952Smartinh if (move_right) 16185d465952Smartinh parent->ki++; 16195d465952Smartinh else 16205d465952Smartinh parent->ki--; 16215d465952Smartinh DPRINTF("just moving to %s index key %u", 16225d465952Smartinh move_right ? "right" : "left", parent->ki); 16235d465952Smartinh } 16245d465952Smartinh assert(IS_BRANCH(parent->mpage)); 16255d465952Smartinh 16265d465952Smartinh indx = NODEPTR(parent->mpage, parent->ki); 162737ee45a1Smartinh if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL) 16285d465952Smartinh return BT_FAIL; 16295d465952Smartinh mp->parent = parent->mpage; 16305d465952Smartinh mp->parent_index = parent->ki; 16315d465952Smartinh 16325d465952Smartinh cursor_push_page(cursor, mp); 16335d465952Smartinh find_common_prefix(cursor->bt, mp); 16345d465952Smartinh 16355d465952Smartinh return BT_SUCCESS; 16365d465952Smartinh } 16375d465952Smartinh 16385d465952Smartinh static int 16395d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node, 16405d465952Smartinh struct btval *key) 16415d465952Smartinh { 16425d465952Smartinh if (key == NULL) 16435d465952Smartinh return 0; 16445d465952Smartinh 16455d465952Smartinh if (mp->prefix.len > 0) { 16465d465952Smartinh key->size = node->ksize + mp->prefix.len; 16475d465952Smartinh key->data = malloc(key->size); 16485d465952Smartinh if (key->data == NULL) 16495d465952Smartinh return -1; 16505d465952Smartinh concat_prefix(bt, 16515d465952Smartinh mp->prefix.str, mp->prefix.len, 16525d465952Smartinh NODEKEY(node), node->ksize, 16535d465952Smartinh key->data, &key->size); 16545d465952Smartinh key->free_data = 1; 16555d465952Smartinh } else { 16565d465952Smartinh key->size = node->ksize; 16575d465952Smartinh key->data = NODEKEY(node); 16585d465952Smartinh key->free_data = 0; 16595d465952Smartinh key->mp = mp; 16605d465952Smartinh mp->ref++; 16615d465952Smartinh } 16625d465952Smartinh 16635d465952Smartinh return 0; 16645d465952Smartinh } 16655d465952Smartinh 16665d465952Smartinh static int 16675d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data) 16685d465952Smartinh { 16695d465952Smartinh struct ppage *top; 16705d465952Smartinh struct mpage *mp; 16715d465952Smartinh struct node *leaf; 16725d465952Smartinh 16730279e526Smartinh if (cursor->eof) { 16740279e526Smartinh errno = ENOENT; 16750279e526Smartinh return BT_FAIL; 16760279e526Smartinh } 16775d465952Smartinh 16785d465952Smartinh assert(cursor->initialized); 16795d465952Smartinh 16805d465952Smartinh top = CURSOR_TOP(cursor); 16815d465952Smartinh mp = top->mpage; 16825d465952Smartinh 16835d465952Smartinh DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); 16845d465952Smartinh 16855d465952Smartinh if (top->ki + 1 >= NUMKEYS(mp)) { 16865d465952Smartinh DPRINTF("=====> move to next sibling page"); 16875d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS) { 16885d465952Smartinh cursor->eof = 1; 16890279e526Smartinh return BT_FAIL; 16905d465952Smartinh } 16915d465952Smartinh top = CURSOR_TOP(cursor); 16925d465952Smartinh mp = top->mpage; 16935d465952Smartinh DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); 16945d465952Smartinh } else 16955d465952Smartinh top->ki++; 16965d465952Smartinh 16975d465952Smartinh DPRINTF("==> cursor points to page %u with %lu keys, key index %u", 16985d465952Smartinh mp->pgno, NUMKEYS(mp), top->ki); 16995d465952Smartinh 17005d465952Smartinh assert(IS_LEAF(mp)); 17015d465952Smartinh leaf = NODEPTR(mp, top->ki); 17025d465952Smartinh 17035d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 17045d465952Smartinh return BT_FAIL; 17055d465952Smartinh 17065d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 17075d465952Smartinh return BT_FAIL; 17085d465952Smartinh 17095d465952Smartinh return BT_SUCCESS; 17105d465952Smartinh } 17115d465952Smartinh 17125d465952Smartinh static int 17135d465952Smartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data) 17145d465952Smartinh { 17155d465952Smartinh int rc; 17165d465952Smartinh struct node *leaf; 17175d465952Smartinh struct mpage *mp; 17185d465952Smartinh struct ppage *top; 17195d465952Smartinh 17205d465952Smartinh assert(cursor); 17215d465952Smartinh assert(key); 17225d465952Smartinh assert(key->size > 0); 17235d465952Smartinh 17245d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp); 17255d465952Smartinh if (rc != BT_SUCCESS) 17265d465952Smartinh return rc; 17275d465952Smartinh assert(IS_LEAF(mp)); 17285d465952Smartinh 17295d465952Smartinh top = CURSOR_TOP(cursor); 17305d465952Smartinh leaf = btree_search_node(cursor->bt, mp, key, NULL, &top->ki); 17315d465952Smartinh if (leaf == NULL) { 17325d465952Smartinh DPRINTF("===> inexact leaf not found, goto sibling"); 17335d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS) 17340279e526Smartinh return BT_FAIL; /* no entries matched */ 17355d465952Smartinh top = CURSOR_TOP(cursor); 17365d465952Smartinh top->ki = 0; 17375d465952Smartinh mp = top->mpage; 17385d465952Smartinh assert(IS_LEAF(mp)); 17395d465952Smartinh leaf = NODEPTR(mp, 0); 17405d465952Smartinh } 17415d465952Smartinh 17425d465952Smartinh cursor->initialized = 1; 17435d465952Smartinh cursor->eof = 0; 17445d465952Smartinh 17455d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 17465d465952Smartinh return BT_FAIL; 17475d465952Smartinh 17485d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 17495d465952Smartinh return BT_FAIL; 17505d465952Smartinh DPRINTF("==> cursor placed on key %.*s", 17515d465952Smartinh (int)key->size, (char *)key->data); 17525d465952Smartinh 17535d465952Smartinh return BT_SUCCESS; 17545d465952Smartinh } 17555d465952Smartinh 17565d465952Smartinh static int 17575d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data) 17585d465952Smartinh { 17595d465952Smartinh int rc; 17605d465952Smartinh struct mpage *mp; 17615d465952Smartinh struct node *leaf; 17625d465952Smartinh 17635d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp); 17645d465952Smartinh if (rc != BT_SUCCESS) 17655d465952Smartinh return rc; 17665d465952Smartinh assert(IS_LEAF(mp)); 17675d465952Smartinh 17685d465952Smartinh leaf = NODEPTR(mp, 0); 17695d465952Smartinh cursor->initialized = 1; 17705d465952Smartinh cursor->eof = 0; 17715d465952Smartinh 17725d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 17735d465952Smartinh return BT_FAIL; 17745d465952Smartinh 17755d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 17765d465952Smartinh return BT_FAIL; 17775d465952Smartinh 17785d465952Smartinh return BT_SUCCESS; 17795d465952Smartinh } 17805d465952Smartinh 17815d465952Smartinh int 17825d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data, 17835d465952Smartinh enum cursor_op op) 17845d465952Smartinh { 17855d465952Smartinh int rc; 17865d465952Smartinh 17875d465952Smartinh assert(cursor); 17885d465952Smartinh 17895d465952Smartinh switch (op) { 17905d465952Smartinh case BT_CURSOR: 17915d465952Smartinh while (CURSOR_TOP(cursor) != NULL) 17925d465952Smartinh cursor_pop_page(cursor); 17935d465952Smartinh if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) { 17945d465952Smartinh errno = EINVAL; 17955d465952Smartinh rc = BT_FAIL; 17965d465952Smartinh } else 17975d465952Smartinh rc = btree_cursor_set(cursor, key, data); 17985d465952Smartinh break; 17995d465952Smartinh case BT_NEXT: 18005d465952Smartinh if (!cursor->initialized) 18015d465952Smartinh rc = btree_cursor_first(cursor, key, data); 18025d465952Smartinh else 18035d465952Smartinh rc = btree_cursor_next(cursor, key, data); 18045d465952Smartinh break; 18055d465952Smartinh case BT_FIRST: 18065d465952Smartinh while (CURSOR_TOP(cursor) != NULL) 18075d465952Smartinh cursor_pop_page(cursor); 18085d465952Smartinh rc = btree_cursor_first(cursor, key, data); 18095d465952Smartinh break; 18105d465952Smartinh default: 18115d465952Smartinh DPRINTF("unhandled/unimplemented cursor operation %u", op); 18125d465952Smartinh rc = BT_FAIL; 18135d465952Smartinh break; 18145d465952Smartinh } 18155d465952Smartinh 18165d465952Smartinh mpage_prune(cursor->bt); 18175d465952Smartinh 18185d465952Smartinh return rc; 18195d465952Smartinh } 18205d465952Smartinh 18215d465952Smartinh static struct mpage * 18225d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags) 18235d465952Smartinh { 18245d465952Smartinh struct mpage *mp; 18255d465952Smartinh 18265d465952Smartinh assert(bt != NULL); 18275d465952Smartinh assert(bt->txn != NULL); 18285d465952Smartinh 182937ee45a1Smartinh DPRINTF("allocating new mpage %u, page size %u", 183037ee45a1Smartinh bt->txn->next_pgno, bt->head.psize); 18315d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL) 18325d465952Smartinh return NULL; 183337ee45a1Smartinh if ((mp->page = malloc(bt->head.psize)) == NULL) { 183437ee45a1Smartinh free(mp); 183537ee45a1Smartinh return NULL; 183637ee45a1Smartinh } 183737ee45a1Smartinh mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 183837ee45a1Smartinh mp->page->flags = flags; 183937ee45a1Smartinh mp->page->lower = PAGEHDRSZ; 184037ee45a1Smartinh mp->page->upper = bt->head.psize; 18415d465952Smartinh 18425d465952Smartinh if (IS_BRANCH(mp)) 184337ee45a1Smartinh bt->meta.branch_pages++; 18445d465952Smartinh else if (IS_LEAF(mp)) 184537ee45a1Smartinh bt->meta.leaf_pages++; 18465d465952Smartinh else if (IS_OVERFLOW(mp)) 184737ee45a1Smartinh bt->meta.overflow_pages++; 18485d465952Smartinh 18495d465952Smartinh mpage_add(bt, mp); 18505d465952Smartinh mpage_dirty(bt, mp); 18515d465952Smartinh 18525d465952Smartinh return mp; 18535d465952Smartinh } 18545d465952Smartinh 18555d465952Smartinh static size_t 185637ee45a1Smartinh bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data) 18575d465952Smartinh { 18585d465952Smartinh size_t sz; 18595d465952Smartinh 18605d465952Smartinh sz = LEAFSIZE(key, data); 186137ee45a1Smartinh if (data->size >= bt->head.psize / BT_MINKEYS) { 18625d465952Smartinh /* put on overflow page */ 18635d465952Smartinh sz -= data->size - sizeof(pgno_t); 18645d465952Smartinh } 18655d465952Smartinh 18665d465952Smartinh return sz + sizeof(indx_t); 18675d465952Smartinh } 18685d465952Smartinh 18695d465952Smartinh static size_t 187037ee45a1Smartinh bt_branch_size(struct btree *bt, struct btval *key) 18715d465952Smartinh { 18725d465952Smartinh size_t sz; 18735d465952Smartinh 18745d465952Smartinh sz = INDXSIZE(key); 187537ee45a1Smartinh if (sz >= bt->head.psize / BT_MINKEYS) { 18765d465952Smartinh /* put on overflow page */ 18775d465952Smartinh /* not implemented */ 18785d465952Smartinh /* sz -= key->size - sizeof(pgno_t); */ 18795d465952Smartinh } 18805d465952Smartinh 18815d465952Smartinh return sz + sizeof(indx_t); 18825d465952Smartinh } 18835d465952Smartinh 18845d465952Smartinh static int 18855d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data) 18865d465952Smartinh { 18875d465952Smartinh size_t done = 0; 18885d465952Smartinh size_t sz; 18895d465952Smartinh size_t max; 18905d465952Smartinh pgno_t *linkp; /* linked page stored here */ 18915d465952Smartinh struct mpage *next = NULL; 18925d465952Smartinh 189337ee45a1Smartinh max = bt->head.psize - PAGEHDRSZ; 18945d465952Smartinh 18955d465952Smartinh while (done < data->size) { 18965d465952Smartinh linkp = &p->p_next_pgno; 18975d465952Smartinh if (data->size - done > max) { 18985d465952Smartinh /* need another overflow page */ 18995d465952Smartinh if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL) 19005d465952Smartinh return BT_FAIL; 19015d465952Smartinh *linkp = next->pgno; 19025d465952Smartinh DPRINTF("linking overflow page %u", next->pgno); 19035d465952Smartinh } else 19045d465952Smartinh *linkp = 0; /* indicates end of list */ 19055d465952Smartinh sz = data->size - done; 19065d465952Smartinh if (sz > max) 19075d465952Smartinh sz = max; 19085d465952Smartinh DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno); 19095d465952Smartinh bcopy((char *)data->data + done, p->ptrs, sz); 19105d465952Smartinh done += sz; 191137ee45a1Smartinh p = next->page; 19125d465952Smartinh } 19135d465952Smartinh 19145d465952Smartinh return BT_SUCCESS; 19155d465952Smartinh } 19165d465952Smartinh 19175d465952Smartinh /* Key prefix should already be stripped. 19185d465952Smartinh */ 19195d465952Smartinh static int 19205d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx, 19215d465952Smartinh struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags) 19225d465952Smartinh { 19235d465952Smartinh unsigned int i; 19245d465952Smartinh size_t node_size = NODESIZE; 19255d465952Smartinh indx_t ofs; 19265d465952Smartinh struct node *node; 19275d465952Smartinh struct page *p; 19285d465952Smartinh struct mpage *ofp = NULL; /* overflow page */ 19295d465952Smartinh 193037ee45a1Smartinh p = mp->page; 19315d465952Smartinh assert(p->upper >= p->lower); 19325d465952Smartinh 193337ee45a1Smartinh DPRINTF("add node [%.*s] to %s page %u at index %i, key size %zu", 19345d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL, 19355d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 193637ee45a1Smartinh mp->pgno, indx, key ? key->size : 0); 19375d465952Smartinh 19385d465952Smartinh if (key != NULL) 19395d465952Smartinh node_size += key->size; 19405d465952Smartinh 19415d465952Smartinh if (IS_LEAF(mp)) { 19425d465952Smartinh assert(data); 19435d465952Smartinh node_size += data->size; 19445d465952Smartinh if (F_ISSET(flags, F_BIGDATA)) { 19455d465952Smartinh /* Data already on overflow page. */ 19465d465952Smartinh node_size -= data->size - sizeof(pgno_t); 194737ee45a1Smartinh } else if (data->size >= bt->head.psize / BT_MINKEYS) { 19485d465952Smartinh /* Put data on overflow page. */ 19495d465952Smartinh DPRINTF("data size is %zu, put on overflow page", 19505d465952Smartinh data->size); 19515d465952Smartinh node_size -= data->size - sizeof(pgno_t); 19525d465952Smartinh if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL) 19535d465952Smartinh return BT_FAIL; 19545d465952Smartinh DPRINTF("allocated overflow page %u", ofp->pgno); 19555d465952Smartinh flags |= F_BIGDATA; 19565d465952Smartinh } 19575d465952Smartinh } 19585d465952Smartinh 19595d465952Smartinh if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { 19605d465952Smartinh DPRINTF("not enough room in page %u, got %lu ptrs", 19615d465952Smartinh mp->pgno, NUMKEYS(mp)); 19625d465952Smartinh DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower, 19635d465952Smartinh p->upper - p->lower); 196437ee45a1Smartinh DPRINTF("node size = %zu", node_size); 19655d465952Smartinh return BT_FAIL; 19665d465952Smartinh } 19675d465952Smartinh 19685d465952Smartinh /* Move higher pointers up one slot. */ 19695d465952Smartinh for (i = NUMKEYS(mp); i > indx; i--) 19705d465952Smartinh p->ptrs[i] = p->ptrs[i - 1]; 19715d465952Smartinh 19725d465952Smartinh /* Adjust free space offsets. */ 19735d465952Smartinh ofs = p->upper - node_size; 19745d465952Smartinh assert(ofs >= p->lower + sizeof(indx_t)); 19755d465952Smartinh p->ptrs[indx] = ofs; 19765d465952Smartinh p->upper = ofs; 19775d465952Smartinh p->lower += sizeof(indx_t); 19785d465952Smartinh 19795d465952Smartinh /* Write the node data. */ 19805d465952Smartinh node = NODEPTR(mp, indx); 19815d465952Smartinh node->ksize = (key == NULL) ? 0 : key->size; 19825d465952Smartinh node->flags = flags; 19835d465952Smartinh if (IS_LEAF(mp)) 19845d465952Smartinh node->n_dsize = data->size; 19855d465952Smartinh else 19865d465952Smartinh node->n_pgno = pgno; 19875d465952Smartinh 19885d465952Smartinh if (key) 19895d465952Smartinh bcopy(key->data, NODEKEY(node), key->size); 19905d465952Smartinh 19915d465952Smartinh if (IS_LEAF(mp)) { 19925d465952Smartinh if (ofp == NULL) { 19935d465952Smartinh if (F_ISSET(flags, F_BIGDATA)) 19945d465952Smartinh bcopy(data->data, node->data + key->size, 19955d465952Smartinh sizeof(pgno_t)); 19965d465952Smartinh else 19975d465952Smartinh bcopy(data->data, node->data + key->size, 19985d465952Smartinh data->size); 19995d465952Smartinh } else { 20005d465952Smartinh bcopy(&ofp->pgno, node->data + key->size, 20015d465952Smartinh sizeof(pgno_t)); 200237ee45a1Smartinh if (btree_write_overflow_data(bt, ofp->page, 20035d465952Smartinh data) == BT_FAIL) 20045d465952Smartinh return BT_FAIL; 20055d465952Smartinh } 20065d465952Smartinh } 20075d465952Smartinh 20085d465952Smartinh return BT_SUCCESS; 20095d465952Smartinh } 20105d465952Smartinh 20115d465952Smartinh static void 20125d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx) 20135d465952Smartinh { 20145d465952Smartinh unsigned int sz; 20155d465952Smartinh indx_t i, j, numkeys, ptr; 20165d465952Smartinh struct node *node; 20175d465952Smartinh char *base; 20185d465952Smartinh 20195d465952Smartinh DPRINTF("delete node %u on %s page %u", indx, 20205d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno); 20215d465952Smartinh assert(indx < NUMKEYS(mp)); 20225d465952Smartinh 20235d465952Smartinh node = NODEPTR(mp, indx); 20245d465952Smartinh sz = NODESIZE + node->ksize; 20255d465952Smartinh if (IS_LEAF(mp)) { 20265d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA)) 20275d465952Smartinh sz += sizeof(pgno_t); 20285d465952Smartinh else 20295d465952Smartinh sz += NODEDSZ(node); 20305d465952Smartinh } 20315d465952Smartinh 203237ee45a1Smartinh ptr = mp->page->ptrs[indx]; 20335d465952Smartinh numkeys = NUMKEYS(mp); 20345d465952Smartinh for (i = j = 0; i < numkeys; i++) { 20355d465952Smartinh if (i != indx) { 203637ee45a1Smartinh mp->page->ptrs[j] = mp->page->ptrs[i]; 203737ee45a1Smartinh if (mp->page->ptrs[i] < ptr) 203837ee45a1Smartinh mp->page->ptrs[j] += sz; 20395d465952Smartinh j++; 20405d465952Smartinh } 20415d465952Smartinh } 20425d465952Smartinh 204337ee45a1Smartinh base = (char *)mp->page + mp->page->upper; 204437ee45a1Smartinh bcopy(base, base + sz, ptr - mp->page->upper); 20455d465952Smartinh 204637ee45a1Smartinh mp->page->lower -= sizeof(indx_t); 204737ee45a1Smartinh mp->page->upper += sz; 20485d465952Smartinh } 20495d465952Smartinh 20505d465952Smartinh struct cursor * 20515d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn) 20525d465952Smartinh { 20535d465952Smartinh struct cursor *cursor; 20545d465952Smartinh 205559916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) { 205659916c99Smartinh errno = EINVAL; 205759916c99Smartinh return NULL; 205859916c99Smartinh } 205959916c99Smartinh 206059916c99Smartinh if (bt == NULL) { 206159916c99Smartinh if (txn == NULL) { 206259916c99Smartinh errno = EINVAL; 206359916c99Smartinh return NULL; 206459916c99Smartinh } 206559916c99Smartinh bt = txn->bt; 206659916c99Smartinh } 206759916c99Smartinh 20685d465952Smartinh if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { 20695d465952Smartinh SLIST_INIT(&cursor->stack); 20705d465952Smartinh cursor->bt = bt; 20715d465952Smartinh cursor->txn = txn; 2072408be415Smartinh btree_ref(bt); 20735d465952Smartinh } 20745d465952Smartinh 20755d465952Smartinh return cursor; 20765d465952Smartinh } 20775d465952Smartinh 20785d465952Smartinh void 20795d465952Smartinh btree_cursor_close(struct cursor *cursor) 20805d465952Smartinh { 20815d465952Smartinh if (cursor != NULL) { 20825d465952Smartinh while (!CURSOR_EMPTY(cursor)) 20835d465952Smartinh cursor_pop_page(cursor); 20845d465952Smartinh 20855d465952Smartinh btree_close(cursor->bt); 20865d465952Smartinh free(cursor); 20875d465952Smartinh } 20885d465952Smartinh } 20895d465952Smartinh 20905d465952Smartinh static int 20915d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx, 20925d465952Smartinh struct btval *key) 20935d465952Smartinh { 20945d465952Smartinh indx_t ptr, i, numkeys; 20955d465952Smartinh int delta; 20965d465952Smartinh size_t len; 20975d465952Smartinh struct node *node; 20985d465952Smartinh char *base; 20995d465952Smartinh 21005d465952Smartinh node = NODEPTR(mp, indx); 210137ee45a1Smartinh ptr = mp->page->ptrs[indx]; 21025d465952Smartinh DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u", 21035d465952Smartinh indx, ptr, 21045d465952Smartinh (int)node->ksize, (char *)NODEKEY(node), 21055d465952Smartinh (int)key->size, (char *)key->data, 21065d465952Smartinh mp->pgno); 21075d465952Smartinh 21085d465952Smartinh if (key->size != node->ksize) { 21095d465952Smartinh delta = key->size - node->ksize; 21105d465952Smartinh if (delta > 0 && SIZELEFT(mp) < delta) { 21115d465952Smartinh DPRINTF("OUCH! Not enough room, delta = %d", delta); 21125d465952Smartinh return BT_FAIL; 21135d465952Smartinh } 21145d465952Smartinh 21155d465952Smartinh numkeys = NUMKEYS(mp); 21165d465952Smartinh for (i = 0; i < numkeys; i++) { 211737ee45a1Smartinh if (mp->page->ptrs[i] <= ptr) 211837ee45a1Smartinh mp->page->ptrs[i] -= delta; 21195d465952Smartinh } 21205d465952Smartinh 212137ee45a1Smartinh base = (char *)mp->page + mp->page->upper; 212237ee45a1Smartinh len = ptr - mp->page->upper + NODESIZE; 21235d465952Smartinh bcopy(base, base - delta, len); 212437ee45a1Smartinh mp->page->upper -= delta; 21255d465952Smartinh 21265d465952Smartinh node = NODEPTR(mp, indx); 21275d465952Smartinh node->ksize = key->size; 21285d465952Smartinh } 21295d465952Smartinh 21305d465952Smartinh bcopy(key->data, NODEKEY(node), key->size); 21315d465952Smartinh 21325d465952Smartinh return BT_SUCCESS; 21335d465952Smartinh } 21345d465952Smartinh 21355d465952Smartinh static int 21365d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta) 21375d465952Smartinh { 21385d465952Smartinh indx_t i; 21395d465952Smartinh struct node *node; 21405d465952Smartinh struct btkey tmpkey; 21415d465952Smartinh struct btval key; 21425d465952Smartinh 21435d465952Smartinh DPRINTF("adjusting prefix lengths on page %u with delta %d", 21445d465952Smartinh src->pgno, delta); 21455d465952Smartinh assert(delta != 0); 21465d465952Smartinh 21475d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) { 21485d465952Smartinh node = NODEPTR(src, i); 21495d465952Smartinh tmpkey.len = node->ksize - delta; 21505d465952Smartinh if (delta > 0) { 21515d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) 21525d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, tmpkey.len); 21535d465952Smartinh else 21545d465952Smartinh bcopy((char *)NODEKEY(node) + delta, tmpkey.str, 21555d465952Smartinh tmpkey.len); 21565d465952Smartinh } else { 21575d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 21585d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, node->ksize); 21595d465952Smartinh bcopy(src->prefix.str, tmpkey.str + node->ksize, 21605d465952Smartinh -delta); 21615d465952Smartinh } else { 21625d465952Smartinh bcopy(src->prefix.str + src->prefix.len + delta, 21635d465952Smartinh tmpkey.str, -delta); 21645d465952Smartinh bcopy(NODEKEY(node), tmpkey.str - delta, 21655d465952Smartinh node->ksize); 21665d465952Smartinh } 21675d465952Smartinh } 21685d465952Smartinh key.size = tmpkey.len; 21695d465952Smartinh key.data = tmpkey.str; 21705d465952Smartinh if (btree_update_key(bt, src, i, &key) != BT_SUCCESS) 21715d465952Smartinh return BT_FAIL; 21725d465952Smartinh } 21735d465952Smartinh 21745d465952Smartinh return BT_SUCCESS; 21755d465952Smartinh } 21765d465952Smartinh 21775d465952Smartinh /* Move a node from src to dst. 21785d465952Smartinh */ 21795d465952Smartinh static int 21805d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx, 21815d465952Smartinh struct mpage *dst, indx_t dstindx) 21825d465952Smartinh { 21835d465952Smartinh int rc; 21845d465952Smartinh unsigned int pfxlen, mp_pfxlen = 0; 21855d465952Smartinh struct node *node, *srcnode; 218637ee45a1Smartinh struct mpage *mp = NULL; 21875d465952Smartinh struct btkey tmpkey, srckey; 21885d465952Smartinh struct btval key, data; 21895d465952Smartinh 21905d465952Smartinh assert(src->parent); 21915d465952Smartinh assert(dst->parent); 21925d465952Smartinh 21935d465952Smartinh srcnode = NODEPTR(src, srcindx); 21945d465952Smartinh DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", 21955d465952Smartinh IS_LEAF(src) ? "leaf" : "branch", 21965d465952Smartinh srcindx, 21975d465952Smartinh (int)srcnode->ksize, (char *)NODEKEY(srcnode), 21985d465952Smartinh src->pgno, 21995d465952Smartinh dstindx, dst->pgno); 22005d465952Smartinh 22015d465952Smartinh if (IS_BRANCH(src)) { 22025d465952Smartinh /* Need to check if the page the moved node points to 22035d465952Smartinh * changes prefix. 22045d465952Smartinh */ 220537ee45a1Smartinh if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL) 220637ee45a1Smartinh return BT_FAIL; 22075d465952Smartinh mp->parent = src; 22085d465952Smartinh mp->parent_index = srcindx; 22095d465952Smartinh find_common_prefix(bt, mp); 22105d465952Smartinh mp_pfxlen = mp->prefix.len; 22115d465952Smartinh } 22125d465952Smartinh 22135d465952Smartinh /* Mark src and dst as dirty. */ 22145d465952Smartinh mpage_touch(bt, src); 22155d465952Smartinh mpage_touch(bt, dst); 22165d465952Smartinh 22175d465952Smartinh find_common_prefix(bt, dst); 22185d465952Smartinh 22195d465952Smartinh /* Check if src node has destination page prefix. Otherwise the 22205d465952Smartinh * destination page must expand its prefix on all its nodes. 22215d465952Smartinh */ 22225d465952Smartinh srckey.len = srcnode->ksize; 22235d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srckey.len); 22245d465952Smartinh common_prefix(bt, &srckey, &dst->prefix, &tmpkey); 22255d465952Smartinh if (tmpkey.len != dst->prefix.len) { 22265d465952Smartinh if (btree_adjust_prefix(bt, dst, 22275d465952Smartinh tmpkey.len - dst->prefix.len) != BT_SUCCESS) 22285d465952Smartinh return BT_FAIL; 22295d465952Smartinh bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey)); 22305d465952Smartinh } 22315d465952Smartinh 22325d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) { 22335d465952Smartinh struct mpage *low; 22345d465952Smartinh 22355d465952Smartinh /* must find the lowest key below src 22365d465952Smartinh */ 22375d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0, 22385d465952Smartinh &low) == BT_SUCCESS); 22395d465952Smartinh expand_prefix(bt, low, 0, &srckey); 22405d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u", 22415d465952Smartinh (int)srckey.len, srckey.str, low->pgno); 22425d465952Smartinh } else { 22435d465952Smartinh srckey.len = srcnode->ksize; 22445d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize); 22455d465952Smartinh } 22465d465952Smartinh find_common_prefix(bt, src); 22475d465952Smartinh 22485d465952Smartinh /* expand the prefix */ 22495d465952Smartinh tmpkey.len = sizeof(tmpkey.str); 22505d465952Smartinh concat_prefix(bt, src->prefix.str, src->prefix.len, 22515d465952Smartinh srckey.str, srckey.len, tmpkey.str, &tmpkey.len); 22525d465952Smartinh 22535d465952Smartinh /* Add the node to the destination page. Adjust prefix for 22545d465952Smartinh * destination page. 22555d465952Smartinh */ 22565d465952Smartinh key.size = tmpkey.len; 22575d465952Smartinh key.data = tmpkey.str; 22585d465952Smartinh remove_prefix(bt, &key, dst->prefix.len); 22595d465952Smartinh data.size = NODEDSZ(srcnode); 22605d465952Smartinh data.data = NODEDATA(srcnode); 22615d465952Smartinh rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode), 22625d465952Smartinh srcnode->flags); 22635d465952Smartinh if (rc != BT_SUCCESS) 22645d465952Smartinh return rc; 22655d465952Smartinh 22665d465952Smartinh /* Delete the node from the source page. 22675d465952Smartinh */ 22685d465952Smartinh btree_del_node(bt, src, srcindx); 22695d465952Smartinh 22705d465952Smartinh /* Update the parent separators. 22715d465952Smartinh */ 22725d465952Smartinh if (srcindx == 0 && src->parent_index != 0) { 22735d465952Smartinh node = NODEPTR(src->parent, src->parent_index); 22745d465952Smartinh DPRINTF("current parent separator for source page %u is [%.*s]", 22755d465952Smartinh src->pgno, 22765d465952Smartinh (int)node->ksize, (char *)NODEKEY(node)); 22775d465952Smartinh 22785d465952Smartinh expand_prefix(bt, src, 0, &tmpkey); 22795d465952Smartinh key.size = tmpkey.len; 22805d465952Smartinh key.data = tmpkey.str; 22815d465952Smartinh remove_prefix(bt, &key, src->parent->prefix.len); 22825d465952Smartinh 22835d465952Smartinh DPRINTF("update separator for source page %u to [%.*s]", 22845d465952Smartinh src->pgno, (int)key.size, (char *)key.data); 22855d465952Smartinh if (btree_update_key(bt, src->parent, src->parent_index, 22865d465952Smartinh &key) != BT_SUCCESS) 22875d465952Smartinh return BT_FAIL; 22885d465952Smartinh } 22895d465952Smartinh 22905d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) { 22915d465952Smartinh struct btval nullkey; 22925d465952Smartinh nullkey.size = 0; 22935d465952Smartinh assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS); 22945d465952Smartinh } 22955d465952Smartinh 22965d465952Smartinh if (dstindx == 0 && dst->parent_index != 0) { 22975d465952Smartinh node = NODEPTR(dst->parent, dst->parent_index); 22985d465952Smartinh DPRINTF("current parent separator for destination page %u is [%.*s]", 22995d465952Smartinh dst->pgno, 23005d465952Smartinh (int)node->ksize, (char *)NODEKEY(node)); 23015d465952Smartinh 23025d465952Smartinh expand_prefix(bt, dst, 0, &tmpkey); 23035d465952Smartinh key.size = tmpkey.len; 23045d465952Smartinh key.data = tmpkey.str; 23055d465952Smartinh remove_prefix(bt, &key, dst->parent->prefix.len); 23065d465952Smartinh 23075d465952Smartinh DPRINTF("update separator for destination page %u to [%.*s]", 23085d465952Smartinh dst->pgno, (int)key.size, (char *)key.data); 23095d465952Smartinh if (btree_update_key(bt, dst->parent, dst->parent_index, 23105d465952Smartinh &key) != BT_SUCCESS) 23115d465952Smartinh return BT_FAIL; 23125d465952Smartinh } 23135d465952Smartinh 23145d465952Smartinh if (dstindx == 0 && IS_BRANCH(dst)) { 23155d465952Smartinh struct btval nullkey; 23165d465952Smartinh nullkey.size = 0; 23175d465952Smartinh assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS); 23185d465952Smartinh } 23195d465952Smartinh 23205d465952Smartinh /* We can get a new page prefix here! 23215d465952Smartinh * Must update keys in all nodes of this page! 23225d465952Smartinh */ 23235d465952Smartinh pfxlen = src->prefix.len; 23245d465952Smartinh find_common_prefix(bt, src); 23255d465952Smartinh if (src->prefix.len != pfxlen) { 23265d465952Smartinh if (btree_adjust_prefix(bt, src, 23275d465952Smartinh src->prefix.len - pfxlen) != BT_SUCCESS) 23285d465952Smartinh return BT_FAIL; 23295d465952Smartinh } 23305d465952Smartinh 23315d465952Smartinh pfxlen = dst->prefix.len; 23325d465952Smartinh find_common_prefix(bt, dst); 23335d465952Smartinh if (dst->prefix.len != pfxlen) { 23345d465952Smartinh if (btree_adjust_prefix(bt, dst, 23355d465952Smartinh dst->prefix.len - pfxlen) != BT_SUCCESS) 23365d465952Smartinh return BT_FAIL; 23375d465952Smartinh } 23385d465952Smartinh 23395d465952Smartinh if (IS_BRANCH(dst)) { 23405d465952Smartinh mp->parent = dst; 23415d465952Smartinh mp->parent_index = dstindx; 23425d465952Smartinh find_common_prefix(bt, mp); 23435d465952Smartinh if (mp->prefix.len != mp_pfxlen) { 23445d465952Smartinh DPRINTF("moved branch node has changed prefix"); 23455d465952Smartinh mpage_touch(bt, mp); 23465d465952Smartinh if (btree_adjust_prefix(bt, mp, 23475d465952Smartinh mp->prefix.len - mp_pfxlen) != BT_SUCCESS) 23485d465952Smartinh return BT_FAIL; 23495d465952Smartinh } 23505d465952Smartinh } 23515d465952Smartinh 23525d465952Smartinh return BT_SUCCESS; 23535d465952Smartinh } 23545d465952Smartinh 23555d465952Smartinh static int 23565d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst) 23575d465952Smartinh { 23585d465952Smartinh int rc; 23595d465952Smartinh indx_t i; 2360a33d639aSmartinh unsigned int pfxlen; 23615d465952Smartinh struct node *srcnode; 23625d465952Smartinh struct btkey tmpkey, dstpfx; 23635d465952Smartinh struct btval key, data; 23645d465952Smartinh 23655d465952Smartinh DPRINTF("merging page %u and %u", src->pgno, dst->pgno); 23665d465952Smartinh 23675d465952Smartinh assert(src->parent); /* can't merge root page */ 23685d465952Smartinh assert(dst->parent); 23695d465952Smartinh assert(bt->txn != NULL); 23705d465952Smartinh 23715d465952Smartinh /* Mark src and dst as dirty. */ 23725d465952Smartinh mpage_touch(bt, src); 23735d465952Smartinh mpage_touch(bt, dst); 23745d465952Smartinh 23755d465952Smartinh find_common_prefix(bt, src); 23765d465952Smartinh find_common_prefix(bt, dst); 23775d465952Smartinh 23785d465952Smartinh /* Check if source nodes has destination page prefix. Otherwise 23795d465952Smartinh * the destination page must expand its prefix on all its nodes. 23805d465952Smartinh */ 23815d465952Smartinh common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx); 23825d465952Smartinh if (dstpfx.len != dst->prefix.len) { 23835d465952Smartinh if (btree_adjust_prefix(bt, dst, 23845d465952Smartinh dstpfx.len - dst->prefix.len) != BT_SUCCESS) 23855d465952Smartinh return BT_FAIL; 23865d465952Smartinh bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx)); 23875d465952Smartinh } 23885d465952Smartinh 23895d465952Smartinh /* Move all nodes from src to dst. 23905d465952Smartinh */ 23915d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) { 23925d465952Smartinh srcnode = NODEPTR(src, i); 23935d465952Smartinh 23945d465952Smartinh /* If branch node 0 (implicit key), find the real key. 23955d465952Smartinh */ 23965d465952Smartinh if (i == 0 && IS_BRANCH(src)) { 23975d465952Smartinh struct mpage *low; 23985d465952Smartinh 23995d465952Smartinh /* must find the lowest key below src 24005d465952Smartinh */ 24015d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0, 24025d465952Smartinh &low) == BT_SUCCESS); 24035d465952Smartinh expand_prefix(bt, low, 0, &tmpkey); 24045d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u", 24055d465952Smartinh (int)tmpkey.len, tmpkey.str, low->pgno); 24065d465952Smartinh } else { 24075d465952Smartinh expand_prefix(bt, src, i, &tmpkey); 24085d465952Smartinh } 24095d465952Smartinh 24105d465952Smartinh key.size = tmpkey.len; 24115d465952Smartinh key.data = tmpkey.str; 24125d465952Smartinh 24135d465952Smartinh remove_prefix(bt, &key, dst->prefix.len); 24145d465952Smartinh data.size = NODEDSZ(srcnode); 24155d465952Smartinh data.data = NODEDATA(srcnode); 24165d465952Smartinh rc = btree_add_node(bt, dst, NUMKEYS(dst), &key, 24175d465952Smartinh &data, NODEPGNO(srcnode), srcnode->flags); 24185d465952Smartinh if (rc != BT_SUCCESS) 24195d465952Smartinh return rc; 24205d465952Smartinh } 24215d465952Smartinh 24225d465952Smartinh DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", 242337ee45a1Smartinh dst->pgno, NUMKEYS(dst), PAGEFILL(bt, dst) * 100); 24245d465952Smartinh 24255d465952Smartinh /* Unlink the src page from parent. 24265d465952Smartinh */ 24275d465952Smartinh btree_del_node(bt, src->parent, src->parent_index); 24285d465952Smartinh if (src->parent_index == 0) { 24295d465952Smartinh key.size = 0; 24305d465952Smartinh if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS) 24315d465952Smartinh return BT_FAIL; 24325d465952Smartinh 2433a33d639aSmartinh pfxlen = src->prefix.len; 24345d465952Smartinh find_common_prefix(bt, src); 24355d465952Smartinh assert (src->prefix.len == pfxlen); 24365d465952Smartinh } 24375d465952Smartinh 24385d465952Smartinh if (IS_LEAF(src)) 243937ee45a1Smartinh bt->meta.leaf_pages--; 24405d465952Smartinh else 244137ee45a1Smartinh bt->meta.branch_pages--; 24425d465952Smartinh 24435d465952Smartinh return btree_rebalance(bt, src->parent); 24445d465952Smartinh } 24455d465952Smartinh 24465d465952Smartinh #define FILL_THRESHOLD 0.25 24475d465952Smartinh 24485d465952Smartinh static int 24495d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp) 24505d465952Smartinh { 245137ee45a1Smartinh struct node *node; 24525d465952Smartinh struct mpage *parent; 245337ee45a1Smartinh struct mpage *root; 24545d465952Smartinh struct mpage *neighbor = NULL; 245537ee45a1Smartinh indx_t si = 0, di = 0; 24565d465952Smartinh 24575d465952Smartinh assert(bt != NULL); 24585d465952Smartinh assert(bt->txn != NULL); 24595d465952Smartinh assert(mp != NULL); 24605d465952Smartinh 24615d465952Smartinh DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", 24625d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 246337ee45a1Smartinh mp->pgno, NUMKEYS(mp), PAGEFILL(bt, mp) * 100); 24645d465952Smartinh 246537ee45a1Smartinh if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) { 24665d465952Smartinh DPRINTF("no need to rebalance page %u, above fill threshold", 24675d465952Smartinh mp->pgno); 24685d465952Smartinh return BT_SUCCESS; 24695d465952Smartinh } 24705d465952Smartinh 24715d465952Smartinh parent = mp->parent; 24725d465952Smartinh 24735d465952Smartinh if (parent == NULL) { 24745d465952Smartinh if (NUMKEYS(mp) == 0) { 24755d465952Smartinh DPRINTF("tree is completely empty"); 24765d465952Smartinh bt->txn->root = P_INVALID; 247737ee45a1Smartinh bt->meta.depth--; 247837ee45a1Smartinh bt->meta.leaf_pages--; 24795d465952Smartinh } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 24805d465952Smartinh DPRINTF("collapsing root page!"); 24815d465952Smartinh bt->txn->root = NODEPGNO(NODEPTR(mp, 0)); 248237ee45a1Smartinh if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL) 248337ee45a1Smartinh return BT_FAIL; 248437ee45a1Smartinh root->parent = NULL; 248537ee45a1Smartinh bt->meta.depth--; 248637ee45a1Smartinh bt->meta.branch_pages--; 24875d465952Smartinh } else 24885d465952Smartinh DPRINTF("root page doesn't need rebalancing"); 24895d465952Smartinh return BT_SUCCESS; 24905d465952Smartinh } 24915d465952Smartinh 24925d465952Smartinh /* The parent (branch page) must have at least 2 pointers, 24935d465952Smartinh * otherwise the tree is invalid. 24945d465952Smartinh */ 24955d465952Smartinh assert(NUMKEYS(parent) > 1); 24965d465952Smartinh 24975d465952Smartinh /* Leaf page fill factor is below the threshold. 24985d465952Smartinh * Try to move keys from left or right neighbor, or 24995d465952Smartinh * merge with a neighbor page. 25005d465952Smartinh */ 25015d465952Smartinh 25025d465952Smartinh /* Find neighbors. 25035d465952Smartinh */ 25045d465952Smartinh if (mp->parent_index == 0) { 25055d465952Smartinh /* We're the leftmost leaf in our parent. 25065d465952Smartinh */ 25075d465952Smartinh DPRINTF("reading right neighbor"); 250837ee45a1Smartinh node = NODEPTR(parent, mp->parent_index + 1); 250937ee45a1Smartinh if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 25105d465952Smartinh return BT_FAIL; 25115d465952Smartinh neighbor->parent_index = mp->parent_index + 1; 25125d465952Smartinh si = 0; 25135d465952Smartinh di = NUMKEYS(mp); 25145d465952Smartinh } else { 25155d465952Smartinh /* There is at least one neighbor to the left. 25165d465952Smartinh */ 25175d465952Smartinh DPRINTF("reading left neighbor"); 251837ee45a1Smartinh node = NODEPTR(parent, mp->parent_index - 1); 251937ee45a1Smartinh if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL) 25205d465952Smartinh return BT_FAIL; 25215d465952Smartinh neighbor->parent_index = mp->parent_index - 1; 25225d465952Smartinh si = NUMKEYS(neighbor) - 1; 25235d465952Smartinh di = 0; 25245d465952Smartinh } 25255d465952Smartinh neighbor->parent = parent; 25265d465952Smartinh 25275d465952Smartinh DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)", 252837ee45a1Smartinh neighbor->pgno, NUMKEYS(neighbor), PAGEFILL(bt, neighbor) * 100); 25295d465952Smartinh 25305d465952Smartinh /* If the neighbor page is above threshold and has at least two 25315d465952Smartinh * keys, move one key from it. 25325d465952Smartinh * 25335d465952Smartinh * Otherwise we should try to merge them, but that might not be 25345d465952Smartinh * possible, even if both are below threshold, as prefix expansion 25355d465952Smartinh * might make keys larger. FIXME: detect this 25365d465952Smartinh */ 253737ee45a1Smartinh if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && 25385d465952Smartinh NUMKEYS(neighbor) >= NUMKEYS(mp) + 2) 25395d465952Smartinh return btree_move_node(bt, neighbor, si, mp, di); 25405d465952Smartinh else { /* FIXME: if (has_enough_room()) */ 25415d465952Smartinh if (mp->parent_index == 0) 25425d465952Smartinh return btree_merge(bt, neighbor, mp); 25435d465952Smartinh else 25445d465952Smartinh return btree_merge(bt, mp, neighbor); 25455d465952Smartinh } 25465d465952Smartinh } 25475d465952Smartinh 25485d465952Smartinh int 25495d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn, 25505d465952Smartinh struct btval *key, struct btval *data) 25515d465952Smartinh { 25525d465952Smartinh int rc, exact, close_txn = 0; 25535d465952Smartinh unsigned int ki; 25545d465952Smartinh struct node *leaf; 25555d465952Smartinh struct mpage *mp; 25565d465952Smartinh 25575d465952Smartinh DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data); 25585d465952Smartinh 25595d465952Smartinh assert(key != NULL); 25605d465952Smartinh 256159916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) { 256259916c99Smartinh errno = EINVAL; 256359916c99Smartinh return BT_FAIL; 256459916c99Smartinh } 256559916c99Smartinh 25667e04cf1eSmartinh if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 25677e04cf1eSmartinh errno = EINVAL; 25687e04cf1eSmartinh return BT_FAIL; 25697e04cf1eSmartinh } 25707e04cf1eSmartinh 257159916c99Smartinh if (bt == NULL) { 257259916c99Smartinh if (txn == NULL) { 257359916c99Smartinh errno = EINVAL; 257459916c99Smartinh return BT_FAIL; 257559916c99Smartinh } 257659916c99Smartinh bt = txn->bt; 257759916c99Smartinh } 257859916c99Smartinh 25795d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 25805d465952Smartinh errno = EINVAL; 25815d465952Smartinh return BT_FAIL; 25825d465952Smartinh } 25835d465952Smartinh 25845d465952Smartinh if (txn == NULL) { 25855d465952Smartinh close_txn = 1; 25865d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 25875d465952Smartinh return BT_FAIL; 25885d465952Smartinh } 25895d465952Smartinh 25905d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS) 25915d465952Smartinh goto done; 25925d465952Smartinh 25935d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki); 25945d465952Smartinh if (leaf == NULL || !exact) { 25950279e526Smartinh errno = ENOENT; 25960279e526Smartinh rc = BT_FAIL; 25975d465952Smartinh goto done; 25985d465952Smartinh } 25995d465952Smartinh 26005d465952Smartinh if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS) 26015d465952Smartinh goto done; 26025d465952Smartinh 26035d465952Smartinh btree_del_node(bt, mp, ki); 260437ee45a1Smartinh bt->meta.entries--; 26055d465952Smartinh rc = btree_rebalance(bt, mp); 26065d465952Smartinh if (rc != BT_SUCCESS) 26075d465952Smartinh txn->flags |= BT_TXN_ERROR; 26085d465952Smartinh 26095d465952Smartinh done: 26105d465952Smartinh if (close_txn) { 26115d465952Smartinh if (rc == BT_SUCCESS) 26125d465952Smartinh rc = btree_txn_commit(txn); 26135d465952Smartinh else 26145d465952Smartinh btree_txn_abort(txn); 26155d465952Smartinh } 26165d465952Smartinh mpage_prune(bt); 26175d465952Smartinh return rc; 26185d465952Smartinh } 26195d465952Smartinh 26205d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that 26215d465952Smartinh * still makes it uniquely distinguishable from <min>. 26225d465952Smartinh * 26235d465952Smartinh * <min> is guaranteed to be sorted less than <sep> 26245d465952Smartinh * 26255d465952Smartinh * On return, <sep> is modified to the minimum length. 26265d465952Smartinh */ 26275d465952Smartinh static void 26285d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep) 26295d465952Smartinh { 26305d465952Smartinh size_t n = 0; 26315d465952Smartinh char *p1; 26325d465952Smartinh char *p2; 26335d465952Smartinh 26345d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 26355d465952Smartinh 26365d465952Smartinh assert(sep->size > 0); 26375d465952Smartinh 26385d465952Smartinh p1 = (char *)NODEKEY(min) + min->ksize - 1; 26395d465952Smartinh p2 = (char *)sep->data + sep->size - 1; 26405d465952Smartinh 26415d465952Smartinh while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) { 26425d465952Smartinh assert(p2 > (char *)sep->data); 26435d465952Smartinh p1--; 26445d465952Smartinh p2--; 26455d465952Smartinh n++; 26465d465952Smartinh } 26475d465952Smartinh 26485d465952Smartinh sep->data = p2; 26495d465952Smartinh sep->size = n + 1; 26505d465952Smartinh } else { 26515d465952Smartinh 26525d465952Smartinh assert(min->ksize > 0); 26535d465952Smartinh assert(sep->size > 0); 26545d465952Smartinh 26555d465952Smartinh p1 = (char *)NODEKEY(min); 26565d465952Smartinh p2 = (char *)sep->data; 26575d465952Smartinh 26585d465952Smartinh while (*p1 == *p2) { 26595d465952Smartinh p1++; 26605d465952Smartinh p2++; 26615d465952Smartinh n++; 26625d465952Smartinh if (n == min->ksize || n == sep->size) 26635d465952Smartinh break; 26645d465952Smartinh } 26655d465952Smartinh 26665d465952Smartinh sep->size = n + 1; 26675d465952Smartinh } 26685d465952Smartinh 26695d465952Smartinh DPRINTF("reduced separator to [%.*s] > [%.*s]", 26705d465952Smartinh (int)sep->size, (char *)sep->data, 26715d465952Smartinh (int)min->ksize, (char *)NODEKEY(min)); 26725d465952Smartinh } 26735d465952Smartinh 26745d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or 26755d465952Smartinh * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and 26765d465952Smartinh * *newindxp with the actual values after split, ie if *mpp and *newindxp 26775d465952Smartinh * refer to a node in the new right sibling page. 26785d465952Smartinh */ 26795d465952Smartinh static int 26805d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp, 26815d465952Smartinh struct btval *newkey, struct btval *newdata, pgno_t newpgno) 26825d465952Smartinh { 26835d465952Smartinh uint8_t flags; 26845d465952Smartinh int rc = BT_SUCCESS, ins_new = 0; 26855d465952Smartinh indx_t newindx; 26865d465952Smartinh pgno_t pgno = 0; 26875d465952Smartinh size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff; 26885d465952Smartinh unsigned int i, j, split_indx; 26895d465952Smartinh struct node *node; 26905d465952Smartinh struct mpage *pright, *p, *mp; 26915d465952Smartinh struct btval sepkey, rkey, rdata; 26925d465952Smartinh struct btkey tmpkey; 269337ee45a1Smartinh struct page *copy; 26945d465952Smartinh 26955d465952Smartinh assert(bt != NULL); 26965d465952Smartinh assert(bt->txn != NULL); 26975d465952Smartinh 26985d465952Smartinh mp = *mpp; 26995d465952Smartinh newindx = *newindxp; 27005d465952Smartinh 27015d465952Smartinh DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i", 27025d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno, 27035d465952Smartinh (int)newkey->size, (char *)newkey->data, *newindxp); 27045d465952Smartinh DPRINTF("page %u has prefix [%.*s]", mp->pgno, 27055d465952Smartinh (int)mp->prefix.len, (char *)mp->prefix.str); 27065d465952Smartinh orig_pfx_len = mp->prefix.len; 27075d465952Smartinh 27085d465952Smartinh if (mp->parent == NULL) { 27095d465952Smartinh if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL) 27105d465952Smartinh return BT_FAIL; 27115d465952Smartinh mp->parent_index = 0; 27125d465952Smartinh bt->txn->root = mp->parent->pgno; 27135d465952Smartinh DPRINTF("root split! new root = %u", mp->parent->pgno); 271437ee45a1Smartinh bt->meta.depth++; 27155d465952Smartinh 27165d465952Smartinh /* Add left (implicit) pointer. */ 27175d465952Smartinh if (btree_add_node(bt, mp->parent, 0, NULL, NULL, 27185d465952Smartinh mp->pgno, 0) != BT_SUCCESS) 27195d465952Smartinh return BT_FAIL; 27205d465952Smartinh } else { 27215d465952Smartinh DPRINTF("parent branch page is %u", mp->parent->pgno); 27225d465952Smartinh } 27235d465952Smartinh 27245d465952Smartinh /* Create a right sibling. */ 272537ee45a1Smartinh if ((pright = btree_new_page(bt, mp->page->flags)) == NULL) 27265d465952Smartinh return BT_FAIL; 27275d465952Smartinh pright->parent = mp->parent; 27285d465952Smartinh pright->parent_index = mp->parent_index + 1; 27295d465952Smartinh DPRINTF("new right sibling: page %u", pright->pgno); 27305d465952Smartinh 27315d465952Smartinh /* Move half of the keys to the right sibling. */ 273237ee45a1Smartinh if ((copy = malloc(bt->head.psize)) == NULL) 273337ee45a1Smartinh return BT_FAIL; 273437ee45a1Smartinh bcopy(mp->page, copy, bt->head.psize); 27355d465952Smartinh assert(mp->ref == 0); /* XXX */ 273637ee45a1Smartinh bzero(&mp->page->ptrs, bt->head.psize - PAGEHDRSZ); 273737ee45a1Smartinh mp->page->lower = PAGEHDRSZ; 273837ee45a1Smartinh mp->page->upper = bt->head.psize; 27395d465952Smartinh 274037ee45a1Smartinh split_indx = NUMKEYSP(copy) / 2 + 1; 27415d465952Smartinh 27425d465952Smartinh /* First find the separating key between the split pages. 27435d465952Smartinh */ 27445d465952Smartinh bzero(&sepkey, sizeof(sepkey)); 27455d465952Smartinh if (newindx == split_indx) { 27465d465952Smartinh sepkey.size = newkey->size; 27475d465952Smartinh sepkey.data = newkey->data; 27485d465952Smartinh remove_prefix(bt, &sepkey, mp->prefix.len); 27495d465952Smartinh } else { 275037ee45a1Smartinh node = NODEPTRP(copy, split_indx); 27515d465952Smartinh sepkey.size = node->ksize; 27525d465952Smartinh sepkey.data = NODEKEY(node); 27535d465952Smartinh } 27545d465952Smartinh 27555d465952Smartinh if (IS_LEAF(mp) && bt->cmp == NULL) { 27565d465952Smartinh /* Find the smallest separator. */ 27575d465952Smartinh /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */ 275837ee45a1Smartinh node = NODEPTRP(copy, split_indx - 1); 27595d465952Smartinh bt_reduce_separator(bt, node, &sepkey); 27605d465952Smartinh } 27615d465952Smartinh 27625d465952Smartinh /* Fix separator wrt parent prefix. */ 27635d465952Smartinh if (bt->cmp == NULL) { 27645d465952Smartinh tmpkey.len = sizeof(tmpkey.str); 27655d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len, 27665d465952Smartinh sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len); 27675d465952Smartinh sepkey.data = tmpkey.str; 27685d465952Smartinh sepkey.size = tmpkey.len; 27695d465952Smartinh } 27705d465952Smartinh 27715d465952Smartinh DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data); 27725d465952Smartinh 27735d465952Smartinh /* Copy separator key to the parent. 27745d465952Smartinh */ 277537ee45a1Smartinh if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) { 27765d465952Smartinh rc = btree_split(bt, &pright->parent, &pright->parent_index, 27775d465952Smartinh &sepkey, NULL, pright->pgno); 27785d465952Smartinh 27795d465952Smartinh /* Right page might now have changed parent. 27805d465952Smartinh * Check if left page also changed parent. 27815d465952Smartinh */ 27825d465952Smartinh if (pright->parent != mp->parent && 27835d465952Smartinh mp->parent_index >= NUMKEYS(mp->parent)) { 27845d465952Smartinh mp->parent = pright->parent; 27855d465952Smartinh mp->parent_index = pright->parent_index - 1; 27865d465952Smartinh } 27875d465952Smartinh } else { 27885d465952Smartinh remove_prefix(bt, &sepkey, pright->parent->prefix.len); 27895d465952Smartinh rc = btree_add_node(bt, pright->parent, pright->parent_index, 27905d465952Smartinh &sepkey, NULL, pright->pgno, 0); 27915d465952Smartinh } 279237ee45a1Smartinh if (rc != BT_SUCCESS) { 279337ee45a1Smartinh free(copy); 27945d465952Smartinh return BT_FAIL; 279537ee45a1Smartinh } 27965d465952Smartinh 27975d465952Smartinh /* Update prefix for right and left page, if the parent was split. 27985d465952Smartinh */ 27995d465952Smartinh find_common_prefix(bt, pright); 28005d465952Smartinh assert(orig_pfx_len <= pright->prefix.len); 28015d465952Smartinh right_pfx_diff = pright->prefix.len - orig_pfx_len; 28025d465952Smartinh 28035d465952Smartinh find_common_prefix(bt, mp); 28045d465952Smartinh assert(orig_pfx_len <= mp->prefix.len); 28055d465952Smartinh left_pfx_diff = mp->prefix.len - orig_pfx_len; 28065d465952Smartinh 280737ee45a1Smartinh for (i = j = 0; i <= NUMKEYSP(copy); j++) { 28085d465952Smartinh if (i < split_indx) { 28095d465952Smartinh /* Re-insert in left sibling. */ 28105d465952Smartinh p = mp; 28115d465952Smartinh pfx_diff = left_pfx_diff; 28125d465952Smartinh } else { 28135d465952Smartinh /* Insert in right sibling. */ 28145d465952Smartinh if (i == split_indx) 28155d465952Smartinh /* Reset insert index for right sibling. */ 28165d465952Smartinh j = (i == newindx && ins_new); 28175d465952Smartinh p = pright; 28185d465952Smartinh pfx_diff = right_pfx_diff; 28195d465952Smartinh } 28205d465952Smartinh 28215d465952Smartinh if (i == newindx && !ins_new) { 28225d465952Smartinh /* Insert the original entry that caused the split. */ 28235d465952Smartinh rkey.data = newkey->data; 28245d465952Smartinh rkey.size = newkey->size; 28255d465952Smartinh if (IS_LEAF(mp)) { 28265d465952Smartinh rdata.data = newdata->data; 28275d465952Smartinh rdata.size = newdata->size; 28285d465952Smartinh } else 28295d465952Smartinh pgno = newpgno; 28305d465952Smartinh flags = 0; 28315d465952Smartinh pfx_diff = p->prefix.len; 28325d465952Smartinh 28335d465952Smartinh ins_new = 1; 28345d465952Smartinh 28355d465952Smartinh /* Update page and index for the new key. */ 28365d465952Smartinh *newindxp = j; 28375d465952Smartinh *mpp = p; 283837ee45a1Smartinh } else if (i == NUMKEYSP(copy)) { 28395d465952Smartinh break; 28405d465952Smartinh } else { 284137ee45a1Smartinh node = NODEPTRP(copy, i); 28425d465952Smartinh rkey.data = NODEKEY(node); 28435d465952Smartinh rkey.size = node->ksize; 28445d465952Smartinh if (IS_LEAF(mp)) { 28455d465952Smartinh rdata.data = NODEDATA(node); 28465d465952Smartinh rdata.size = node->n_dsize; 28475d465952Smartinh } else 28485d465952Smartinh pgno = node->n_pgno; 28495d465952Smartinh flags = node->flags; 28505d465952Smartinh 28515d465952Smartinh i++; 28525d465952Smartinh } 28535d465952Smartinh 28545d465952Smartinh if (!IS_LEAF(mp) && j == 0) { 28555d465952Smartinh /* First branch index doesn't need key data. */ 28565d465952Smartinh rkey.size = 0; 28575d465952Smartinh } else 28585d465952Smartinh remove_prefix(bt, &rkey, pfx_diff); 28595d465952Smartinh 28605d465952Smartinh rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags); 28615d465952Smartinh } 28625d465952Smartinh 286337ee45a1Smartinh free(copy); 28645d465952Smartinh return rc; 28655d465952Smartinh } 28665d465952Smartinh 28675d465952Smartinh int 28685d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn, 28695d465952Smartinh struct btval *key, struct btval *data, unsigned int flags) 28705d465952Smartinh { 28715d465952Smartinh int rc = BT_SUCCESS, exact, close_txn = 0; 28725d465952Smartinh unsigned int ki; 28735d465952Smartinh struct node *leaf; 28745d465952Smartinh struct mpage *mp; 28755d465952Smartinh struct btval xkey; 28765d465952Smartinh 28775d465952Smartinh assert(key != NULL); 28785d465952Smartinh assert(data != NULL); 28795d465952Smartinh 288059916c99Smartinh if (bt != NULL && txn != NULL && bt != txn->bt) { 288159916c99Smartinh errno = EINVAL; 288259916c99Smartinh return BT_FAIL; 288359916c99Smartinh } 288459916c99Smartinh 28857e04cf1eSmartinh if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) { 28867e04cf1eSmartinh errno = EINVAL; 28877e04cf1eSmartinh return BT_FAIL; 28887e04cf1eSmartinh } 28897e04cf1eSmartinh 289059916c99Smartinh if (bt == NULL) { 289159916c99Smartinh if (txn == NULL) { 289259916c99Smartinh errno = EINVAL; 289359916c99Smartinh return BT_FAIL; 289459916c99Smartinh } 289559916c99Smartinh bt = txn->bt; 289659916c99Smartinh } 289759916c99Smartinh 28985d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 28995d465952Smartinh errno = EINVAL; 29005d465952Smartinh return BT_FAIL; 29015d465952Smartinh } 29025d465952Smartinh 29035d465952Smartinh DPRINTF("==> put key %.*s, size %zu, data size %zu", 29045d465952Smartinh (int)key->size, (char *)key->data, key->size, data->size); 29055d465952Smartinh 29065d465952Smartinh if (txn == NULL) { 29075d465952Smartinh close_txn = 1; 29085d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 29095d465952Smartinh return BT_FAIL; 29105d465952Smartinh } 29115d465952Smartinh 29125d465952Smartinh rc = btree_search_page(bt, txn, key, NULL, 1, &mp); 29135d465952Smartinh if (rc == BT_SUCCESS) { 29145d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki); 29155d465952Smartinh if (leaf && exact) { 29165d465952Smartinh if (F_ISSET(flags, BT_NOOVERWRITE)) { 29175d465952Smartinh DPRINTF("duplicate key %.*s", 29185d465952Smartinh (int)key->size, (char *)key->data); 29190279e526Smartinh errno = EEXIST; 29200279e526Smartinh rc = BT_FAIL; 29215d465952Smartinh goto done; 29225d465952Smartinh } 29235d465952Smartinh btree_del_node(bt, mp, ki); 29245d465952Smartinh } 29255d465952Smartinh if (leaf == NULL) { /* append if not found */ 29265d465952Smartinh ki = NUMKEYS(mp); 29275d465952Smartinh DPRINTF("appending key at index %i", ki); 29285d465952Smartinh } 29290279e526Smartinh } else if (errno == ENOENT) { 29305d465952Smartinh /* new file, just write a root leaf page */ 29315d465952Smartinh DPRINTF("allocating new root leaf page"); 29325d465952Smartinh if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { 29335d465952Smartinh rc = BT_FAIL; 29345d465952Smartinh goto done; 29355d465952Smartinh } 29365d465952Smartinh txn->root = mp->pgno; 293737ee45a1Smartinh bt->meta.depth++; 29385d465952Smartinh ki = 0; 29395d465952Smartinh } 29405d465952Smartinh else 29415d465952Smartinh goto done; 29425d465952Smartinh 29435d465952Smartinh assert(IS_LEAF(mp)); 29445d465952Smartinh DPRINTF("there are %lu keys, should insert new key at index %i", 29455d465952Smartinh NUMKEYS(mp), ki); 29465d465952Smartinh 29475d465952Smartinh /* Copy the key pointer as it is modified by the prefix code. The 29485d465952Smartinh * caller might have malloc'ed the data. 29495d465952Smartinh */ 29505d465952Smartinh xkey.data = key->data; 29515d465952Smartinh xkey.size = key->size; 29525d465952Smartinh 295337ee45a1Smartinh if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) { 29545d465952Smartinh rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID); 29555d465952Smartinh } else { 29565d465952Smartinh /* There is room already in this leaf page. */ 29575d465952Smartinh remove_prefix(bt, &xkey, mp->prefix.len); 29585d465952Smartinh rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0); 29595d465952Smartinh } 29605d465952Smartinh 29615d465952Smartinh if (rc != BT_SUCCESS) 29625d465952Smartinh txn->flags |= BT_TXN_ERROR; 29635d465952Smartinh else 296437ee45a1Smartinh bt->meta.entries++; 29655d465952Smartinh 29665d465952Smartinh done: 29675d465952Smartinh if (close_txn) { 29685d465952Smartinh if (rc == BT_SUCCESS) 29695d465952Smartinh rc = btree_txn_commit(txn); 29705d465952Smartinh else 29715d465952Smartinh btree_txn_abort(txn); 29725d465952Smartinh } 29735d465952Smartinh mpage_prune(bt); 29745d465952Smartinh return rc; 29755d465952Smartinh } 29765d465952Smartinh 29775d465952Smartinh static pgno_t 297837ee45a1Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc) 29795d465952Smartinh { 298037ee45a1Smartinh ssize_t rc; 29815d465952Smartinh indx_t i; 29825d465952Smartinh pgno_t *pnext; 29835d465952Smartinh struct node *node; 29845d465952Smartinh struct page *p; 29855d465952Smartinh struct mpage *mp; 29865d465952Smartinh 298737ee45a1Smartinh /* Get the page and make a copy of it. 298837ee45a1Smartinh */ 298937ee45a1Smartinh if ((mp = btree_get_mpage(bt, pgno)) == NULL) 29905d465952Smartinh return P_INVALID; 299137ee45a1Smartinh if ((p = malloc(bt->head.psize)) == NULL) 299237ee45a1Smartinh return P_INVALID; 299337ee45a1Smartinh bcopy(mp->page, p, bt->head.psize); 29945d465952Smartinh 299537ee45a1Smartinh /* Go through all nodes in the (copied) page and update the 299637ee45a1Smartinh * page pointers. 299737ee45a1Smartinh */ 29985d465952Smartinh if (F_ISSET(p->flags, P_BRANCH)) { 29995d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) { 30005d465952Smartinh node = NODEPTRP(p, i); 300137ee45a1Smartinh node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc); 300237ee45a1Smartinh if (node->n_pgno == P_INVALID) { 300337ee45a1Smartinh free(p); 30045d465952Smartinh return P_INVALID; 30055d465952Smartinh } 300637ee45a1Smartinh } 30075d465952Smartinh } else if (F_ISSET(p->flags, P_LEAF)) { 30085d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) { 30095d465952Smartinh node = NODEPTRP(p, i); 30105d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA)) { 30115d465952Smartinh pnext = NODEDATA(node); 301237ee45a1Smartinh *pnext = btree_compact_tree(bt, *pnext, btc); 301337ee45a1Smartinh if (*pnext == P_INVALID) { 301437ee45a1Smartinh free(p); 30155d465952Smartinh return P_INVALID; 30165d465952Smartinh } 30175d465952Smartinh } 301837ee45a1Smartinh } 30195d465952Smartinh } else if (F_ISSET(p->flags, P_OVERFLOW)) { 30205d465952Smartinh pnext = &p->p_next_pgno; 30215d465952Smartinh if (*pnext > 0) { 302237ee45a1Smartinh *pnext = btree_compact_tree(bt, *pnext, btc); 302337ee45a1Smartinh if (*pnext == P_INVALID) { 302437ee45a1Smartinh free(p); 30255d465952Smartinh return P_INVALID; 30265d465952Smartinh } 302737ee45a1Smartinh } 30285d465952Smartinh } else 30295d465952Smartinh assert(0); 30305d465952Smartinh 303137ee45a1Smartinh pgno = p->pgno = btc->txn->next_pgno++; 303237ee45a1Smartinh rc = write(btc->fd, p, bt->head.psize); 303337ee45a1Smartinh free(p); 3034*36e226d9Smartinh if (rc != (ssize_t)bt->head.psize) 30355d465952Smartinh return P_INVALID; 303637ee45a1Smartinh return pgno; 30375d465952Smartinh } 30385d465952Smartinh 30395d465952Smartinh int 30405d465952Smartinh btree_compact(struct btree *bt) 30415d465952Smartinh { 30425d465952Smartinh char *compact_path = NULL; 304337ee45a1Smartinh struct btree *btc; 304437ee45a1Smartinh struct btree_txn *txn, *txnc = NULL; 304537ee45a1Smartinh int fd; 304637ee45a1Smartinh pgno_t root; 30475d465952Smartinh 30485d465952Smartinh assert(bt != NULL); 30495d465952Smartinh 30500279e526Smartinh DPRINTF("compacting btree %p with path %s", bt, bt->path); 30510279e526Smartinh 30520279e526Smartinh if (bt->path == NULL) { 30530279e526Smartinh errno = EINVAL; 30540279e526Smartinh return BT_FAIL; 30550279e526Smartinh } 30560279e526Smartinh 30570279e526Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 30585d465952Smartinh return BT_FAIL; 30595d465952Smartinh 30605d465952Smartinh asprintf(&compact_path, "%s.compact.XXXXXX", bt->path); 30615d465952Smartinh fd = mkstemp(compact_path); 30625d465952Smartinh if (fd == -1) { 30635d465952Smartinh free(compact_path); 30640279e526Smartinh btree_txn_abort(txn); 30655d465952Smartinh return BT_FAIL; 30665d465952Smartinh } 30675d465952Smartinh 306837ee45a1Smartinh if ((btc = btree_open_fd(fd, 0)) == NULL) 306937ee45a1Smartinh goto failed; 30705d465952Smartinh 307137ee45a1Smartinh if ((txnc = btree_txn_begin(btc, 0)) == NULL) 307237ee45a1Smartinh goto failed; 307337ee45a1Smartinh 307437ee45a1Smartinh if (bt->meta.root != P_INVALID) { 307537ee45a1Smartinh root = btree_compact_tree(bt, bt->meta.root, btc); 30765d465952Smartinh if (root == P_INVALID) 30775d465952Smartinh goto failed; 307837ee45a1Smartinh if (btree_write_meta(btc, root, 0) != BT_SUCCESS) 30795d465952Smartinh goto failed; 308037ee45a1Smartinh } 30815d465952Smartinh 30825d465952Smartinh fsync(fd); 30835d465952Smartinh 30845d465952Smartinh DPRINTF("renaming %s to %s", compact_path, bt->path); 30855d465952Smartinh if (rename(compact_path, bt->path) != 0) 30865d465952Smartinh goto failed; 30875d465952Smartinh 3088e14f8e24Smartinh /* Write a "tombstone" meta page so other processes can pick up 3089e14f8e24Smartinh * the change and re-open the file. 3090e14f8e24Smartinh */ 3091e14f8e24Smartinh if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS) 3092e14f8e24Smartinh goto failed; 30935d465952Smartinh 3094e14f8e24Smartinh btree_txn_abort(txn); 309537ee45a1Smartinh btree_txn_abort(txnc); 30965d465952Smartinh free(compact_path); 309737ee45a1Smartinh btree_close(btc); 309837ee45a1Smartinh return 0; 30995d465952Smartinh 31005d465952Smartinh failed: 31015d465952Smartinh btree_txn_abort(txn); 310237ee45a1Smartinh btree_txn_abort(txnc); 31035d465952Smartinh unlink(compact_path); 31045d465952Smartinh free(compact_path); 310537ee45a1Smartinh btree_close(btc); 31065d465952Smartinh return BT_FAIL; 31075d465952Smartinh } 31085d465952Smartinh 31095d465952Smartinh /* Reverts the last change. Truncates the file at the last root page. 31105d465952Smartinh */ 31115d465952Smartinh int 31125d465952Smartinh btree_revert(struct btree *bt) 31135d465952Smartinh { 311437ee45a1Smartinh if (btree_read_meta(bt, NULL) != 0) 311537ee45a1Smartinh return -1; 31165d465952Smartinh 311737ee45a1Smartinh DPRINTF("truncating file at page %u", bt->meta.root); 311837ee45a1Smartinh return ftruncate(bt->fd, bt->head.psize * bt->meta.root); 31195d465952Smartinh } 31205d465952Smartinh 31215d465952Smartinh void 31225d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size) 31235d465952Smartinh { 31245d465952Smartinh bt->stat.max_cache = cache_size; 31255d465952Smartinh } 31265d465952Smartinh 31275d465952Smartinh unsigned int 31285d465952Smartinh btree_get_flags(struct btree *bt) 31295d465952Smartinh { 31305d465952Smartinh return (bt->flags & ~BT_FIXPADDING); 31315d465952Smartinh } 31325d465952Smartinh 31335d465952Smartinh const char * 31345d465952Smartinh btree_get_path(struct btree *bt) 31355d465952Smartinh { 31365d465952Smartinh return bt->path; 31375d465952Smartinh } 31385d465952Smartinh 31395d465952Smartinh const struct btree_stat * 31405d465952Smartinh btree_stat(struct btree *bt) 31415d465952Smartinh { 314237ee45a1Smartinh bt->stat.branch_pages = bt->meta.branch_pages; 314337ee45a1Smartinh bt->stat.leaf_pages = bt->meta.leaf_pages; 314437ee45a1Smartinh bt->stat.overflow_pages = bt->meta.overflow_pages; 314537ee45a1Smartinh bt->stat.revisions = bt->meta.revisions; 314637ee45a1Smartinh bt->stat.depth = bt->meta.depth; 314737ee45a1Smartinh bt->stat.entries = bt->meta.entries; 314837ee45a1Smartinh bt->stat.psize = bt->head.psize; 314937ee45a1Smartinh bt->stat.created_at = bt->meta.created_at; 31505d465952Smartinh 31515d465952Smartinh return &bt->stat; 31525d465952Smartinh } 31535d465952Smartinh 3154