1*5d465952Smartinh /* $OpenBSD: btree.c,v 1.1 2010/05/31 17:36:31 martinh Exp $ */ 2*5d465952Smartinh 3*5d465952Smartinh /* 4*5d465952Smartinh * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se> 5*5d465952Smartinh * 6*5d465952Smartinh * Permission to use, copy, modify, and distribute this software for any 7*5d465952Smartinh * purpose with or without fee is hereby granted, provided that the above 8*5d465952Smartinh * copyright notice and this permission notice appear in all copies. 9*5d465952Smartinh * 10*5d465952Smartinh * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11*5d465952Smartinh * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12*5d465952Smartinh * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13*5d465952Smartinh * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14*5d465952Smartinh * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15*5d465952Smartinh * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16*5d465952Smartinh * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17*5d465952Smartinh */ 18*5d465952Smartinh 19*5d465952Smartinh #include <sys/types.h> 20*5d465952Smartinh #include <sys/tree.h> 21*5d465952Smartinh #include <sys/queue.h> 22*5d465952Smartinh #include <sys/param.h> 23*5d465952Smartinh #include <sys/uio.h> 24*5d465952Smartinh 25*5d465952Smartinh #include <assert.h> 26*5d465952Smartinh #include <err.h> 27*5d465952Smartinh #include <errno.h> 28*5d465952Smartinh #include <fcntl.h> 29*5d465952Smartinh #include <stddef.h> 30*5d465952Smartinh #include <stdint.h> 31*5d465952Smartinh #include <stdio.h> 32*5d465952Smartinh #include <stdlib.h> 33*5d465952Smartinh #include <string.h> 34*5d465952Smartinh #include <time.h> 35*5d465952Smartinh #include <unistd.h> 36*5d465952Smartinh 37*5d465952Smartinh #include "btree.h" 38*5d465952Smartinh 39*5d465952Smartinh /* #define DEBUG */ 40*5d465952Smartinh 41*5d465952Smartinh #ifdef DEBUG 42*5d465952Smartinh # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ 43*5d465952Smartinh fprintf(stderr, __VA_ARGS__); \ 44*5d465952Smartinh fprintf(stderr, "\n"); } while(0) 45*5d465952Smartinh #else 46*5d465952Smartinh # define DPRINTF(...) 47*5d465952Smartinh #endif 48*5d465952Smartinh 49*5d465952Smartinh #define PAGESIZE 4096 50*5d465952Smartinh #define BT_MINKEYS 4 51*5d465952Smartinh #define BT_MAGIC 0xB3DBB3DB 52*5d465952Smartinh #define BT_VERSION 3 53*5d465952Smartinh #define MAXKEYSIZE 255 54*5d465952Smartinh 55*5d465952Smartinh #define P_INVALID 0xFFFFFFFF 56*5d465952Smartinh 57*5d465952Smartinh #define F_ISSET(w, f) (((w) & (f)) == (f)) 58*5d465952Smartinh 59*5d465952Smartinh /* There are four page types: meta, index, leaf and overflow. 60*5d465952Smartinh * They all share the same page header. 61*5d465952Smartinh */ 62*5d465952Smartinh struct page { /* represents an on-disk page */ 63*5d465952Smartinh pgno_t pgno; 64*5d465952Smartinh #define P_BRANCH 0x01 /* branch page */ 65*5d465952Smartinh #define P_LEAF 0x02 /* leaf page */ 66*5d465952Smartinh #define P_OVERFLOW 0x04 /* overflow page */ 67*5d465952Smartinh #define P_META 0x08 /* meta page */ 68*5d465952Smartinh uint32_t flags; 69*5d465952Smartinh #define lower b.fb.fb_lower 70*5d465952Smartinh #define upper b.fb.fb_upper 71*5d465952Smartinh #define p_next_pgno b.pb_next_pgno 72*5d465952Smartinh union page_bounds { 73*5d465952Smartinh struct { 74*5d465952Smartinh indx_t fb_lower; /* lower bound of free space */ 75*5d465952Smartinh indx_t fb_upper; /* upper bound of free space */ 76*5d465952Smartinh } fb; 77*5d465952Smartinh pgno_t pb_next_pgno; /* overflow page linked list */ 78*5d465952Smartinh } b; 79*5d465952Smartinh #define PAGEHDRSZ (sizeof(pgno_t) + sizeof(uint32_t) + \ 80*5d465952Smartinh sizeof(union page_bounds)) 81*5d465952Smartinh indx_t ptrs[(PAGESIZE - PAGEHDRSZ) / sizeof(indx_t)]; 82*5d465952Smartinh }; 83*5d465952Smartinh 84*5d465952Smartinh #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1) 85*5d465952Smartinh #define NUMKEYS(mp) (((mp)->page.lower - PAGEHDRSZ) >> 1) 86*5d465952Smartinh #define SIZELEFT(mp) (indx_t)((mp)->page.upper - (mp)->page.lower) 87*5d465952Smartinh #define PAGEFILL(mp) ((float)(PAGESIZE - PAGEHDRSZ - SIZELEFT(mp)) / \ 88*5d465952Smartinh (PAGESIZE - PAGEHDRSZ)) 89*5d465952Smartinh #define IS_LEAF(mp) F_ISSET((mp)->page.flags, P_LEAF) 90*5d465952Smartinh #define IS_BRANCH(mp) F_ISSET((mp)->page.flags, P_BRANCH) 91*5d465952Smartinh #define IS_OVERFLOW(mp) F_ISSET((mp)->page.flags, P_OVERFLOW) 92*5d465952Smartinh 93*5d465952Smartinh struct bt_meta { /* meta (footer) page content */ 94*5d465952Smartinh uint32_t magic; /* really needed? */ 95*5d465952Smartinh uint32_t version; 96*5d465952Smartinh uint32_t flags; 97*5d465952Smartinh uint32_t psize; /* page size */ 98*5d465952Smartinh pgno_t root; /* page number of root page */ 99*5d465952Smartinh pgno_t prev_meta; /* previous meta page number */ 100*5d465952Smartinh time_t created_at; 101*5d465952Smartinh uint32_t branch_pages; 102*5d465952Smartinh uint32_t leaf_pages; 103*5d465952Smartinh uint32_t overflow_pages; 104*5d465952Smartinh uint32_t revisions; 105*5d465952Smartinh uint32_t depth; 106*5d465952Smartinh uint64_t entries; 107*5d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH]; 108*5d465952Smartinh }; 109*5d465952Smartinh 110*5d465952Smartinh struct btkey { 111*5d465952Smartinh size_t len; 112*5d465952Smartinh char str[MAXKEYSIZE]; 113*5d465952Smartinh }; 114*5d465952Smartinh 115*5d465952Smartinh struct mpage { /* an in-memory cached page */ 116*5d465952Smartinh RB_ENTRY(mpage) entry; /* page cache entry */ 117*5d465952Smartinh SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */ 118*5d465952Smartinh TAILQ_ENTRY(mpage) lru_next; /* LRU queue */ 119*5d465952Smartinh struct mpage *parent; /* NULL if root */ 120*5d465952Smartinh unsigned int parent_index; /* keep track of node index */ 121*5d465952Smartinh struct btkey prefix; 122*5d465952Smartinh struct page page; 123*5d465952Smartinh pgno_t pgno; /* copy of page->pgno */ 124*5d465952Smartinh short ref; /* increased by cursors */ 125*5d465952Smartinh short dirty; /* 1 if on dirty queue */ 126*5d465952Smartinh }; 127*5d465952Smartinh RB_HEAD(page_cache, mpage); 128*5d465952Smartinh SIMPLEQ_HEAD(dirty_queue, mpage); 129*5d465952Smartinh TAILQ_HEAD(lru_queue, mpage); 130*5d465952Smartinh 131*5d465952Smartinh static int mpage_cmp(struct mpage *a, struct mpage *b); 132*5d465952Smartinh static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno); 133*5d465952Smartinh static void mpage_add(struct btree *bt, struct mpage *mp); 134*5d465952Smartinh static void mpage_del(struct btree *bt, struct mpage *mp); 135*5d465952Smartinh static struct mpage *mpage_copy(struct mpage *mp); 136*5d465952Smartinh static void mpage_prune(struct btree *bt); 137*5d465952Smartinh static void mpage_dirty(struct btree *bt, struct mpage *mp); 138*5d465952Smartinh static void mpage_touch(struct btree *bt, struct mpage *mp); 139*5d465952Smartinh 140*5d465952Smartinh RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp); 141*5d465952Smartinh RB_GENERATE(page_cache, mpage, entry, mpage_cmp); 142*5d465952Smartinh 143*5d465952Smartinh struct ppage { /* ordered list of pages */ 144*5d465952Smartinh SLIST_ENTRY(ppage) entry; 145*5d465952Smartinh struct mpage *mpage; 146*5d465952Smartinh unsigned int ki; /* cursor index on page */ 147*5d465952Smartinh }; 148*5d465952Smartinh SLIST_HEAD(page_stack, ppage); 149*5d465952Smartinh 150*5d465952Smartinh #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack) 151*5d465952Smartinh #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack) 152*5d465952Smartinh #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry) 153*5d465952Smartinh #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry) 154*5d465952Smartinh 155*5d465952Smartinh struct cursor { 156*5d465952Smartinh struct btree *bt; 157*5d465952Smartinh struct btree_txn *txn; 158*5d465952Smartinh struct page_stack stack; /* stack of parent pages */ 159*5d465952Smartinh short initialized; /* 1 if initialized */ 160*5d465952Smartinh short eof; /* 1 if end is reached */ 161*5d465952Smartinh }; 162*5d465952Smartinh 163*5d465952Smartinh #define METAHASHLEN offsetof(struct bt_meta, hash) 164*5d465952Smartinh #define METADATA(p) ((struct bt_meta *)((char *)p + PAGEHDRSZ)) 165*5d465952Smartinh 166*5d465952Smartinh struct node { 167*5d465952Smartinh #define n_pgno p.np_pgno 168*5d465952Smartinh #define n_dsize p.np_dsize 169*5d465952Smartinh union { 170*5d465952Smartinh pgno_t np_pgno; /* child page number */ 171*5d465952Smartinh uint32_t np_dsize; /* leaf data size */ 172*5d465952Smartinh } p; 173*5d465952Smartinh uint16_t ksize; /* key size */ 174*5d465952Smartinh #define F_BIGDATA 0x01 /* data put on overflow page */ 175*5d465952Smartinh uint8_t flags; 176*5d465952Smartinh char data[]; 177*5d465952Smartinh }; 178*5d465952Smartinh 179*5d465952Smartinh struct btree_txn { 180*5d465952Smartinh pgno_t root; /* current / new root page */ 181*5d465952Smartinh pgno_t next_pgno; /* next unallocated page */ 182*5d465952Smartinh struct btree *bt; /* btree is ref'd */ 183*5d465952Smartinh struct dirty_queue *dirty_queue; /* modified pages */ 184*5d465952Smartinh #define BT_TXN_RDONLY 0x01 /* read-only transaction */ 185*5d465952Smartinh #define BT_TXN_ERROR 0x02 /* an error has occurred */ 186*5d465952Smartinh unsigned int flags; 187*5d465952Smartinh }; 188*5d465952Smartinh 189*5d465952Smartinh struct btree { 190*5d465952Smartinh int fd; 191*5d465952Smartinh char *path; 192*5d465952Smartinh #define BT_FIXPADDING 0x01 /* internal */ 193*5d465952Smartinh unsigned int flags; 194*5d465952Smartinh bt_cmp_func cmp; /* user compare function */ 195*5d465952Smartinh struct page *metapage; /* current last meta page */ 196*5d465952Smartinh struct bt_meta *meta; 197*5d465952Smartinh struct page_cache *page_cache; 198*5d465952Smartinh struct lru_queue *lru_queue; 199*5d465952Smartinh struct btree_txn *txn; /* current write transaction */ 200*5d465952Smartinh int ref; /* increased by cursors */ 201*5d465952Smartinh struct btree_stat stat; 202*5d465952Smartinh off_t size; /* current file size */ 203*5d465952Smartinh }; 204*5d465952Smartinh 205*5d465952Smartinh #define NODESIZE offsetof(struct node, data) 206*5d465952Smartinh 207*5d465952Smartinh #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->size)) 208*5d465952Smartinh #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size) 209*5d465952Smartinh #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i])) 210*5d465952Smartinh #define NODEPTR(mp, i) NODEPTRP(&(mp)->page, i) 211*5d465952Smartinh #define NODEKEY(node) (void *)((node)->data) 212*5d465952Smartinh #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize) 213*5d465952Smartinh #define NODEPGNO(node) ((node)->p.np_pgno) 214*5d465952Smartinh #define NODEDSZ(node) ((node)->p.np_dsize) 215*5d465952Smartinh 216*5d465952Smartinh #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */ 217*5d465952Smartinh #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ 218*5d465952Smartinh 219*5d465952Smartinh static int btree_read_page(struct btree *bt, pgno_t pgno, 220*5d465952Smartinh struct page *page); 221*5d465952Smartinh static int btree_get_mpage(struct btree *bt, pgno_t pgno, 222*5d465952Smartinh struct mpage **mpp); 223*5d465952Smartinh static int btree_search_page_root(struct btree *bt, 224*5d465952Smartinh struct mpage *root, struct btval *key, 225*5d465952Smartinh struct cursor *cursor, int modify, 226*5d465952Smartinh struct mpage **mpp); 227*5d465952Smartinh static int btree_search_page(struct btree *bt, 228*5d465952Smartinh struct btree_txn *txn, struct btval *key, 229*5d465952Smartinh struct cursor *cursor, int modify, 230*5d465952Smartinh struct mpage **mpp); 231*5d465952Smartinh 232*5d465952Smartinh static void btree_init_meta(struct btree *bt); 233*5d465952Smartinh static int btree_is_meta_page(struct page *p); 234*5d465952Smartinh static int btree_read_meta(struct btree *bt, pgno_t *p_next); 235*5d465952Smartinh static int btree_write_meta(struct btree *bt, pgno_t root); 236*5d465952Smartinh 237*5d465952Smartinh static struct node *btree_search_node(struct btree *bt, struct mpage *mp, 238*5d465952Smartinh struct btval *key, int *exactp, unsigned int *kip); 239*5d465952Smartinh static int btree_add_node(struct btree *bt, struct mpage *mp, 240*5d465952Smartinh indx_t indx, struct btval *key, struct btval *data, 241*5d465952Smartinh pgno_t pgno, uint8_t flags); 242*5d465952Smartinh static void btree_del_node(struct btree *bt, struct mpage *mp, 243*5d465952Smartinh indx_t indx); 244*5d465952Smartinh static int btree_read_data(struct btree *bt, struct mpage *mp, 245*5d465952Smartinh struct node *leaf, struct btval *data); 246*5d465952Smartinh 247*5d465952Smartinh static int btree_rebalance(struct btree *bt, struct mpage *mp); 248*5d465952Smartinh static int btree_update_key(struct btree *bt, struct mpage *mp, 249*5d465952Smartinh indx_t indx, struct btval *key); 250*5d465952Smartinh static int btree_adjust_prefix(struct btree *bt, 251*5d465952Smartinh struct mpage *src, int delta); 252*5d465952Smartinh static int btree_move_node(struct btree *bt, struct mpage *src, 253*5d465952Smartinh indx_t srcindx, struct mpage *dst, indx_t dstindx); 254*5d465952Smartinh static int btree_merge(struct btree *bt, struct mpage *src, 255*5d465952Smartinh struct mpage *dst); 256*5d465952Smartinh static int btree_split(struct btree *bt, struct mpage **mpp, 257*5d465952Smartinh unsigned int *newindxp, struct btval *newkey, 258*5d465952Smartinh struct btval *newdata, pgno_t newpgno); 259*5d465952Smartinh static struct mpage *btree_new_page(struct btree *bt, uint32_t flags); 260*5d465952Smartinh static int btree_write_overflow_data(struct btree *bt, 261*5d465952Smartinh struct page *p, struct btval *data); 262*5d465952Smartinh 263*5d465952Smartinh static void cursor_pop_page(struct cursor *cursor); 264*5d465952Smartinh static struct ppage *cursor_push_page(struct cursor *cursor, 265*5d465952Smartinh struct mpage *mp); 266*5d465952Smartinh 267*5d465952Smartinh static int bt_set_key(struct btree *bt, struct mpage *mp, 268*5d465952Smartinh struct node *node, struct btval *key); 269*5d465952Smartinh static int btree_sibling(struct cursor *cursor, int move_right); 270*5d465952Smartinh static int btree_cursor_next(struct cursor *cursor, 271*5d465952Smartinh struct btval *key, struct btval *data); 272*5d465952Smartinh static int btree_cursor_set(struct cursor *cursor, 273*5d465952Smartinh struct btval *key, struct btval *data); 274*5d465952Smartinh static int btree_cursor_first(struct cursor *cursor, 275*5d465952Smartinh struct btval *key, struct btval *data); 276*5d465952Smartinh 277*5d465952Smartinh static void bt_reduce_separator(struct btree *bt, struct node *min, 278*5d465952Smartinh struct btval *sep); 279*5d465952Smartinh static void remove_prefix(struct btree *bt, struct btval *key, 280*5d465952Smartinh size_t pfxlen); 281*5d465952Smartinh static void expand_prefix(struct btree *bt, struct mpage *mp, 282*5d465952Smartinh indx_t indx, struct btkey *expkey); 283*5d465952Smartinh static void concat_prefix(struct btree *bt, char *s1, size_t n1, 284*5d465952Smartinh char *s2, size_t n2, char *cs, size_t *cn); 285*5d465952Smartinh static void common_prefix(struct btree *bt, struct btkey *min, 286*5d465952Smartinh struct btkey *max, struct btkey *pfx); 287*5d465952Smartinh static void find_common_prefix(struct btree *bt, struct mpage *mp); 288*5d465952Smartinh 289*5d465952Smartinh static size_t bt_leaf_size(struct btval *key, struct btval *data); 290*5d465952Smartinh static size_t bt_branch_size(struct btval *key); 291*5d465952Smartinh 292*5d465952Smartinh static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno, 293*5d465952Smartinh int fd); 294*5d465952Smartinh 295*5d465952Smartinh static int memncmp(const void *s1, size_t n1, 296*5d465952Smartinh const void *s2, size_t n2); 297*5d465952Smartinh static int memnrcmp(const void *s1, size_t n1, 298*5d465952Smartinh const void *s2, size_t n2); 299*5d465952Smartinh 300*5d465952Smartinh static int 301*5d465952Smartinh memncmp(const void *s1, size_t n1, const void *s2, size_t n2) 302*5d465952Smartinh { 303*5d465952Smartinh if (n1 < n2) { 304*5d465952Smartinh if (memcmp(s1, s2, n1) == 0) 305*5d465952Smartinh return -1; 306*5d465952Smartinh } 307*5d465952Smartinh else if (n1 > n2) { 308*5d465952Smartinh if (memcmp(s1, s2, n2) == 0) 309*5d465952Smartinh return 1; 310*5d465952Smartinh } 311*5d465952Smartinh return memcmp(s1, s2, n1); 312*5d465952Smartinh } 313*5d465952Smartinh 314*5d465952Smartinh static int 315*5d465952Smartinh memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) 316*5d465952Smartinh { 317*5d465952Smartinh const unsigned char *p1; 318*5d465952Smartinh const unsigned char *p2; 319*5d465952Smartinh 320*5d465952Smartinh if (n1 == 0) 321*5d465952Smartinh return n2 == 0 ? 0 : -1; 322*5d465952Smartinh 323*5d465952Smartinh if (n2 == 0) 324*5d465952Smartinh return n1 == 0 ? 0 : 1; 325*5d465952Smartinh 326*5d465952Smartinh p1 = (const unsigned char *)s1 + n1 - 1; 327*5d465952Smartinh p2 = (const unsigned char *)s2 + n2 - 1; 328*5d465952Smartinh 329*5d465952Smartinh while (*p1 == *p2) { 330*5d465952Smartinh if (p1 == s1) 331*5d465952Smartinh return (p2 == s2) ? 0 : -1; 332*5d465952Smartinh if (p2 == s2) 333*5d465952Smartinh return (p1 == p2) ? 0 : 1; 334*5d465952Smartinh p1--; 335*5d465952Smartinh p2--; 336*5d465952Smartinh } 337*5d465952Smartinh return *p1 - *p2; 338*5d465952Smartinh } 339*5d465952Smartinh 340*5d465952Smartinh int 341*5d465952Smartinh btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b) 342*5d465952Smartinh { 343*5d465952Smartinh return bt->cmp(a, b); 344*5d465952Smartinh } 345*5d465952Smartinh 346*5d465952Smartinh static void 347*5d465952Smartinh common_prefix(struct btree *bt, struct btkey *min, struct btkey *max, 348*5d465952Smartinh struct btkey *pfx) 349*5d465952Smartinh { 350*5d465952Smartinh size_t n = 0; 351*5d465952Smartinh char *p1; 352*5d465952Smartinh char *p2; 353*5d465952Smartinh 354*5d465952Smartinh if (min->len == 0 || max->len == 0) { 355*5d465952Smartinh pfx->len = 0; 356*5d465952Smartinh return; 357*5d465952Smartinh } 358*5d465952Smartinh 359*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 360*5d465952Smartinh p1 = min->str + min->len - 1; 361*5d465952Smartinh p2 = max->str + max->len - 1; 362*5d465952Smartinh 363*5d465952Smartinh while (*p1 == *p2) { 364*5d465952Smartinh if (p1 < min->str || p2 < max->str) 365*5d465952Smartinh break; 366*5d465952Smartinh p1--; 367*5d465952Smartinh p2--; 368*5d465952Smartinh n++; 369*5d465952Smartinh } 370*5d465952Smartinh 371*5d465952Smartinh assert(n <= (int)sizeof(pfx->str)); 372*5d465952Smartinh pfx->len = n; 373*5d465952Smartinh bcopy(p2 + 1, pfx->str, n); 374*5d465952Smartinh } else { 375*5d465952Smartinh p1 = min->str; 376*5d465952Smartinh p2 = max->str; 377*5d465952Smartinh 378*5d465952Smartinh while (*p1 == *p2) { 379*5d465952Smartinh if (n == min->len || n == max->len) 380*5d465952Smartinh break; 381*5d465952Smartinh p1++; 382*5d465952Smartinh p2++; 383*5d465952Smartinh n++; 384*5d465952Smartinh } 385*5d465952Smartinh 386*5d465952Smartinh assert(n <= (int)sizeof(pfx->str)); 387*5d465952Smartinh pfx->len = n; 388*5d465952Smartinh bcopy(max->str, pfx->str, n); 389*5d465952Smartinh } 390*5d465952Smartinh } 391*5d465952Smartinh 392*5d465952Smartinh static void 393*5d465952Smartinh remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen) 394*5d465952Smartinh { 395*5d465952Smartinh if (pfxlen == 0 || bt->cmp != NULL) 396*5d465952Smartinh return; 397*5d465952Smartinh 398*5d465952Smartinh DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, 399*5d465952Smartinh (int)key->size, (char *)key->data); 400*5d465952Smartinh assert(pfxlen <= key->size); 401*5d465952Smartinh key->size -= pfxlen; 402*5d465952Smartinh if (!F_ISSET(bt->flags, BT_REVERSEKEY)) 403*5d465952Smartinh key->data = (char *)key->data + pfxlen; 404*5d465952Smartinh } 405*5d465952Smartinh 406*5d465952Smartinh static void 407*5d465952Smartinh expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx, 408*5d465952Smartinh struct btkey *expkey) 409*5d465952Smartinh { 410*5d465952Smartinh struct node *node; 411*5d465952Smartinh 412*5d465952Smartinh node = NODEPTR(mp, indx); 413*5d465952Smartinh expkey->len = sizeof(expkey->str); 414*5d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len, 415*5d465952Smartinh NODEKEY(node), node->ksize, expkey->str, &expkey->len); 416*5d465952Smartinh } 417*5d465952Smartinh 418*5d465952Smartinh static int 419*5d465952Smartinh bt_cmp(struct btree *bt, const struct btval *key1, const struct btval *key2, 420*5d465952Smartinh struct btkey *pfx) 421*5d465952Smartinh { 422*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) 423*5d465952Smartinh return memnrcmp(key1->data, key1->size - pfx->len, 424*5d465952Smartinh key2->data, key2->size); 425*5d465952Smartinh else 426*5d465952Smartinh return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len, 427*5d465952Smartinh key2->data, key2->size); 428*5d465952Smartinh } 429*5d465952Smartinh 430*5d465952Smartinh void 431*5d465952Smartinh btval_reset(struct btval *btv) 432*5d465952Smartinh { 433*5d465952Smartinh if (btv) { 434*5d465952Smartinh if (btv->mp) 435*5d465952Smartinh btv->mp->ref--; 436*5d465952Smartinh if (btv->free_data) 437*5d465952Smartinh free(btv->data); 438*5d465952Smartinh bzero(btv, sizeof(*btv)); 439*5d465952Smartinh } 440*5d465952Smartinh } 441*5d465952Smartinh 442*5d465952Smartinh static int 443*5d465952Smartinh mpage_cmp(struct mpage *a, struct mpage *b) 444*5d465952Smartinh { 445*5d465952Smartinh if (a->pgno > b->pgno) 446*5d465952Smartinh return 1; 447*5d465952Smartinh if (a->pgno < b->pgno) 448*5d465952Smartinh return -1; 449*5d465952Smartinh return 0; 450*5d465952Smartinh } 451*5d465952Smartinh 452*5d465952Smartinh static struct mpage * 453*5d465952Smartinh mpage_lookup(struct btree *bt, pgno_t pgno) 454*5d465952Smartinh { 455*5d465952Smartinh struct mpage find, *mp; 456*5d465952Smartinh 457*5d465952Smartinh find.pgno = pgno; 458*5d465952Smartinh mp = RB_FIND(page_cache, bt->page_cache, &find); 459*5d465952Smartinh if (mp) { 460*5d465952Smartinh bt->stat.hits++; 461*5d465952Smartinh /* Update LRU queue. Move page to the end. */ 462*5d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 463*5d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 464*5d465952Smartinh } 465*5d465952Smartinh return mp; 466*5d465952Smartinh } 467*5d465952Smartinh 468*5d465952Smartinh static void 469*5d465952Smartinh mpage_add(struct btree *bt, struct mpage *mp) 470*5d465952Smartinh { 471*5d465952Smartinh assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL); 472*5d465952Smartinh bt->stat.cache_size++; 473*5d465952Smartinh TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 474*5d465952Smartinh } 475*5d465952Smartinh 476*5d465952Smartinh static void 477*5d465952Smartinh mpage_del(struct btree *bt, struct mpage *mp) 478*5d465952Smartinh { 479*5d465952Smartinh assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp); 480*5d465952Smartinh assert(bt->stat.cache_size > 0); 481*5d465952Smartinh bt->stat.cache_size--; 482*5d465952Smartinh TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 483*5d465952Smartinh } 484*5d465952Smartinh 485*5d465952Smartinh static struct mpage * 486*5d465952Smartinh mpage_copy(struct mpage *mp) 487*5d465952Smartinh { 488*5d465952Smartinh struct mpage *copy; 489*5d465952Smartinh 490*5d465952Smartinh if ((copy = calloc(1, sizeof(*copy))) == NULL) 491*5d465952Smartinh return NULL; 492*5d465952Smartinh bcopy(&mp->page, ©->page, sizeof(mp->page)); 493*5d465952Smartinh bcopy(&mp->prefix, ©->prefix, sizeof(mp->prefix)); 494*5d465952Smartinh copy->parent = mp->parent; 495*5d465952Smartinh copy->parent_index = mp->parent_index; 496*5d465952Smartinh copy->pgno = mp->pgno; 497*5d465952Smartinh copy->pgno = mp->pgno; 498*5d465952Smartinh 499*5d465952Smartinh return copy; 500*5d465952Smartinh } 501*5d465952Smartinh 502*5d465952Smartinh /* Remove the least recently used memory pages until the cache size is 503*5d465952Smartinh * within the configured bounds. Pages referenced by cursors or returned 504*5d465952Smartinh * key/data are not pruned. 505*5d465952Smartinh */ 506*5d465952Smartinh static void 507*5d465952Smartinh mpage_prune(struct btree *bt) 508*5d465952Smartinh { 509*5d465952Smartinh struct mpage *mp, *next; 510*5d465952Smartinh 511*5d465952Smartinh for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) { 512*5d465952Smartinh if (bt->stat.cache_size <= bt->stat.max_cache) 513*5d465952Smartinh break; 514*5d465952Smartinh next = TAILQ_NEXT(mp, lru_next); 515*5d465952Smartinh if (!mp->dirty && mp->ref <= 0) { 516*5d465952Smartinh mpage_del(bt, mp); 517*5d465952Smartinh free(mp); 518*5d465952Smartinh } 519*5d465952Smartinh } 520*5d465952Smartinh } 521*5d465952Smartinh 522*5d465952Smartinh /* Mark a page as dirty and push it on the dirty queue. 523*5d465952Smartinh */ 524*5d465952Smartinh static void 525*5d465952Smartinh mpage_dirty(struct btree *bt, struct mpage *mp) 526*5d465952Smartinh { 527*5d465952Smartinh assert(bt != NULL); 528*5d465952Smartinh assert(bt->txn != NULL); 529*5d465952Smartinh 530*5d465952Smartinh if (!mp->dirty) { 531*5d465952Smartinh mp->dirty = 1; 532*5d465952Smartinh SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next); 533*5d465952Smartinh } 534*5d465952Smartinh } 535*5d465952Smartinh 536*5d465952Smartinh /* Touch a page: make it dirty and re-insert into tree with updated pgno. 537*5d465952Smartinh */ 538*5d465952Smartinh static void 539*5d465952Smartinh mpage_touch(struct btree *bt, struct mpage *mp) 540*5d465952Smartinh { 541*5d465952Smartinh assert(bt != NULL); 542*5d465952Smartinh assert(bt->txn != NULL); 543*5d465952Smartinh assert(mp != NULL); 544*5d465952Smartinh 545*5d465952Smartinh if (!mp->dirty) { 546*5d465952Smartinh DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno); 547*5d465952Smartinh if (mp->ref == 0) 548*5d465952Smartinh mpage_del(bt, mp); 549*5d465952Smartinh else 550*5d465952Smartinh mp = mpage_copy(mp); 551*5d465952Smartinh mp->pgno = mp->page.pgno = bt->txn->next_pgno++; 552*5d465952Smartinh mpage_dirty(bt, mp); 553*5d465952Smartinh mpage_add(bt, mp); 554*5d465952Smartinh 555*5d465952Smartinh /* Update the page number to new touched page. */ 556*5d465952Smartinh if (mp->parent != NULL) 557*5d465952Smartinh NODEPGNO(NODEPTR(mp->parent, 558*5d465952Smartinh mp->parent_index)) = mp->pgno; 559*5d465952Smartinh } 560*5d465952Smartinh } 561*5d465952Smartinh 562*5d465952Smartinh static int 563*5d465952Smartinh btree_read_page(struct btree *bt, pgno_t pgno, struct page *page) 564*5d465952Smartinh { 565*5d465952Smartinh ssize_t rc; 566*5d465952Smartinh 567*5d465952Smartinh DPRINTF("reading page %u", pgno); 568*5d465952Smartinh bt->stat.reads++; 569*5d465952Smartinh if ((rc = pread(bt->fd, page, PAGESIZE, (off_t)pgno*PAGESIZE)) == 0) { 570*5d465952Smartinh DPRINTF("page %u doesn't exist", pgno); 571*5d465952Smartinh return BT_NOTFOUND; 572*5d465952Smartinh } 573*5d465952Smartinh else if (rc != PAGESIZE) { 574*5d465952Smartinh DPRINTF("read: %s", strerror(errno)); 575*5d465952Smartinh return BT_FAIL; 576*5d465952Smartinh } 577*5d465952Smartinh 578*5d465952Smartinh if (page->pgno != pgno) { 579*5d465952Smartinh DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno); 580*5d465952Smartinh errno = EINVAL; 581*5d465952Smartinh return BT_FAIL; 582*5d465952Smartinh } 583*5d465952Smartinh 584*5d465952Smartinh return BT_SUCCESS; 585*5d465952Smartinh } 586*5d465952Smartinh 587*5d465952Smartinh int 588*5d465952Smartinh btree_sync(struct btree *bt) 589*5d465952Smartinh { 590*5d465952Smartinh if (!F_ISSET(bt->flags, BT_NOSYNC)) 591*5d465952Smartinh return fsync(bt->fd); 592*5d465952Smartinh return 0; 593*5d465952Smartinh } 594*5d465952Smartinh 595*5d465952Smartinh struct btree_txn * 596*5d465952Smartinh btree_txn_begin(struct btree *bt, int rdonly) 597*5d465952Smartinh { 598*5d465952Smartinh int rc; 599*5d465952Smartinh struct btree_txn *txn; 600*5d465952Smartinh 601*5d465952Smartinh if (!rdonly && bt->txn != NULL) { 602*5d465952Smartinh DPRINTF("write transaction already begun"); 603*5d465952Smartinh return NULL; 604*5d465952Smartinh } 605*5d465952Smartinh 606*5d465952Smartinh if ((txn = calloc(1, sizeof(*txn))) == NULL) { 607*5d465952Smartinh DPRINTF("calloc: %s", strerror(errno)); 608*5d465952Smartinh return NULL; 609*5d465952Smartinh } 610*5d465952Smartinh 611*5d465952Smartinh if (rdonly) { 612*5d465952Smartinh txn->flags |= BT_TXN_RDONLY; 613*5d465952Smartinh } else { 614*5d465952Smartinh txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue)); 615*5d465952Smartinh if (txn->dirty_queue == NULL) { 616*5d465952Smartinh free(txn); 617*5d465952Smartinh return NULL; 618*5d465952Smartinh } 619*5d465952Smartinh SIMPLEQ_INIT(txn->dirty_queue); 620*5d465952Smartinh 621*5d465952Smartinh DPRINTF("taking write lock on txn %p", txn); 622*5d465952Smartinh if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) { 623*5d465952Smartinh free(txn->dirty_queue); 624*5d465952Smartinh free(txn); 625*5d465952Smartinh return NULL; 626*5d465952Smartinh } 627*5d465952Smartinh bt->txn = txn; 628*5d465952Smartinh } 629*5d465952Smartinh 630*5d465952Smartinh txn->bt = bt; 631*5d465952Smartinh bt->ref++; 632*5d465952Smartinh 633*5d465952Smartinh if ((rc = btree_read_meta(bt, &txn->next_pgno)) == BT_FAIL) { 634*5d465952Smartinh btree_txn_abort(txn); 635*5d465952Smartinh return NULL; 636*5d465952Smartinh } 637*5d465952Smartinh 638*5d465952Smartinh txn->root = bt->meta->root; 639*5d465952Smartinh DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root); 640*5d465952Smartinh 641*5d465952Smartinh return txn; 642*5d465952Smartinh } 643*5d465952Smartinh 644*5d465952Smartinh void 645*5d465952Smartinh btree_txn_abort(struct btree_txn *txn) 646*5d465952Smartinh { 647*5d465952Smartinh struct mpage *mp; 648*5d465952Smartinh struct btree *bt; 649*5d465952Smartinh 650*5d465952Smartinh if (txn == NULL) 651*5d465952Smartinh return; 652*5d465952Smartinh 653*5d465952Smartinh bt = txn->bt; 654*5d465952Smartinh DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root); 655*5d465952Smartinh 656*5d465952Smartinh /* Discard all dirty pages. 657*5d465952Smartinh */ 658*5d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 659*5d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue); 660*5d465952Smartinh assert(mp->ref == 0); /* cursors should be closed */ 661*5d465952Smartinh mpage_del(bt, mp); 662*5d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 663*5d465952Smartinh } 664*5d465952Smartinh 665*5d465952Smartinh if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) { 666*5d465952Smartinh DPRINTF("releasing write lock on txn %p", txn); 667*5d465952Smartinh txn->bt->txn = NULL; 668*5d465952Smartinh if (flock(txn->bt->fd, LOCK_UN) != 0) { 669*5d465952Smartinh DPRINTF("failed to unlock fd %d: %s", 670*5d465952Smartinh txn->bt->fd, strerror(errno)); 671*5d465952Smartinh } 672*5d465952Smartinh free(txn->dirty_queue); 673*5d465952Smartinh } 674*5d465952Smartinh 675*5d465952Smartinh btree_close(txn->bt); 676*5d465952Smartinh free(txn); 677*5d465952Smartinh } 678*5d465952Smartinh 679*5d465952Smartinh int 680*5d465952Smartinh btree_txn_commit(struct btree_txn *txn) 681*5d465952Smartinh { 682*5d465952Smartinh int n, done; 683*5d465952Smartinh ssize_t rc; 684*5d465952Smartinh off_t size; 685*5d465952Smartinh struct mpage *mp; 686*5d465952Smartinh struct btree *bt; 687*5d465952Smartinh struct iovec iov[BT_COMMIT_PAGES]; 688*5d465952Smartinh 689*5d465952Smartinh assert(txn != NULL); 690*5d465952Smartinh assert(txn->bt != NULL); 691*5d465952Smartinh 692*5d465952Smartinh bt = txn->bt; 693*5d465952Smartinh 694*5d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_RDONLY)) { 695*5d465952Smartinh DPRINTF("attempt to commit read-only transaction"); 696*5d465952Smartinh btree_txn_abort(txn); 697*5d465952Smartinh return BT_FAIL; 698*5d465952Smartinh } 699*5d465952Smartinh 700*5d465952Smartinh if (txn != bt->txn) { 701*5d465952Smartinh DPRINTF("attempt to commit unknown transaction"); 702*5d465952Smartinh btree_txn_abort(txn); 703*5d465952Smartinh return BT_FAIL; 704*5d465952Smartinh } 705*5d465952Smartinh 706*5d465952Smartinh if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 707*5d465952Smartinh DPRINTF("error flag is set, can't commit"); 708*5d465952Smartinh btree_txn_abort(txn); 709*5d465952Smartinh return BT_FAIL; 710*5d465952Smartinh } 711*5d465952Smartinh 712*5d465952Smartinh if (SIMPLEQ_EMPTY(txn->dirty_queue)) 713*5d465952Smartinh goto done; 714*5d465952Smartinh 715*5d465952Smartinh if (F_ISSET(bt->flags, BT_FIXPADDING)) { 716*5d465952Smartinh size = lseek(bt->fd, 0, SEEK_END); 717*5d465952Smartinh size += PAGESIZE - (size % PAGESIZE); 718*5d465952Smartinh DPRINTF("extending to multiple of page size: %llu", size); 719*5d465952Smartinh if (ftruncate(bt->fd, size) != 0) { 720*5d465952Smartinh DPRINTF("ftruncate: %s", strerror(errno)); 721*5d465952Smartinh btree_txn_abort(txn); 722*5d465952Smartinh return BT_FAIL; 723*5d465952Smartinh } 724*5d465952Smartinh bt->flags &= ~BT_FIXPADDING; 725*5d465952Smartinh } 726*5d465952Smartinh 727*5d465952Smartinh DPRINTF("committing transaction on btree %p, root page %u", 728*5d465952Smartinh bt, txn->root); 729*5d465952Smartinh 730*5d465952Smartinh /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done. 731*5d465952Smartinh */ 732*5d465952Smartinh do { 733*5d465952Smartinh n = 0; 734*5d465952Smartinh done = 1; 735*5d465952Smartinh SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) { 736*5d465952Smartinh DPRINTF("commiting page %u", mp->pgno); 737*5d465952Smartinh iov[n].iov_len = PAGESIZE; 738*5d465952Smartinh iov[n].iov_base = &mp->page; 739*5d465952Smartinh if (++n >= BT_COMMIT_PAGES) { 740*5d465952Smartinh done = 0; 741*5d465952Smartinh break; 742*5d465952Smartinh } 743*5d465952Smartinh } 744*5d465952Smartinh 745*5d465952Smartinh if (n == 0) 746*5d465952Smartinh break; 747*5d465952Smartinh 748*5d465952Smartinh DPRINTF("commiting %u dirty pages", n); 749*5d465952Smartinh rc = writev(bt->fd, iov, n); 750*5d465952Smartinh if (rc != PAGESIZE*n) { 751*5d465952Smartinh if (rc > 0) 752*5d465952Smartinh DPRINTF("short write, filesystem full?"); 753*5d465952Smartinh else 754*5d465952Smartinh DPRINTF("writev: %s", strerror(errno)); 755*5d465952Smartinh btree_txn_abort(txn); 756*5d465952Smartinh return BT_FAIL; 757*5d465952Smartinh } 758*5d465952Smartinh 759*5d465952Smartinh /* Remove the dirty flag from the written pages. 760*5d465952Smartinh */ 761*5d465952Smartinh while (!SIMPLEQ_EMPTY(txn->dirty_queue)) { 762*5d465952Smartinh mp = SIMPLEQ_FIRST(txn->dirty_queue); 763*5d465952Smartinh mp->dirty = 0; 764*5d465952Smartinh SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next); 765*5d465952Smartinh if (--n == 0) 766*5d465952Smartinh break; 767*5d465952Smartinh } 768*5d465952Smartinh } while (!done); 769*5d465952Smartinh 770*5d465952Smartinh if (btree_sync(bt) != 0 || 771*5d465952Smartinh btree_write_meta(bt, txn->root) != BT_SUCCESS || 772*5d465952Smartinh btree_sync(bt) != 0) { 773*5d465952Smartinh btree_txn_abort(txn); 774*5d465952Smartinh return BT_FAIL; 775*5d465952Smartinh } 776*5d465952Smartinh 777*5d465952Smartinh done: 778*5d465952Smartinh mpage_prune(bt); 779*5d465952Smartinh btree_txn_abort(txn); 780*5d465952Smartinh 781*5d465952Smartinh return BT_SUCCESS; 782*5d465952Smartinh } 783*5d465952Smartinh 784*5d465952Smartinh static int 785*5d465952Smartinh btree_write_meta(struct btree *bt, pgno_t root) 786*5d465952Smartinh { 787*5d465952Smartinh ssize_t rc; 788*5d465952Smartinh 789*5d465952Smartinh DPRINTF("writing meta page for root page %u", root); 790*5d465952Smartinh 791*5d465952Smartinh assert(bt != NULL); 792*5d465952Smartinh assert(bt->txn != NULL); 793*5d465952Smartinh assert(bt->metapage != NULL); 794*5d465952Smartinh assert(bt->meta != NULL); 795*5d465952Smartinh 796*5d465952Smartinh bt->meta->prev_meta = bt->metapage->pgno; 797*5d465952Smartinh bt->metapage->pgno = bt->txn->next_pgno++; 798*5d465952Smartinh 799*5d465952Smartinh bt->meta->root = root; 800*5d465952Smartinh bt->meta->created_at = time(0); 801*5d465952Smartinh bt->meta->revisions++; 802*5d465952Smartinh SHA1((unsigned char *)bt->meta, METAHASHLEN, bt->meta->hash); 803*5d465952Smartinh 804*5d465952Smartinh if ((rc = write(bt->fd, bt->metapage, PAGESIZE)) != PAGESIZE) { 805*5d465952Smartinh if (rc > 0) 806*5d465952Smartinh DPRINTF("short write, filesystem full?"); 807*5d465952Smartinh return BT_FAIL; 808*5d465952Smartinh } 809*5d465952Smartinh 810*5d465952Smartinh if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) { 811*5d465952Smartinh DPRINTF("failed to update file size: %s", strerror(errno)); 812*5d465952Smartinh bt->size = 0; 813*5d465952Smartinh } 814*5d465952Smartinh 815*5d465952Smartinh return BT_SUCCESS; 816*5d465952Smartinh } 817*5d465952Smartinh 818*5d465952Smartinh /* Returns true if page p is a valid meta page, false otherwise. 819*5d465952Smartinh */ 820*5d465952Smartinh static int 821*5d465952Smartinh btree_is_meta_page(struct page *p) 822*5d465952Smartinh { 823*5d465952Smartinh struct bt_meta *m; 824*5d465952Smartinh unsigned char hash[SHA_DIGEST_LENGTH]; 825*5d465952Smartinh 826*5d465952Smartinh m = METADATA(p); 827*5d465952Smartinh if (!F_ISSET(p->flags, P_META)) { 828*5d465952Smartinh DPRINTF("page %d not a meta page", p->pgno); 829*5d465952Smartinh return 0; 830*5d465952Smartinh } 831*5d465952Smartinh 832*5d465952Smartinh if (m->magic != BT_MAGIC) { 833*5d465952Smartinh DPRINTF("page %d has invalid magic", p->pgno); 834*5d465952Smartinh return 0; 835*5d465952Smartinh } 836*5d465952Smartinh 837*5d465952Smartinh if (m->root >= p->pgno && m->root != P_INVALID) { 838*5d465952Smartinh DPRINTF("page %d points to an invalid root page", p->pgno); 839*5d465952Smartinh return 0; 840*5d465952Smartinh } 841*5d465952Smartinh 842*5d465952Smartinh SHA1((unsigned char *)m, METAHASHLEN, hash); 843*5d465952Smartinh if (bcmp(hash, m->hash, SHA_DIGEST_LENGTH) != 0) { 844*5d465952Smartinh DPRINTF("page %d has an invalid digest", p->pgno); 845*5d465952Smartinh return 0; 846*5d465952Smartinh } 847*5d465952Smartinh 848*5d465952Smartinh return 1; 849*5d465952Smartinh } 850*5d465952Smartinh 851*5d465952Smartinh static void 852*5d465952Smartinh btree_init_meta(struct btree *bt) 853*5d465952Smartinh { 854*5d465952Smartinh bt->metapage->flags = P_META; 855*5d465952Smartinh bt->meta = METADATA(bt->metapage); 856*5d465952Smartinh bt->meta->magic = BT_MAGIC; 857*5d465952Smartinh bt->meta->version = BT_VERSION; 858*5d465952Smartinh bt->meta->flags = 0; 859*5d465952Smartinh bt->meta->psize = PAGESIZE; 860*5d465952Smartinh bt->meta->root = P_INVALID; 861*5d465952Smartinh } 862*5d465952Smartinh 863*5d465952Smartinh static int 864*5d465952Smartinh btree_read_meta(struct btree *bt, pgno_t *p_next) 865*5d465952Smartinh { 866*5d465952Smartinh pgno_t meta_pgno, next_pgno; 867*5d465952Smartinh off_t size; 868*5d465952Smartinh int rc; 869*5d465952Smartinh 870*5d465952Smartinh assert(bt != NULL); 871*5d465952Smartinh 872*5d465952Smartinh if ((size = lseek(bt->fd, 0, SEEK_END)) == -1) 873*5d465952Smartinh goto fail; 874*5d465952Smartinh 875*5d465952Smartinh DPRINTF("btree_read_meta: size = %llu", size); 876*5d465952Smartinh 877*5d465952Smartinh if (size < bt->size) { 878*5d465952Smartinh DPRINTF("file has shrunk!"); 879*5d465952Smartinh errno = EIO; 880*5d465952Smartinh goto fail; 881*5d465952Smartinh } 882*5d465952Smartinh 883*5d465952Smartinh if (size == 0) { 884*5d465952Smartinh if (p_next != NULL) 885*5d465952Smartinh *p_next = 0; 886*5d465952Smartinh return BT_NOTFOUND; /* new file */ 887*5d465952Smartinh } 888*5d465952Smartinh 889*5d465952Smartinh next_pgno = size / PAGESIZE; 890*5d465952Smartinh if (next_pgno == 0) { 891*5d465952Smartinh DPRINTF("corrupt file"); 892*5d465952Smartinh errno = EIO; 893*5d465952Smartinh goto fail; 894*5d465952Smartinh } 895*5d465952Smartinh 896*5d465952Smartinh meta_pgno = next_pgno - 1; 897*5d465952Smartinh 898*5d465952Smartinh if (size % PAGESIZE != 0) { 899*5d465952Smartinh DPRINTF("filesize not a multiple of the page size!"); 900*5d465952Smartinh bt->flags |= BT_FIXPADDING; 901*5d465952Smartinh next_pgno++; 902*5d465952Smartinh } 903*5d465952Smartinh 904*5d465952Smartinh if (p_next != NULL) 905*5d465952Smartinh *p_next = next_pgno; 906*5d465952Smartinh 907*5d465952Smartinh if (size == bt->size) { 908*5d465952Smartinh DPRINTF("size unchanged, keeping current meta page"); 909*5d465952Smartinh return BT_SUCCESS; 910*5d465952Smartinh } 911*5d465952Smartinh bt->size = size; 912*5d465952Smartinh 913*5d465952Smartinh while (meta_pgno > 0) { 914*5d465952Smartinh rc = btree_read_page(bt, meta_pgno, bt->metapage); 915*5d465952Smartinh if (rc == BT_NOTFOUND) 916*5d465952Smartinh break; /* no meta page found */ 917*5d465952Smartinh if (rc == BT_SUCCESS && btree_is_meta_page(bt->metapage)) { 918*5d465952Smartinh bt->meta = METADATA(bt->metapage); 919*5d465952Smartinh return BT_SUCCESS; 920*5d465952Smartinh } 921*5d465952Smartinh --meta_pgno; /* scan backwards to first valid meta page */ 922*5d465952Smartinh } 923*5d465952Smartinh 924*5d465952Smartinh errno = EIO; 925*5d465952Smartinh fail: 926*5d465952Smartinh if (p_next != NULL) 927*5d465952Smartinh *p_next = P_INVALID; 928*5d465952Smartinh return BT_FAIL; 929*5d465952Smartinh } 930*5d465952Smartinh 931*5d465952Smartinh struct btree * 932*5d465952Smartinh btree_open_fd(int fd, uint32_t flags) 933*5d465952Smartinh { 934*5d465952Smartinh int fl; 935*5d465952Smartinh struct btree *bt; 936*5d465952Smartinh 937*5d465952Smartinh fl = fcntl(fd, F_GETFL, 0); 938*5d465952Smartinh if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1) 939*5d465952Smartinh return NULL; 940*5d465952Smartinh 941*5d465952Smartinh if ((bt = calloc(1, sizeof(*bt))) == NULL) 942*5d465952Smartinh return NULL; 943*5d465952Smartinh bt->fd = fd; 944*5d465952Smartinh bt->flags = flags; 945*5d465952Smartinh bt->flags &= ~BT_FIXPADDING; 946*5d465952Smartinh 947*5d465952Smartinh if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL) 948*5d465952Smartinh goto fail; 949*5d465952Smartinh bt->stat.max_cache = BT_MAXCACHE_DEF; 950*5d465952Smartinh RB_INIT(bt->page_cache); 951*5d465952Smartinh 952*5d465952Smartinh if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL) 953*5d465952Smartinh goto fail; 954*5d465952Smartinh TAILQ_INIT(bt->lru_queue); 955*5d465952Smartinh 956*5d465952Smartinh if ((bt->metapage = calloc(1, PAGESIZE)) == NULL) 957*5d465952Smartinh goto fail; 958*5d465952Smartinh 959*5d465952Smartinh if (btree_read_meta(bt, NULL) == BT_FAIL) 960*5d465952Smartinh goto fail; 961*5d465952Smartinh 962*5d465952Smartinh if (bt->meta == NULL) { 963*5d465952Smartinh DPRINTF("new database"); 964*5d465952Smartinh btree_init_meta(bt); 965*5d465952Smartinh } else if (bt->meta->version != BT_VERSION) { 966*5d465952Smartinh DPRINTF("database is version %u, expected version %u", 967*5d465952Smartinh bt->meta->version, BT_VERSION); 968*5d465952Smartinh errno = EINVAL; 969*5d465952Smartinh goto fail; 970*5d465952Smartinh } else { 971*5d465952Smartinh DPRINTF("opened database version %u, pagesize %u", 972*5d465952Smartinh bt->meta->version, bt->meta->psize); 973*5d465952Smartinh DPRINTF("timestamp: %s", ctime(&bt->meta->created_at)); 974*5d465952Smartinh DPRINTF("depth: %u", bt->meta->depth); 975*5d465952Smartinh DPRINTF("entries: %llu", bt->meta->entries); 976*5d465952Smartinh DPRINTF("revisions: %u", bt->meta->revisions); 977*5d465952Smartinh DPRINTF("branch pages: %u", bt->meta->branch_pages); 978*5d465952Smartinh DPRINTF("leaf pages: %u", bt->meta->leaf_pages); 979*5d465952Smartinh DPRINTF("overflow pages: %u", bt->meta->overflow_pages); 980*5d465952Smartinh DPRINTF("root: %u", bt->meta->root); 981*5d465952Smartinh DPRINTF("previous meta page: %u", bt->meta->prev_meta); 982*5d465952Smartinh } 983*5d465952Smartinh 984*5d465952Smartinh return bt; 985*5d465952Smartinh 986*5d465952Smartinh fail: 987*5d465952Smartinh free(bt->lru_queue); 988*5d465952Smartinh free(bt->page_cache); 989*5d465952Smartinh free(bt->metapage); 990*5d465952Smartinh free(bt); 991*5d465952Smartinh return NULL; 992*5d465952Smartinh } 993*5d465952Smartinh 994*5d465952Smartinh struct btree * 995*5d465952Smartinh btree_open(const char *path, uint32_t flags, mode_t mode) 996*5d465952Smartinh { 997*5d465952Smartinh int fd, oflags; 998*5d465952Smartinh struct btree *bt; 999*5d465952Smartinh 1000*5d465952Smartinh if (F_ISSET(flags, BT_RDONLY)) 1001*5d465952Smartinh oflags = O_RDONLY; 1002*5d465952Smartinh else 1003*5d465952Smartinh oflags = O_RDWR | O_CREAT | O_APPEND; 1004*5d465952Smartinh 1005*5d465952Smartinh if ((fd = open(path, oflags, mode)) == -1) 1006*5d465952Smartinh return NULL; 1007*5d465952Smartinh 1008*5d465952Smartinh if ((bt = btree_open_fd(fd, flags)) == NULL) 1009*5d465952Smartinh close(fd); 1010*5d465952Smartinh else { 1011*5d465952Smartinh bt->path = strdup(path); 1012*5d465952Smartinh bt->ref = 1; 1013*5d465952Smartinh DPRINTF("opened btree %p", bt); 1014*5d465952Smartinh } 1015*5d465952Smartinh 1016*5d465952Smartinh return bt; 1017*5d465952Smartinh } 1018*5d465952Smartinh 1019*5d465952Smartinh void 1020*5d465952Smartinh btree_close(struct btree *bt) 1021*5d465952Smartinh { 1022*5d465952Smartinh struct mpage *mp; 1023*5d465952Smartinh 1024*5d465952Smartinh if (bt != NULL && --bt->ref == 0) { 1025*5d465952Smartinh DPRINTF("ref is zero, closing btree %p", bt); 1026*5d465952Smartinh close(bt->fd); 1027*5d465952Smartinh 1028*5d465952Smartinh /* Free page_cache. */ 1029*5d465952Smartinh while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { 1030*5d465952Smartinh mpage_del(bt, mp); 1031*5d465952Smartinh free(mp); 1032*5d465952Smartinh } 1033*5d465952Smartinh free(bt->page_cache); 1034*5d465952Smartinh free(bt); 1035*5d465952Smartinh } 1036*5d465952Smartinh } 1037*5d465952Smartinh 1038*5d465952Smartinh /* Search for key within a leaf page, using binary search. 1039*5d465952Smartinh * Returns the smallest entry larger or equal to the key. 1040*5d465952Smartinh * If exactp is non-null, stores whether the found entry was an exact match 1041*5d465952Smartinh * in *exactp (1 or 0). 1042*5d465952Smartinh * If kip is non-null, stores the index of the found entry in *kip. 1043*5d465952Smartinh * If no entry larger of equal to the key is found, returns NULL. 1044*5d465952Smartinh */ 1045*5d465952Smartinh static struct node * 1046*5d465952Smartinh btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key, 1047*5d465952Smartinh int *exactp, unsigned int *kip) 1048*5d465952Smartinh { 1049*5d465952Smartinh unsigned int i = 0; 1050*5d465952Smartinh int low, high; 1051*5d465952Smartinh int rc = 0; 1052*5d465952Smartinh struct node *node; 1053*5d465952Smartinh struct btval nodekey; 1054*5d465952Smartinh 1055*5d465952Smartinh DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]", 1056*5d465952Smartinh NUMKEYS(mp), 1057*5d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 1058*5d465952Smartinh mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str); 1059*5d465952Smartinh 1060*5d465952Smartinh assert(NUMKEYS(mp) > 0); 1061*5d465952Smartinh 1062*5d465952Smartinh bzero(&nodekey, sizeof(nodekey)); 1063*5d465952Smartinh 1064*5d465952Smartinh low = IS_LEAF(mp) ? 0 : 1; 1065*5d465952Smartinh high = NUMKEYS(mp) - 1; 1066*5d465952Smartinh while (low <= high) { 1067*5d465952Smartinh i = (low + high) >> 1; 1068*5d465952Smartinh node = NODEPTR(mp, i); 1069*5d465952Smartinh 1070*5d465952Smartinh nodekey.size = node->ksize; 1071*5d465952Smartinh nodekey.data = NODEKEY(node); 1072*5d465952Smartinh 1073*5d465952Smartinh if (bt->cmp) 1074*5d465952Smartinh rc = bt->cmp(key, &nodekey); 1075*5d465952Smartinh else 1076*5d465952Smartinh rc = bt_cmp(bt, key, &nodekey, &mp->prefix); 1077*5d465952Smartinh 1078*5d465952Smartinh if (IS_LEAF(mp)) 1079*5d465952Smartinh DPRINTF("found leaf index %u [%.*s], rc = %i", 1080*5d465952Smartinh i, (int)nodekey.size, (char *)nodekey.data, rc); 1081*5d465952Smartinh else 1082*5d465952Smartinh DPRINTF("found branch index %u [%.*s -> %u], rc = %i", 1083*5d465952Smartinh i, (int)node->ksize, (char *)NODEKEY(node), 1084*5d465952Smartinh node->n_pgno, rc); 1085*5d465952Smartinh 1086*5d465952Smartinh if (rc == 0) 1087*5d465952Smartinh break; 1088*5d465952Smartinh if (rc > 0) 1089*5d465952Smartinh low = i + 1; 1090*5d465952Smartinh else 1091*5d465952Smartinh high = i - 1; 1092*5d465952Smartinh } 1093*5d465952Smartinh 1094*5d465952Smartinh if (rc > 0) { /* Found entry is less than the key. */ 1095*5d465952Smartinh i++; /* Skip to get the smallest entry larger than key. */ 1096*5d465952Smartinh if (i >= NUMKEYS(mp)) 1097*5d465952Smartinh /* There is no entry larger or equal to the key. */ 1098*5d465952Smartinh return NULL; 1099*5d465952Smartinh } 1100*5d465952Smartinh if (exactp) 1101*5d465952Smartinh *exactp = (rc == 0); 1102*5d465952Smartinh if (kip) /* Store the key index if requested. */ 1103*5d465952Smartinh *kip = i; 1104*5d465952Smartinh 1105*5d465952Smartinh return NODEPTR(mp, i); 1106*5d465952Smartinh } 1107*5d465952Smartinh 1108*5d465952Smartinh static void 1109*5d465952Smartinh cursor_pop_page(struct cursor *cursor) 1110*5d465952Smartinh { 1111*5d465952Smartinh struct ppage *top; 1112*5d465952Smartinh 1113*5d465952Smartinh top = CURSOR_TOP(cursor); 1114*5d465952Smartinh CURSOR_POP(cursor); 1115*5d465952Smartinh top->mpage->ref--; 1116*5d465952Smartinh 1117*5d465952Smartinh DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor); 1118*5d465952Smartinh 1119*5d465952Smartinh free(top); 1120*5d465952Smartinh } 1121*5d465952Smartinh 1122*5d465952Smartinh static struct ppage * 1123*5d465952Smartinh cursor_push_page(struct cursor *cursor, struct mpage *mp) 1124*5d465952Smartinh { 1125*5d465952Smartinh struct ppage *ppage; 1126*5d465952Smartinh 1127*5d465952Smartinh DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor); 1128*5d465952Smartinh 1129*5d465952Smartinh if ((ppage = calloc(1, sizeof(*ppage))) == NULL) 1130*5d465952Smartinh return NULL; 1131*5d465952Smartinh ppage->mpage = mp; 1132*5d465952Smartinh mp->ref++; 1133*5d465952Smartinh CURSOR_PUSH(cursor, ppage); 1134*5d465952Smartinh return ppage; 1135*5d465952Smartinh } 1136*5d465952Smartinh 1137*5d465952Smartinh static int 1138*5d465952Smartinh btree_get_mpage(struct btree *bt, pgno_t pgno, struct mpage **mpp) 1139*5d465952Smartinh { 1140*5d465952Smartinh int rc; 1141*5d465952Smartinh struct mpage *mp; 1142*5d465952Smartinh 1143*5d465952Smartinh mp = mpage_lookup(bt, pgno); 1144*5d465952Smartinh if (mp == NULL) { 1145*5d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL) 1146*5d465952Smartinh return BT_FAIL; 1147*5d465952Smartinh if ((rc = btree_read_page(bt, pgno, &mp->page)) != BT_SUCCESS) { 1148*5d465952Smartinh free(mp); 1149*5d465952Smartinh return rc; 1150*5d465952Smartinh } 1151*5d465952Smartinh mp->pgno = pgno; 1152*5d465952Smartinh mpage_add(bt, mp); 1153*5d465952Smartinh } else 1154*5d465952Smartinh DPRINTF("returning page %u from cache", pgno); 1155*5d465952Smartinh 1156*5d465952Smartinh *mpp = mp; 1157*5d465952Smartinh return BT_SUCCESS; 1158*5d465952Smartinh } 1159*5d465952Smartinh 1160*5d465952Smartinh static void 1161*5d465952Smartinh concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2, 1162*5d465952Smartinh char *cs, size_t *cn) 1163*5d465952Smartinh { 1164*5d465952Smartinh assert(*cn >= n1 + n2); 1165*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 1166*5d465952Smartinh bcopy(s2, cs, n2); 1167*5d465952Smartinh bcopy(s1, cs + n2, n1); 1168*5d465952Smartinh } else { 1169*5d465952Smartinh bcopy(s1, cs, n1); 1170*5d465952Smartinh bcopy(s2, cs + n1, n2); 1171*5d465952Smartinh } 1172*5d465952Smartinh *cn = n1 + n2; 1173*5d465952Smartinh } 1174*5d465952Smartinh 1175*5d465952Smartinh static void 1176*5d465952Smartinh find_common_prefix(struct btree *bt, struct mpage *mp) 1177*5d465952Smartinh { 1178*5d465952Smartinh indx_t lbound = 0, ubound = 0; 1179*5d465952Smartinh struct mpage *lp, *up; 1180*5d465952Smartinh struct btkey lprefix, uprefix; 1181*5d465952Smartinh 1182*5d465952Smartinh mp->prefix.len = 0; 1183*5d465952Smartinh if (bt->cmp != NULL) 1184*5d465952Smartinh return; 1185*5d465952Smartinh 1186*5d465952Smartinh lp = mp; 1187*5d465952Smartinh while (lp->parent != NULL) { 1188*5d465952Smartinh if (lp->parent_index > 0) { 1189*5d465952Smartinh lbound = lp->parent_index; 1190*5d465952Smartinh break; 1191*5d465952Smartinh } 1192*5d465952Smartinh lp = lp->parent; 1193*5d465952Smartinh } 1194*5d465952Smartinh 1195*5d465952Smartinh up = mp; 1196*5d465952Smartinh while (up->parent != NULL) { 1197*5d465952Smartinh if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) { 1198*5d465952Smartinh ubound = up->parent_index + 1; 1199*5d465952Smartinh break; 1200*5d465952Smartinh } 1201*5d465952Smartinh up = up->parent; 1202*5d465952Smartinh } 1203*5d465952Smartinh 1204*5d465952Smartinh if (lp->parent != NULL && up->parent != NULL) { 1205*5d465952Smartinh expand_prefix(bt, lp->parent, lbound, &lprefix); 1206*5d465952Smartinh expand_prefix(bt, up->parent, ubound, &uprefix); 1207*5d465952Smartinh common_prefix(bt, &lprefix, &uprefix, &mp->prefix); 1208*5d465952Smartinh } 1209*5d465952Smartinh else if (mp->parent) 1210*5d465952Smartinh bcopy(&mp->parent->prefix, &mp->prefix, sizeof(mp->prefix)); 1211*5d465952Smartinh 1212*5d465952Smartinh DPRINTF("found common prefix [%.*s] (len %zu) for page %u", 1213*5d465952Smartinh (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno); 1214*5d465952Smartinh } 1215*5d465952Smartinh 1216*5d465952Smartinh static int 1217*5d465952Smartinh btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key, 1218*5d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp) 1219*5d465952Smartinh { 1220*5d465952Smartinh struct mpage *mp, *parent; 1221*5d465952Smartinh 1222*5d465952Smartinh if (cursor && cursor_push_page(cursor, root) == NULL) 1223*5d465952Smartinh return BT_FAIL; 1224*5d465952Smartinh 1225*5d465952Smartinh mp = root; 1226*5d465952Smartinh while (IS_BRANCH(mp)) { 1227*5d465952Smartinh unsigned int i = 0; 1228*5d465952Smartinh struct node *node; 1229*5d465952Smartinh 1230*5d465952Smartinh DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp)); 1231*5d465952Smartinh assert(NUMKEYS(mp) > 1); 1232*5d465952Smartinh node = NODEPTR(mp, 0); 1233*5d465952Smartinh DPRINTF("found index 0 to page %u", NODEPGNO(node)); 1234*5d465952Smartinh 1235*5d465952Smartinh if (key == NULL) /* Initialize cursor to first page. */ 1236*5d465952Smartinh i = 0; 1237*5d465952Smartinh else { 1238*5d465952Smartinh int exact; 1239*5d465952Smartinh node = btree_search_node(bt, mp, key, &exact, &i); 1240*5d465952Smartinh if (node == NULL) 1241*5d465952Smartinh i = NUMKEYS(mp) - 1; 1242*5d465952Smartinh else if (!exact) { 1243*5d465952Smartinh assert(i > 0); 1244*5d465952Smartinh i--; 1245*5d465952Smartinh } 1246*5d465952Smartinh } 1247*5d465952Smartinh 1248*5d465952Smartinh if (key) 1249*5d465952Smartinh DPRINTF("following index %u for key %.*s", 1250*5d465952Smartinh i, (int)key->size, (char *)key->data); 1251*5d465952Smartinh assert(i >= 0 && i < NUMKEYS(mp)); 1252*5d465952Smartinh node = NODEPTR(mp, i); 1253*5d465952Smartinh 1254*5d465952Smartinh if (cursor) 1255*5d465952Smartinh CURSOR_TOP(cursor)->ki = i; 1256*5d465952Smartinh 1257*5d465952Smartinh parent = mp; 1258*5d465952Smartinh if (btree_get_mpage(bt, NODEPGNO(node), &mp) != BT_SUCCESS) 1259*5d465952Smartinh return BT_FAIL; 1260*5d465952Smartinh mp->parent = parent; 1261*5d465952Smartinh mp->parent_index = i; 1262*5d465952Smartinh find_common_prefix(bt, mp); 1263*5d465952Smartinh 1264*5d465952Smartinh if (cursor && cursor_push_page(cursor, mp) == NULL) 1265*5d465952Smartinh return BT_FAIL; 1266*5d465952Smartinh 1267*5d465952Smartinh if (modify) 1268*5d465952Smartinh mpage_touch(bt, mp); 1269*5d465952Smartinh } 1270*5d465952Smartinh 1271*5d465952Smartinh if (!IS_LEAF(mp)) { 1272*5d465952Smartinh DPRINTF("internal error, index points to a %02X page!?", 1273*5d465952Smartinh mp->page.flags); 1274*5d465952Smartinh return BT_FAIL; 1275*5d465952Smartinh } 1276*5d465952Smartinh 1277*5d465952Smartinh DPRINTF("found leaf page %u for key %.*s", mp->pgno, 1278*5d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL); 1279*5d465952Smartinh 1280*5d465952Smartinh *mpp = mp; 1281*5d465952Smartinh return BT_SUCCESS; 1282*5d465952Smartinh } 1283*5d465952Smartinh 1284*5d465952Smartinh /* Search for the page a given key should be in. 1285*5d465952Smartinh * Stores a pointer to the found page in *mpp. 1286*5d465952Smartinh * If key is NULL, search for the lowest page (used by btree_cursor_first). 1287*5d465952Smartinh * If cursor is non-null, pushes parent pages on the cursor stack. 1288*5d465952Smartinh * If modify is true, visited pages are updated with new page numbers. 1289*5d465952Smartinh */ 1290*5d465952Smartinh static int 1291*5d465952Smartinh btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key, 1292*5d465952Smartinh struct cursor *cursor, int modify, struct mpage **mpp) 1293*5d465952Smartinh { 1294*5d465952Smartinh int rc; 1295*5d465952Smartinh pgno_t root; 1296*5d465952Smartinh struct mpage *mp; 1297*5d465952Smartinh 1298*5d465952Smartinh /* Can't modify pages outside a transaction. */ 1299*5d465952Smartinh if (txn == NULL && modify) { 1300*5d465952Smartinh errno = EINVAL; 1301*5d465952Smartinh return BT_FAIL; 1302*5d465952Smartinh } 1303*5d465952Smartinh 1304*5d465952Smartinh /* Choose which root page to start with. If a transaction is given 1305*5d465952Smartinh * use the root page from the transaction, otherwise read the last 1306*5d465952Smartinh * committed root page. 1307*5d465952Smartinh */ 1308*5d465952Smartinh if (txn == NULL) { 1309*5d465952Smartinh if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS) 1310*5d465952Smartinh return rc; 1311*5d465952Smartinh root = bt->meta->root; 1312*5d465952Smartinh } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) { 1313*5d465952Smartinh DPRINTF("transaction has failed, must abort"); 1314*5d465952Smartinh return BT_FAIL; 1315*5d465952Smartinh } else 1316*5d465952Smartinh root = txn->root; 1317*5d465952Smartinh 1318*5d465952Smartinh if (root == P_INVALID) /* Tree is empty. */ 1319*5d465952Smartinh return BT_NOTFOUND; 1320*5d465952Smartinh 1321*5d465952Smartinh if ((rc = btree_get_mpage(bt, root, &mp)) != BT_SUCCESS) 1322*5d465952Smartinh return rc; 1323*5d465952Smartinh 1324*5d465952Smartinh assert(mp->parent == NULL); 1325*5d465952Smartinh assert(mp->prefix.len == 0); 1326*5d465952Smartinh 1327*5d465952Smartinh if (modify && !mp->dirty) { 1328*5d465952Smartinh mpage_touch(bt, mp); 1329*5d465952Smartinh txn->root = mp->pgno; 1330*5d465952Smartinh } 1331*5d465952Smartinh 1332*5d465952Smartinh return btree_search_page_root(bt, mp, key, cursor, modify, mpp); 1333*5d465952Smartinh } 1334*5d465952Smartinh 1335*5d465952Smartinh static int 1336*5d465952Smartinh btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf, 1337*5d465952Smartinh struct btval *data) 1338*5d465952Smartinh { 1339*5d465952Smartinh size_t psz; 1340*5d465952Smartinh size_t max; 1341*5d465952Smartinh size_t sz = 0; 1342*5d465952Smartinh pgno_t pgno; 1343*5d465952Smartinh struct page p; 1344*5d465952Smartinh 1345*5d465952Smartinh bzero(data, sizeof(*data)); 1346*5d465952Smartinh max = PAGESIZE - PAGEHDRSZ; 1347*5d465952Smartinh 1348*5d465952Smartinh if (!F_ISSET(leaf->flags, F_BIGDATA)) { 1349*5d465952Smartinh data->size = leaf->n_dsize; 1350*5d465952Smartinh if (data->size > 0) { 1351*5d465952Smartinh if (mp == NULL) { 1352*5d465952Smartinh if ((data->data = malloc(data->size)) == NULL) 1353*5d465952Smartinh return BT_FAIL; 1354*5d465952Smartinh bcopy(NODEDATA(leaf), data->data, data->size); 1355*5d465952Smartinh data->free_data = 1; 1356*5d465952Smartinh data->mp = NULL; 1357*5d465952Smartinh } else { 1358*5d465952Smartinh data->data = NODEDATA(leaf); 1359*5d465952Smartinh data->free_data = 0; 1360*5d465952Smartinh data->mp = mp; 1361*5d465952Smartinh mp->ref++; 1362*5d465952Smartinh } 1363*5d465952Smartinh } 1364*5d465952Smartinh return BT_SUCCESS; 1365*5d465952Smartinh } 1366*5d465952Smartinh 1367*5d465952Smartinh /* Read overflow data. 1368*5d465952Smartinh */ 1369*5d465952Smartinh DPRINTF("allocating %u byte for overflow data", leaf->n_dsize); 1370*5d465952Smartinh if ((data->data = malloc(leaf->n_dsize)) == NULL) 1371*5d465952Smartinh return BT_FAIL; 1372*5d465952Smartinh data->size = leaf->n_dsize; 1373*5d465952Smartinh data->free_data = 1; 1374*5d465952Smartinh data->mp = NULL; 1375*5d465952Smartinh pgno = *(pgno_t *)NODEDATA(leaf); /* XXX: alignment? */ 1376*5d465952Smartinh for (sz = 0; sz < data->size; ) { 1377*5d465952Smartinh if (btree_read_page(bt, pgno, &p) != 0 || 1378*5d465952Smartinh !F_ISSET(p.flags, P_OVERFLOW)) { 1379*5d465952Smartinh DPRINTF("read overflow page failed (%02x)", p.flags); 1380*5d465952Smartinh free(data->data); 1381*5d465952Smartinh return BT_FAIL; 1382*5d465952Smartinh } 1383*5d465952Smartinh psz = data->size - sz; 1384*5d465952Smartinh if (psz > max) 1385*5d465952Smartinh psz = max; 1386*5d465952Smartinh bcopy(p.ptrs, (char *)data->data + sz, psz); 1387*5d465952Smartinh sz += psz; 1388*5d465952Smartinh pgno = p.p_next_pgno; 1389*5d465952Smartinh } 1390*5d465952Smartinh 1391*5d465952Smartinh return BT_SUCCESS; 1392*5d465952Smartinh } 1393*5d465952Smartinh 1394*5d465952Smartinh int 1395*5d465952Smartinh btree_txn_get(struct btree *bt, struct btree_txn *txn, 1396*5d465952Smartinh struct btval *key, struct btval *data) 1397*5d465952Smartinh { 1398*5d465952Smartinh int rc, exact; 1399*5d465952Smartinh struct node *leaf; 1400*5d465952Smartinh struct mpage *mp; 1401*5d465952Smartinh 1402*5d465952Smartinh assert(bt); 1403*5d465952Smartinh assert(key); 1404*5d465952Smartinh assert(data); 1405*5d465952Smartinh DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data); 1406*5d465952Smartinh 1407*5d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 1408*5d465952Smartinh errno = EINVAL; 1409*5d465952Smartinh return BT_FAIL; 1410*5d465952Smartinh } 1411*5d465952Smartinh 1412*5d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS) 1413*5d465952Smartinh return rc; 1414*5d465952Smartinh 1415*5d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, NULL); 1416*5d465952Smartinh if (leaf && exact) 1417*5d465952Smartinh rc = btree_read_data(bt, mp, leaf, data); 1418*5d465952Smartinh else 1419*5d465952Smartinh rc = BT_NOTFOUND; 1420*5d465952Smartinh 1421*5d465952Smartinh mpage_prune(bt); 1422*5d465952Smartinh return rc; 1423*5d465952Smartinh } 1424*5d465952Smartinh 1425*5d465952Smartinh static int 1426*5d465952Smartinh btree_sibling(struct cursor *cursor, int move_right) 1427*5d465952Smartinh { 1428*5d465952Smartinh int rc; 1429*5d465952Smartinh struct node *indx; 1430*5d465952Smartinh struct ppage *parent, *top; 1431*5d465952Smartinh struct mpage *mp; 1432*5d465952Smartinh 1433*5d465952Smartinh top = CURSOR_TOP(cursor); 1434*5d465952Smartinh if ((parent = SLIST_NEXT(top, entry)) == NULL) 1435*5d465952Smartinh return BT_NOTFOUND; /* root has no siblings */ 1436*5d465952Smartinh 1437*5d465952Smartinh DPRINTF("parent page is page %u, index %u", 1438*5d465952Smartinh parent->mpage->pgno, parent->ki); 1439*5d465952Smartinh 1440*5d465952Smartinh cursor_pop_page(cursor); 1441*5d465952Smartinh if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage)) 1442*5d465952Smartinh : (parent->ki == 0)) { 1443*5d465952Smartinh DPRINTF("no more keys left, moving to %s sibling", 1444*5d465952Smartinh move_right ? "right" : "left"); 1445*5d465952Smartinh if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS) 1446*5d465952Smartinh return rc; 1447*5d465952Smartinh parent = CURSOR_TOP(cursor); 1448*5d465952Smartinh } else { 1449*5d465952Smartinh if (move_right) 1450*5d465952Smartinh parent->ki++; 1451*5d465952Smartinh else 1452*5d465952Smartinh parent->ki--; 1453*5d465952Smartinh DPRINTF("just moving to %s index key %u", 1454*5d465952Smartinh move_right ? "right" : "left", parent->ki); 1455*5d465952Smartinh } 1456*5d465952Smartinh assert(IS_BRANCH(parent->mpage)); 1457*5d465952Smartinh 1458*5d465952Smartinh indx = NODEPTR(parent->mpage, parent->ki); 1459*5d465952Smartinh if (btree_get_mpage(cursor->bt, indx->n_pgno, &mp) != BT_SUCCESS) 1460*5d465952Smartinh return BT_FAIL; 1461*5d465952Smartinh mp->parent = parent->mpage; 1462*5d465952Smartinh mp->parent_index = parent->ki; 1463*5d465952Smartinh 1464*5d465952Smartinh cursor_push_page(cursor, mp); 1465*5d465952Smartinh find_common_prefix(cursor->bt, mp); 1466*5d465952Smartinh 1467*5d465952Smartinh return BT_SUCCESS; 1468*5d465952Smartinh } 1469*5d465952Smartinh 1470*5d465952Smartinh static int 1471*5d465952Smartinh bt_set_key(struct btree *bt, struct mpage *mp, struct node *node, 1472*5d465952Smartinh struct btval *key) 1473*5d465952Smartinh { 1474*5d465952Smartinh if (key == NULL) 1475*5d465952Smartinh return 0; 1476*5d465952Smartinh 1477*5d465952Smartinh if (mp->prefix.len > 0) { 1478*5d465952Smartinh key->size = node->ksize + mp->prefix.len; 1479*5d465952Smartinh key->data = malloc(key->size); 1480*5d465952Smartinh if (key->data == NULL) 1481*5d465952Smartinh return -1; 1482*5d465952Smartinh concat_prefix(bt, 1483*5d465952Smartinh mp->prefix.str, mp->prefix.len, 1484*5d465952Smartinh NODEKEY(node), node->ksize, 1485*5d465952Smartinh key->data, &key->size); 1486*5d465952Smartinh key->free_data = 1; 1487*5d465952Smartinh } else { 1488*5d465952Smartinh key->size = node->ksize; 1489*5d465952Smartinh key->data = NODEKEY(node); 1490*5d465952Smartinh key->free_data = 0; 1491*5d465952Smartinh key->mp = mp; 1492*5d465952Smartinh mp->ref++; 1493*5d465952Smartinh } 1494*5d465952Smartinh 1495*5d465952Smartinh return 0; 1496*5d465952Smartinh } 1497*5d465952Smartinh 1498*5d465952Smartinh static int 1499*5d465952Smartinh btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data) 1500*5d465952Smartinh { 1501*5d465952Smartinh struct ppage *top; 1502*5d465952Smartinh struct mpage *mp; 1503*5d465952Smartinh struct node *leaf; 1504*5d465952Smartinh 1505*5d465952Smartinh if (cursor->eof) 1506*5d465952Smartinh return BT_NOTFOUND; 1507*5d465952Smartinh 1508*5d465952Smartinh assert(cursor->initialized); 1509*5d465952Smartinh 1510*5d465952Smartinh top = CURSOR_TOP(cursor); 1511*5d465952Smartinh mp = top->mpage; 1512*5d465952Smartinh 1513*5d465952Smartinh DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); 1514*5d465952Smartinh 1515*5d465952Smartinh if (top->ki + 1 >= NUMKEYS(mp)) { 1516*5d465952Smartinh DPRINTF("=====> move to next sibling page"); 1517*5d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS) { 1518*5d465952Smartinh cursor->eof = 1; 1519*5d465952Smartinh return BT_NOTFOUND; 1520*5d465952Smartinh } 1521*5d465952Smartinh top = CURSOR_TOP(cursor); 1522*5d465952Smartinh mp = top->mpage; 1523*5d465952Smartinh DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); 1524*5d465952Smartinh } else 1525*5d465952Smartinh top->ki++; 1526*5d465952Smartinh 1527*5d465952Smartinh DPRINTF("==> cursor points to page %u with %lu keys, key index %u", 1528*5d465952Smartinh mp->pgno, NUMKEYS(mp), top->ki); 1529*5d465952Smartinh 1530*5d465952Smartinh assert(IS_LEAF(mp)); 1531*5d465952Smartinh leaf = NODEPTR(mp, top->ki); 1532*5d465952Smartinh 1533*5d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1534*5d465952Smartinh return BT_FAIL; 1535*5d465952Smartinh 1536*5d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1537*5d465952Smartinh return BT_FAIL; 1538*5d465952Smartinh 1539*5d465952Smartinh return BT_SUCCESS; 1540*5d465952Smartinh } 1541*5d465952Smartinh 1542*5d465952Smartinh static int 1543*5d465952Smartinh btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data) 1544*5d465952Smartinh { 1545*5d465952Smartinh int rc; 1546*5d465952Smartinh struct node *leaf; 1547*5d465952Smartinh struct mpage *mp; 1548*5d465952Smartinh struct ppage *top; 1549*5d465952Smartinh 1550*5d465952Smartinh assert(cursor); 1551*5d465952Smartinh assert(key); 1552*5d465952Smartinh assert(key->size > 0); 1553*5d465952Smartinh 1554*5d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp); 1555*5d465952Smartinh if (rc != BT_SUCCESS) 1556*5d465952Smartinh return rc; 1557*5d465952Smartinh assert(IS_LEAF(mp)); 1558*5d465952Smartinh 1559*5d465952Smartinh top = CURSOR_TOP(cursor); 1560*5d465952Smartinh leaf = btree_search_node(cursor->bt, mp, key, NULL, &top->ki); 1561*5d465952Smartinh if (leaf == NULL) { 1562*5d465952Smartinh DPRINTF("===> inexact leaf not found, goto sibling"); 1563*5d465952Smartinh if (btree_sibling(cursor, 1) != BT_SUCCESS) 1564*5d465952Smartinh return BT_NOTFOUND; /* no entries matched */ 1565*5d465952Smartinh top = CURSOR_TOP(cursor); 1566*5d465952Smartinh top->ki = 0; 1567*5d465952Smartinh mp = top->mpage; 1568*5d465952Smartinh assert(IS_LEAF(mp)); 1569*5d465952Smartinh leaf = NODEPTR(mp, 0); 1570*5d465952Smartinh } 1571*5d465952Smartinh 1572*5d465952Smartinh cursor->initialized = 1; 1573*5d465952Smartinh cursor->eof = 0; 1574*5d465952Smartinh 1575*5d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1576*5d465952Smartinh return BT_FAIL; 1577*5d465952Smartinh 1578*5d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1579*5d465952Smartinh return BT_FAIL; 1580*5d465952Smartinh DPRINTF("==> cursor placed on key %.*s", 1581*5d465952Smartinh (int)key->size, (char *)key->data); 1582*5d465952Smartinh 1583*5d465952Smartinh return BT_SUCCESS; 1584*5d465952Smartinh } 1585*5d465952Smartinh 1586*5d465952Smartinh static int 1587*5d465952Smartinh btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data) 1588*5d465952Smartinh { 1589*5d465952Smartinh int rc; 1590*5d465952Smartinh struct mpage *mp; 1591*5d465952Smartinh struct node *leaf; 1592*5d465952Smartinh 1593*5d465952Smartinh rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp); 1594*5d465952Smartinh if (rc != BT_SUCCESS) 1595*5d465952Smartinh return rc; 1596*5d465952Smartinh assert(IS_LEAF(mp)); 1597*5d465952Smartinh 1598*5d465952Smartinh leaf = NODEPTR(mp, 0); 1599*5d465952Smartinh cursor->initialized = 1; 1600*5d465952Smartinh cursor->eof = 0; 1601*5d465952Smartinh 1602*5d465952Smartinh if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS) 1603*5d465952Smartinh return BT_FAIL; 1604*5d465952Smartinh 1605*5d465952Smartinh if (bt_set_key(cursor->bt, mp, leaf, key) != 0) 1606*5d465952Smartinh return BT_FAIL; 1607*5d465952Smartinh 1608*5d465952Smartinh return BT_SUCCESS; 1609*5d465952Smartinh } 1610*5d465952Smartinh 1611*5d465952Smartinh int 1612*5d465952Smartinh btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data, 1613*5d465952Smartinh enum cursor_op op) 1614*5d465952Smartinh { 1615*5d465952Smartinh int rc; 1616*5d465952Smartinh 1617*5d465952Smartinh assert(cursor); 1618*5d465952Smartinh 1619*5d465952Smartinh switch (op) { 1620*5d465952Smartinh case BT_CURSOR: 1621*5d465952Smartinh while (CURSOR_TOP(cursor) != NULL) 1622*5d465952Smartinh cursor_pop_page(cursor); 1623*5d465952Smartinh if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) { 1624*5d465952Smartinh errno = EINVAL; 1625*5d465952Smartinh rc = BT_FAIL; 1626*5d465952Smartinh } else 1627*5d465952Smartinh rc = btree_cursor_set(cursor, key, data); 1628*5d465952Smartinh break; 1629*5d465952Smartinh case BT_NEXT: 1630*5d465952Smartinh if (!cursor->initialized) 1631*5d465952Smartinh rc = btree_cursor_first(cursor, key, data); 1632*5d465952Smartinh else 1633*5d465952Smartinh rc = btree_cursor_next(cursor, key, data); 1634*5d465952Smartinh break; 1635*5d465952Smartinh case BT_FIRST: 1636*5d465952Smartinh while (CURSOR_TOP(cursor) != NULL) 1637*5d465952Smartinh cursor_pop_page(cursor); 1638*5d465952Smartinh rc = btree_cursor_first(cursor, key, data); 1639*5d465952Smartinh break; 1640*5d465952Smartinh default: 1641*5d465952Smartinh DPRINTF("unhandled/unimplemented cursor operation %u", op); 1642*5d465952Smartinh rc = BT_FAIL; 1643*5d465952Smartinh break; 1644*5d465952Smartinh } 1645*5d465952Smartinh 1646*5d465952Smartinh mpage_prune(cursor->bt); 1647*5d465952Smartinh 1648*5d465952Smartinh return rc; 1649*5d465952Smartinh } 1650*5d465952Smartinh 1651*5d465952Smartinh static struct mpage * 1652*5d465952Smartinh btree_new_page(struct btree *bt, uint32_t flags) 1653*5d465952Smartinh { 1654*5d465952Smartinh struct mpage *mp; 1655*5d465952Smartinh 1656*5d465952Smartinh assert(bt != NULL); 1657*5d465952Smartinh assert(bt->txn != NULL); 1658*5d465952Smartinh 1659*5d465952Smartinh DPRINTF("allocating new mpage %u", bt->txn->next_pgno); 1660*5d465952Smartinh if ((mp = calloc(1, sizeof(*mp))) == NULL) 1661*5d465952Smartinh return NULL; 1662*5d465952Smartinh mp->pgno = mp->page.pgno = bt->txn->next_pgno++; 1663*5d465952Smartinh mp->page.flags = flags; 1664*5d465952Smartinh mp->page.lower = PAGEHDRSZ; 1665*5d465952Smartinh mp->page.upper = PAGESIZE; 1666*5d465952Smartinh 1667*5d465952Smartinh if (IS_BRANCH(mp)) 1668*5d465952Smartinh bt->meta->branch_pages++; 1669*5d465952Smartinh else if (IS_LEAF(mp)) 1670*5d465952Smartinh bt->meta->leaf_pages++; 1671*5d465952Smartinh else if (IS_OVERFLOW(mp)) 1672*5d465952Smartinh bt->meta->overflow_pages++; 1673*5d465952Smartinh 1674*5d465952Smartinh mpage_add(bt, mp); 1675*5d465952Smartinh mpage_dirty(bt, mp); 1676*5d465952Smartinh 1677*5d465952Smartinh return mp; 1678*5d465952Smartinh } 1679*5d465952Smartinh 1680*5d465952Smartinh static size_t 1681*5d465952Smartinh bt_leaf_size(struct btval *key, struct btval *data) 1682*5d465952Smartinh { 1683*5d465952Smartinh size_t sz; 1684*5d465952Smartinh 1685*5d465952Smartinh sz = LEAFSIZE(key, data); 1686*5d465952Smartinh if (data->size >= PAGESIZE / BT_MINKEYS) { 1687*5d465952Smartinh /* put on overflow page */ 1688*5d465952Smartinh sz -= data->size - sizeof(pgno_t); 1689*5d465952Smartinh } 1690*5d465952Smartinh 1691*5d465952Smartinh return sz + sizeof(indx_t); 1692*5d465952Smartinh } 1693*5d465952Smartinh 1694*5d465952Smartinh static size_t 1695*5d465952Smartinh bt_branch_size(struct btval *key) 1696*5d465952Smartinh { 1697*5d465952Smartinh size_t sz; 1698*5d465952Smartinh 1699*5d465952Smartinh sz = INDXSIZE(key); 1700*5d465952Smartinh if (sz >= PAGESIZE / BT_MINKEYS) { 1701*5d465952Smartinh /* put on overflow page */ 1702*5d465952Smartinh /* not implemented */ 1703*5d465952Smartinh /* sz -= key->size - sizeof(pgno_t); */ 1704*5d465952Smartinh } 1705*5d465952Smartinh 1706*5d465952Smartinh return sz + sizeof(indx_t); 1707*5d465952Smartinh } 1708*5d465952Smartinh 1709*5d465952Smartinh static int 1710*5d465952Smartinh btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data) 1711*5d465952Smartinh { 1712*5d465952Smartinh size_t done = 0; 1713*5d465952Smartinh size_t sz; 1714*5d465952Smartinh size_t max; 1715*5d465952Smartinh pgno_t *linkp; /* linked page stored here */ 1716*5d465952Smartinh struct mpage *next = NULL; 1717*5d465952Smartinh 1718*5d465952Smartinh max = PAGESIZE - PAGEHDRSZ; 1719*5d465952Smartinh 1720*5d465952Smartinh while (done < data->size) { 1721*5d465952Smartinh linkp = &p->p_next_pgno; 1722*5d465952Smartinh if (data->size - done > max) { 1723*5d465952Smartinh /* need another overflow page */ 1724*5d465952Smartinh if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL) 1725*5d465952Smartinh return BT_FAIL; 1726*5d465952Smartinh *linkp = next->pgno; 1727*5d465952Smartinh DPRINTF("linking overflow page %u", next->pgno); 1728*5d465952Smartinh } else 1729*5d465952Smartinh *linkp = 0; /* indicates end of list */ 1730*5d465952Smartinh sz = data->size - done; 1731*5d465952Smartinh if (sz > max) 1732*5d465952Smartinh sz = max; 1733*5d465952Smartinh DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno); 1734*5d465952Smartinh bcopy((char *)data->data + done, p->ptrs, sz); 1735*5d465952Smartinh done += sz; 1736*5d465952Smartinh p = &next->page; 1737*5d465952Smartinh } 1738*5d465952Smartinh 1739*5d465952Smartinh return BT_SUCCESS; 1740*5d465952Smartinh } 1741*5d465952Smartinh 1742*5d465952Smartinh /* Key prefix should already be stripped. 1743*5d465952Smartinh */ 1744*5d465952Smartinh static int 1745*5d465952Smartinh btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx, 1746*5d465952Smartinh struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags) 1747*5d465952Smartinh { 1748*5d465952Smartinh unsigned int i; 1749*5d465952Smartinh size_t node_size = NODESIZE; 1750*5d465952Smartinh indx_t ofs; 1751*5d465952Smartinh struct node *node; 1752*5d465952Smartinh struct page *p; 1753*5d465952Smartinh struct mpage *ofp = NULL; /* overflow page */ 1754*5d465952Smartinh 1755*5d465952Smartinh p = &mp->page; 1756*5d465952Smartinh assert(p->upper >= p->lower); 1757*5d465952Smartinh 1758*5d465952Smartinh DPRINTF("add node [%.*s] to %s page %u at index %i", 1759*5d465952Smartinh key ? (int)key->size : 0, key ? (char *)key->data : NULL, 1760*5d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 1761*5d465952Smartinh mp->pgno, indx); 1762*5d465952Smartinh 1763*5d465952Smartinh if (key != NULL) 1764*5d465952Smartinh node_size += key->size; 1765*5d465952Smartinh 1766*5d465952Smartinh if (IS_LEAF(mp)) { 1767*5d465952Smartinh assert(data); 1768*5d465952Smartinh node_size += data->size; 1769*5d465952Smartinh if (F_ISSET(flags, F_BIGDATA)) { 1770*5d465952Smartinh /* Data already on overflow page. */ 1771*5d465952Smartinh node_size -= data->size - sizeof(pgno_t); 1772*5d465952Smartinh } else if (data->size >= PAGESIZE / BT_MINKEYS) { 1773*5d465952Smartinh /* Put data on overflow page. */ 1774*5d465952Smartinh DPRINTF("data size is %zu, put on overflow page", 1775*5d465952Smartinh data->size); 1776*5d465952Smartinh node_size -= data->size - sizeof(pgno_t); 1777*5d465952Smartinh if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL) 1778*5d465952Smartinh return BT_FAIL; 1779*5d465952Smartinh DPRINTF("allocated overflow page %u", ofp->pgno); 1780*5d465952Smartinh flags |= F_BIGDATA; 1781*5d465952Smartinh } 1782*5d465952Smartinh } 1783*5d465952Smartinh 1784*5d465952Smartinh if (node_size + sizeof(indx_t) > SIZELEFT(mp)) { 1785*5d465952Smartinh DPRINTF("not enough room in page %u, got %lu ptrs", 1786*5d465952Smartinh mp->pgno, NUMKEYS(mp)); 1787*5d465952Smartinh DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower, 1788*5d465952Smartinh p->upper - p->lower); 1789*5d465952Smartinh DPRINTF("node size = %lu", node_size); 1790*5d465952Smartinh return BT_FAIL; 1791*5d465952Smartinh } 1792*5d465952Smartinh 1793*5d465952Smartinh /* Move higher pointers up one slot. */ 1794*5d465952Smartinh for (i = NUMKEYS(mp); i > indx; i--) 1795*5d465952Smartinh p->ptrs[i] = p->ptrs[i - 1]; 1796*5d465952Smartinh 1797*5d465952Smartinh /* Adjust free space offsets. */ 1798*5d465952Smartinh ofs = p->upper - node_size; 1799*5d465952Smartinh assert(ofs >= p->lower + sizeof(indx_t)); 1800*5d465952Smartinh p->ptrs[indx] = ofs; 1801*5d465952Smartinh p->upper = ofs; 1802*5d465952Smartinh p->lower += sizeof(indx_t); 1803*5d465952Smartinh 1804*5d465952Smartinh /* Write the node data. */ 1805*5d465952Smartinh node = NODEPTR(mp, indx); 1806*5d465952Smartinh node->ksize = (key == NULL) ? 0 : key->size; 1807*5d465952Smartinh node->flags = flags; 1808*5d465952Smartinh if (IS_LEAF(mp)) 1809*5d465952Smartinh node->n_dsize = data->size; 1810*5d465952Smartinh else 1811*5d465952Smartinh node->n_pgno = pgno; 1812*5d465952Smartinh 1813*5d465952Smartinh if (key) 1814*5d465952Smartinh bcopy(key->data, NODEKEY(node), key->size); 1815*5d465952Smartinh 1816*5d465952Smartinh if (IS_LEAF(mp)) { 1817*5d465952Smartinh if (ofp == NULL) { 1818*5d465952Smartinh if (F_ISSET(flags, F_BIGDATA)) 1819*5d465952Smartinh bcopy(data->data, node->data + key->size, 1820*5d465952Smartinh sizeof(pgno_t)); 1821*5d465952Smartinh else 1822*5d465952Smartinh bcopy(data->data, node->data + key->size, 1823*5d465952Smartinh data->size); 1824*5d465952Smartinh } else { 1825*5d465952Smartinh bcopy(&ofp->pgno, node->data + key->size, 1826*5d465952Smartinh sizeof(pgno_t)); 1827*5d465952Smartinh if (btree_write_overflow_data(bt, &ofp->page, 1828*5d465952Smartinh data) == BT_FAIL) 1829*5d465952Smartinh return BT_FAIL; 1830*5d465952Smartinh } 1831*5d465952Smartinh } 1832*5d465952Smartinh 1833*5d465952Smartinh return BT_SUCCESS; 1834*5d465952Smartinh } 1835*5d465952Smartinh 1836*5d465952Smartinh static void 1837*5d465952Smartinh btree_del_node(struct btree *bt, struct mpage *mp, indx_t indx) 1838*5d465952Smartinh { 1839*5d465952Smartinh unsigned int sz; 1840*5d465952Smartinh indx_t i, j, numkeys, ptr; 1841*5d465952Smartinh struct node *node; 1842*5d465952Smartinh char *base; 1843*5d465952Smartinh 1844*5d465952Smartinh DPRINTF("delete node %u on %s page %u", indx, 1845*5d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno); 1846*5d465952Smartinh assert(indx < NUMKEYS(mp)); 1847*5d465952Smartinh 1848*5d465952Smartinh node = NODEPTR(mp, indx); 1849*5d465952Smartinh sz = NODESIZE + node->ksize; 1850*5d465952Smartinh if (IS_LEAF(mp)) { 1851*5d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA)) 1852*5d465952Smartinh sz += sizeof(pgno_t); 1853*5d465952Smartinh else 1854*5d465952Smartinh sz += NODEDSZ(node); 1855*5d465952Smartinh } 1856*5d465952Smartinh 1857*5d465952Smartinh ptr = mp->page.ptrs[indx]; 1858*5d465952Smartinh numkeys = NUMKEYS(mp); 1859*5d465952Smartinh for (i = j = 0; i < numkeys; i++) { 1860*5d465952Smartinh if (i != indx) { 1861*5d465952Smartinh mp->page.ptrs[j] = mp->page.ptrs[i]; 1862*5d465952Smartinh if (mp->page.ptrs[i] < ptr) 1863*5d465952Smartinh mp->page.ptrs[j] += sz; 1864*5d465952Smartinh j++; 1865*5d465952Smartinh } 1866*5d465952Smartinh } 1867*5d465952Smartinh 1868*5d465952Smartinh base = (char *)&mp->page + mp->page.upper; 1869*5d465952Smartinh bcopy(base, base + sz, ptr - mp->page.upper); 1870*5d465952Smartinh 1871*5d465952Smartinh mp->page.lower -= sizeof(indx_t); 1872*5d465952Smartinh mp->page.upper += sz; 1873*5d465952Smartinh } 1874*5d465952Smartinh 1875*5d465952Smartinh struct cursor * 1876*5d465952Smartinh btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn) 1877*5d465952Smartinh { 1878*5d465952Smartinh struct cursor *cursor; 1879*5d465952Smartinh 1880*5d465952Smartinh if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { 1881*5d465952Smartinh SLIST_INIT(&cursor->stack); 1882*5d465952Smartinh cursor->bt = bt; 1883*5d465952Smartinh cursor->txn = txn; 1884*5d465952Smartinh bt->ref++; 1885*5d465952Smartinh } 1886*5d465952Smartinh 1887*5d465952Smartinh return cursor; 1888*5d465952Smartinh } 1889*5d465952Smartinh 1890*5d465952Smartinh void 1891*5d465952Smartinh btree_cursor_close(struct cursor *cursor) 1892*5d465952Smartinh { 1893*5d465952Smartinh if (cursor != NULL) { 1894*5d465952Smartinh while (!CURSOR_EMPTY(cursor)) 1895*5d465952Smartinh cursor_pop_page(cursor); 1896*5d465952Smartinh 1897*5d465952Smartinh btree_close(cursor->bt); 1898*5d465952Smartinh free(cursor); 1899*5d465952Smartinh } 1900*5d465952Smartinh } 1901*5d465952Smartinh 1902*5d465952Smartinh static int 1903*5d465952Smartinh btree_update_key(struct btree *bt, struct mpage *mp, indx_t indx, 1904*5d465952Smartinh struct btval *key) 1905*5d465952Smartinh { 1906*5d465952Smartinh indx_t ptr, i, numkeys; 1907*5d465952Smartinh int delta; 1908*5d465952Smartinh size_t len; 1909*5d465952Smartinh struct node *node; 1910*5d465952Smartinh char *base; 1911*5d465952Smartinh 1912*5d465952Smartinh node = NODEPTR(mp, indx); 1913*5d465952Smartinh ptr = mp->page.ptrs[indx]; 1914*5d465952Smartinh DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] on page %u", 1915*5d465952Smartinh indx, ptr, 1916*5d465952Smartinh (int)node->ksize, (char *)NODEKEY(node), 1917*5d465952Smartinh (int)key->size, (char *)key->data, 1918*5d465952Smartinh mp->pgno); 1919*5d465952Smartinh 1920*5d465952Smartinh if (key->size != node->ksize) { 1921*5d465952Smartinh delta = key->size - node->ksize; 1922*5d465952Smartinh if (delta > 0 && SIZELEFT(mp) < delta) { 1923*5d465952Smartinh DPRINTF("OUCH! Not enough room, delta = %d", delta); 1924*5d465952Smartinh return BT_FAIL; 1925*5d465952Smartinh } 1926*5d465952Smartinh 1927*5d465952Smartinh numkeys = NUMKEYS(mp); 1928*5d465952Smartinh for (i = 0; i < numkeys; i++) { 1929*5d465952Smartinh if (mp->page.ptrs[i] <= ptr) 1930*5d465952Smartinh mp->page.ptrs[i] -= delta; 1931*5d465952Smartinh } 1932*5d465952Smartinh 1933*5d465952Smartinh base = (char *)&mp->page + mp->page.upper; 1934*5d465952Smartinh len = ptr - mp->page.upper + NODESIZE; 1935*5d465952Smartinh bcopy(base, base - delta, len); 1936*5d465952Smartinh mp->page.upper -= delta; 1937*5d465952Smartinh 1938*5d465952Smartinh node = NODEPTR(mp, indx); 1939*5d465952Smartinh node->ksize = key->size; 1940*5d465952Smartinh } 1941*5d465952Smartinh 1942*5d465952Smartinh bcopy(key->data, NODEKEY(node), key->size); 1943*5d465952Smartinh 1944*5d465952Smartinh return BT_SUCCESS; 1945*5d465952Smartinh } 1946*5d465952Smartinh 1947*5d465952Smartinh static int 1948*5d465952Smartinh btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta) 1949*5d465952Smartinh { 1950*5d465952Smartinh indx_t i; 1951*5d465952Smartinh struct node *node; 1952*5d465952Smartinh struct btkey tmpkey; 1953*5d465952Smartinh struct btval key; 1954*5d465952Smartinh 1955*5d465952Smartinh DPRINTF("adjusting prefix lengths on page %u with delta %d", 1956*5d465952Smartinh src->pgno, delta); 1957*5d465952Smartinh assert(delta != 0); 1958*5d465952Smartinh 1959*5d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) { 1960*5d465952Smartinh node = NODEPTR(src, i); 1961*5d465952Smartinh tmpkey.len = node->ksize - delta; 1962*5d465952Smartinh if (delta > 0) { 1963*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) 1964*5d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, tmpkey.len); 1965*5d465952Smartinh else 1966*5d465952Smartinh bcopy((char *)NODEKEY(node) + delta, tmpkey.str, 1967*5d465952Smartinh tmpkey.len); 1968*5d465952Smartinh } else { 1969*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 1970*5d465952Smartinh bcopy(NODEKEY(node), tmpkey.str, node->ksize); 1971*5d465952Smartinh bcopy(src->prefix.str, tmpkey.str + node->ksize, 1972*5d465952Smartinh -delta); 1973*5d465952Smartinh } else { 1974*5d465952Smartinh bcopy(src->prefix.str + src->prefix.len + delta, 1975*5d465952Smartinh tmpkey.str, -delta); 1976*5d465952Smartinh bcopy(NODEKEY(node), tmpkey.str - delta, 1977*5d465952Smartinh node->ksize); 1978*5d465952Smartinh } 1979*5d465952Smartinh } 1980*5d465952Smartinh key.size = tmpkey.len; 1981*5d465952Smartinh key.data = tmpkey.str; 1982*5d465952Smartinh if (btree_update_key(bt, src, i, &key) != BT_SUCCESS) 1983*5d465952Smartinh return BT_FAIL; 1984*5d465952Smartinh } 1985*5d465952Smartinh 1986*5d465952Smartinh return BT_SUCCESS; 1987*5d465952Smartinh } 1988*5d465952Smartinh 1989*5d465952Smartinh /* Move a node from src to dst. 1990*5d465952Smartinh */ 1991*5d465952Smartinh static int 1992*5d465952Smartinh btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx, 1993*5d465952Smartinh struct mpage *dst, indx_t dstindx) 1994*5d465952Smartinh { 1995*5d465952Smartinh int rc; 1996*5d465952Smartinh unsigned int pfxlen, mp_pfxlen = 0; 1997*5d465952Smartinh struct node *node, *srcnode; 1998*5d465952Smartinh struct mpage *mp; 1999*5d465952Smartinh struct btkey tmpkey, srckey; 2000*5d465952Smartinh struct btval key, data; 2001*5d465952Smartinh 2002*5d465952Smartinh assert(src->parent); 2003*5d465952Smartinh assert(dst->parent); 2004*5d465952Smartinh 2005*5d465952Smartinh srcnode = NODEPTR(src, srcindx); 2006*5d465952Smartinh DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", 2007*5d465952Smartinh IS_LEAF(src) ? "leaf" : "branch", 2008*5d465952Smartinh srcindx, 2009*5d465952Smartinh (int)srcnode->ksize, (char *)NODEKEY(srcnode), 2010*5d465952Smartinh src->pgno, 2011*5d465952Smartinh dstindx, dst->pgno); 2012*5d465952Smartinh 2013*5d465952Smartinh if (IS_BRANCH(src)) { 2014*5d465952Smartinh /* Need to check if the page the moved node points to 2015*5d465952Smartinh * changes prefix. 2016*5d465952Smartinh */ 2017*5d465952Smartinh btree_get_mpage(bt, NODEPGNO(srcnode), &mp); 2018*5d465952Smartinh mp->parent = src; 2019*5d465952Smartinh mp->parent_index = srcindx; 2020*5d465952Smartinh find_common_prefix(bt, mp); 2021*5d465952Smartinh mp_pfxlen = mp->prefix.len; 2022*5d465952Smartinh } 2023*5d465952Smartinh 2024*5d465952Smartinh /* Mark src and dst as dirty. */ 2025*5d465952Smartinh mpage_touch(bt, src); 2026*5d465952Smartinh mpage_touch(bt, dst); 2027*5d465952Smartinh 2028*5d465952Smartinh find_common_prefix(bt, dst); 2029*5d465952Smartinh 2030*5d465952Smartinh /* Check if src node has destination page prefix. Otherwise the 2031*5d465952Smartinh * destination page must expand its prefix on all its nodes. 2032*5d465952Smartinh */ 2033*5d465952Smartinh srckey.len = srcnode->ksize; 2034*5d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srckey.len); 2035*5d465952Smartinh common_prefix(bt, &srckey, &dst->prefix, &tmpkey); 2036*5d465952Smartinh if (tmpkey.len != dst->prefix.len) { 2037*5d465952Smartinh if (btree_adjust_prefix(bt, dst, 2038*5d465952Smartinh tmpkey.len - dst->prefix.len) != BT_SUCCESS) 2039*5d465952Smartinh return BT_FAIL; 2040*5d465952Smartinh bcopy(&tmpkey, &dst->prefix, sizeof(tmpkey)); 2041*5d465952Smartinh } 2042*5d465952Smartinh 2043*5d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) { 2044*5d465952Smartinh struct mpage *low; 2045*5d465952Smartinh 2046*5d465952Smartinh /* must find the lowest key below src 2047*5d465952Smartinh */ 2048*5d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2049*5d465952Smartinh &low) == BT_SUCCESS); 2050*5d465952Smartinh expand_prefix(bt, low, 0, &srckey); 2051*5d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u", 2052*5d465952Smartinh (int)srckey.len, srckey.str, low->pgno); 2053*5d465952Smartinh } else { 2054*5d465952Smartinh srckey.len = srcnode->ksize; 2055*5d465952Smartinh bcopy(NODEKEY(srcnode), srckey.str, srcnode->ksize); 2056*5d465952Smartinh } 2057*5d465952Smartinh find_common_prefix(bt, src); 2058*5d465952Smartinh 2059*5d465952Smartinh /* expand the prefix */ 2060*5d465952Smartinh tmpkey.len = sizeof(tmpkey.str); 2061*5d465952Smartinh concat_prefix(bt, src->prefix.str, src->prefix.len, 2062*5d465952Smartinh srckey.str, srckey.len, tmpkey.str, &tmpkey.len); 2063*5d465952Smartinh 2064*5d465952Smartinh /* Add the node to the destination page. Adjust prefix for 2065*5d465952Smartinh * destination page. 2066*5d465952Smartinh */ 2067*5d465952Smartinh key.size = tmpkey.len; 2068*5d465952Smartinh key.data = tmpkey.str; 2069*5d465952Smartinh remove_prefix(bt, &key, dst->prefix.len); 2070*5d465952Smartinh data.size = NODEDSZ(srcnode); 2071*5d465952Smartinh data.data = NODEDATA(srcnode); 2072*5d465952Smartinh rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode), 2073*5d465952Smartinh srcnode->flags); 2074*5d465952Smartinh if (rc != BT_SUCCESS) 2075*5d465952Smartinh return rc; 2076*5d465952Smartinh 2077*5d465952Smartinh /* Delete the node from the source page. 2078*5d465952Smartinh */ 2079*5d465952Smartinh btree_del_node(bt, src, srcindx); 2080*5d465952Smartinh 2081*5d465952Smartinh /* Update the parent separators. 2082*5d465952Smartinh */ 2083*5d465952Smartinh if (srcindx == 0 && src->parent_index != 0) { 2084*5d465952Smartinh node = NODEPTR(src->parent, src->parent_index); 2085*5d465952Smartinh DPRINTF("current parent separator for source page %u is [%.*s]", 2086*5d465952Smartinh src->pgno, 2087*5d465952Smartinh (int)node->ksize, (char *)NODEKEY(node)); 2088*5d465952Smartinh 2089*5d465952Smartinh expand_prefix(bt, src, 0, &tmpkey); 2090*5d465952Smartinh key.size = tmpkey.len; 2091*5d465952Smartinh key.data = tmpkey.str; 2092*5d465952Smartinh remove_prefix(bt, &key, src->parent->prefix.len); 2093*5d465952Smartinh 2094*5d465952Smartinh DPRINTF("update separator for source page %u to [%.*s]", 2095*5d465952Smartinh src->pgno, (int)key.size, (char *)key.data); 2096*5d465952Smartinh // ? bt_reduce_separator(bt, node, &key); 2097*5d465952Smartinh if (btree_update_key(bt, src->parent, src->parent_index, 2098*5d465952Smartinh &key) != BT_SUCCESS) 2099*5d465952Smartinh return BT_FAIL; 2100*5d465952Smartinh } 2101*5d465952Smartinh 2102*5d465952Smartinh if (srcindx == 0 && IS_BRANCH(src)) { 2103*5d465952Smartinh struct btval nullkey; 2104*5d465952Smartinh nullkey.size = 0; 2105*5d465952Smartinh assert(btree_update_key(bt, src, 0, &nullkey) == BT_SUCCESS); 2106*5d465952Smartinh } 2107*5d465952Smartinh 2108*5d465952Smartinh if (dstindx == 0 && dst->parent_index != 0) { 2109*5d465952Smartinh node = NODEPTR(dst->parent, dst->parent_index); 2110*5d465952Smartinh DPRINTF("current parent separator for destination page %u is [%.*s]", 2111*5d465952Smartinh dst->pgno, 2112*5d465952Smartinh (int)node->ksize, (char *)NODEKEY(node)); 2113*5d465952Smartinh 2114*5d465952Smartinh expand_prefix(bt, dst, 0, &tmpkey); 2115*5d465952Smartinh key.size = tmpkey.len; 2116*5d465952Smartinh key.data = tmpkey.str; 2117*5d465952Smartinh remove_prefix(bt, &key, dst->parent->prefix.len); 2118*5d465952Smartinh 2119*5d465952Smartinh DPRINTF("update separator for destination page %u to [%.*s]", 2120*5d465952Smartinh dst->pgno, (int)key.size, (char *)key.data); 2121*5d465952Smartinh // ? bt_reduce_separator(bt, node, &key); 2122*5d465952Smartinh if (btree_update_key(bt, dst->parent, dst->parent_index, 2123*5d465952Smartinh &key) != BT_SUCCESS) 2124*5d465952Smartinh return BT_FAIL; 2125*5d465952Smartinh } 2126*5d465952Smartinh 2127*5d465952Smartinh if (dstindx == 0 && IS_BRANCH(dst)) { 2128*5d465952Smartinh struct btval nullkey; 2129*5d465952Smartinh nullkey.size = 0; 2130*5d465952Smartinh assert(btree_update_key(bt, dst, 0, &nullkey) == BT_SUCCESS); 2131*5d465952Smartinh } 2132*5d465952Smartinh 2133*5d465952Smartinh /* We can get a new page prefix here! 2134*5d465952Smartinh * Must update keys in all nodes of this page! 2135*5d465952Smartinh */ 2136*5d465952Smartinh pfxlen = src->prefix.len; 2137*5d465952Smartinh find_common_prefix(bt, src); 2138*5d465952Smartinh if (src->prefix.len != pfxlen) { 2139*5d465952Smartinh if (btree_adjust_prefix(bt, src, 2140*5d465952Smartinh src->prefix.len - pfxlen) != BT_SUCCESS) 2141*5d465952Smartinh return BT_FAIL; 2142*5d465952Smartinh } 2143*5d465952Smartinh 2144*5d465952Smartinh pfxlen = dst->prefix.len; 2145*5d465952Smartinh find_common_prefix(bt, dst); 2146*5d465952Smartinh if (dst->prefix.len != pfxlen) { 2147*5d465952Smartinh if (btree_adjust_prefix(bt, dst, 2148*5d465952Smartinh dst->prefix.len - pfxlen) != BT_SUCCESS) 2149*5d465952Smartinh return BT_FAIL; 2150*5d465952Smartinh } 2151*5d465952Smartinh 2152*5d465952Smartinh if (IS_BRANCH(dst)) { 2153*5d465952Smartinh mp->parent = dst; 2154*5d465952Smartinh mp->parent_index = dstindx; 2155*5d465952Smartinh find_common_prefix(bt, mp); 2156*5d465952Smartinh if (mp->prefix.len != mp_pfxlen) { 2157*5d465952Smartinh DPRINTF("moved branch node has changed prefix"); 2158*5d465952Smartinh mpage_touch(bt, mp); 2159*5d465952Smartinh if (btree_adjust_prefix(bt, mp, 2160*5d465952Smartinh mp->prefix.len - mp_pfxlen) != BT_SUCCESS) 2161*5d465952Smartinh return BT_FAIL; 2162*5d465952Smartinh } 2163*5d465952Smartinh } 2164*5d465952Smartinh 2165*5d465952Smartinh return BT_SUCCESS; 2166*5d465952Smartinh } 2167*5d465952Smartinh 2168*5d465952Smartinh static int 2169*5d465952Smartinh btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst) 2170*5d465952Smartinh { 2171*5d465952Smartinh int rc; 2172*5d465952Smartinh indx_t i; 2173*5d465952Smartinh struct node *srcnode; 2174*5d465952Smartinh struct btkey tmpkey, dstpfx; 2175*5d465952Smartinh struct btval key, data; 2176*5d465952Smartinh 2177*5d465952Smartinh DPRINTF("merging page %u and %u", src->pgno, dst->pgno); 2178*5d465952Smartinh 2179*5d465952Smartinh assert(src->parent); /* can't merge root page */ 2180*5d465952Smartinh assert(dst->parent); 2181*5d465952Smartinh assert(bt->txn != NULL); 2182*5d465952Smartinh 2183*5d465952Smartinh /* Mark src and dst as dirty. */ 2184*5d465952Smartinh mpage_touch(bt, src); 2185*5d465952Smartinh mpage_touch(bt, dst); 2186*5d465952Smartinh 2187*5d465952Smartinh find_common_prefix(bt, src); 2188*5d465952Smartinh find_common_prefix(bt, dst); 2189*5d465952Smartinh 2190*5d465952Smartinh /* Check if source nodes has destination page prefix. Otherwise 2191*5d465952Smartinh * the destination page must expand its prefix on all its nodes. 2192*5d465952Smartinh */ 2193*5d465952Smartinh common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx); 2194*5d465952Smartinh if (dstpfx.len != dst->prefix.len) { 2195*5d465952Smartinh if (btree_adjust_prefix(bt, dst, 2196*5d465952Smartinh dstpfx.len - dst->prefix.len) != BT_SUCCESS) 2197*5d465952Smartinh return BT_FAIL; 2198*5d465952Smartinh bcopy(&dstpfx, &dst->prefix, sizeof(dstpfx)); 2199*5d465952Smartinh } 2200*5d465952Smartinh 2201*5d465952Smartinh /* Move all nodes from src to dst. 2202*5d465952Smartinh */ 2203*5d465952Smartinh for (i = 0; i < NUMKEYS(src); i++) { 2204*5d465952Smartinh srcnode = NODEPTR(src, i); 2205*5d465952Smartinh 2206*5d465952Smartinh /* If branch node 0 (implicit key), find the real key. 2207*5d465952Smartinh */ 2208*5d465952Smartinh if (i == 0 && IS_BRANCH(src)) { 2209*5d465952Smartinh struct mpage *low; 2210*5d465952Smartinh 2211*5d465952Smartinh /* must find the lowest key below src 2212*5d465952Smartinh */ 2213*5d465952Smartinh assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2214*5d465952Smartinh &low) == BT_SUCCESS); 2215*5d465952Smartinh expand_prefix(bt, low, 0, &tmpkey); 2216*5d465952Smartinh DPRINTF("found lowest key [%.*s] on leaf page %u", 2217*5d465952Smartinh (int)tmpkey.len, tmpkey.str, low->pgno); 2218*5d465952Smartinh } else { 2219*5d465952Smartinh expand_prefix(bt, src, i, &tmpkey); 2220*5d465952Smartinh } 2221*5d465952Smartinh 2222*5d465952Smartinh key.size = tmpkey.len; 2223*5d465952Smartinh key.data = tmpkey.str; 2224*5d465952Smartinh 2225*5d465952Smartinh remove_prefix(bt, &key, dst->prefix.len); 2226*5d465952Smartinh data.size = NODEDSZ(srcnode); 2227*5d465952Smartinh data.data = NODEDATA(srcnode); 2228*5d465952Smartinh rc = btree_add_node(bt, dst, NUMKEYS(dst), &key, 2229*5d465952Smartinh &data, NODEPGNO(srcnode), srcnode->flags); 2230*5d465952Smartinh if (rc != BT_SUCCESS) 2231*5d465952Smartinh return rc; 2232*5d465952Smartinh } 2233*5d465952Smartinh 2234*5d465952Smartinh DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", 2235*5d465952Smartinh dst->pgno, NUMKEYS(dst), PAGEFILL(dst) * 100); 2236*5d465952Smartinh 2237*5d465952Smartinh /* Unlink the src page from parent. 2238*5d465952Smartinh */ 2239*5d465952Smartinh btree_del_node(bt, src->parent, src->parent_index); 2240*5d465952Smartinh if (src->parent_index == 0) { 2241*5d465952Smartinh key.size = 0; 2242*5d465952Smartinh if (btree_update_key(bt, src->parent, 0, &key) != BT_SUCCESS) 2243*5d465952Smartinh return BT_FAIL; 2244*5d465952Smartinh 2245*5d465952Smartinh unsigned int pfxlen = src->prefix.len; 2246*5d465952Smartinh find_common_prefix(bt, src); 2247*5d465952Smartinh assert (src->prefix.len == pfxlen); 2248*5d465952Smartinh } 2249*5d465952Smartinh 2250*5d465952Smartinh if (IS_LEAF(src)) 2251*5d465952Smartinh bt->meta->leaf_pages--; 2252*5d465952Smartinh else 2253*5d465952Smartinh bt->meta->branch_pages--; 2254*5d465952Smartinh 2255*5d465952Smartinh return btree_rebalance(bt, src->parent); 2256*5d465952Smartinh } 2257*5d465952Smartinh 2258*5d465952Smartinh #define FILL_THRESHOLD 0.25 2259*5d465952Smartinh 2260*5d465952Smartinh static int 2261*5d465952Smartinh btree_rebalance(struct btree *bt, struct mpage *mp) 2262*5d465952Smartinh { 2263*5d465952Smartinh indx_t si = 0, di = 0; 2264*5d465952Smartinh struct mpage *parent; 2265*5d465952Smartinh struct mpage *neighbor = NULL; 2266*5d465952Smartinh 2267*5d465952Smartinh assert(bt != NULL); 2268*5d465952Smartinh assert(bt->txn != NULL); 2269*5d465952Smartinh assert(mp != NULL); 2270*5d465952Smartinh 2271*5d465952Smartinh DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", 2272*5d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", 2273*5d465952Smartinh mp->pgno, NUMKEYS(mp), PAGEFILL(mp) * 100); 2274*5d465952Smartinh 2275*5d465952Smartinh if (PAGEFILL(mp) >= FILL_THRESHOLD) { 2276*5d465952Smartinh DPRINTF("no need to rebalance page %u, above fill threshold", 2277*5d465952Smartinh mp->pgno); 2278*5d465952Smartinh return BT_SUCCESS; 2279*5d465952Smartinh } 2280*5d465952Smartinh 2281*5d465952Smartinh parent = mp->parent; 2282*5d465952Smartinh 2283*5d465952Smartinh if (parent == NULL) { 2284*5d465952Smartinh if (NUMKEYS(mp) == 0) { 2285*5d465952Smartinh DPRINTF("tree is completely empty"); 2286*5d465952Smartinh bt->txn->root = P_INVALID; 2287*5d465952Smartinh bt->meta->depth--; 2288*5d465952Smartinh bt->meta->leaf_pages--; 2289*5d465952Smartinh } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 2290*5d465952Smartinh DPRINTF("collapsing root page!"); 2291*5d465952Smartinh bt->txn->root = NODEPGNO(NODEPTR(mp, 0)); 2292*5d465952Smartinh bt->meta->depth--; 2293*5d465952Smartinh bt->meta->branch_pages--; 2294*5d465952Smartinh } else 2295*5d465952Smartinh DPRINTF("root page doesn't need rebalancing"); 2296*5d465952Smartinh return BT_SUCCESS; 2297*5d465952Smartinh } 2298*5d465952Smartinh 2299*5d465952Smartinh /* The parent (branch page) must have at least 2 pointers, 2300*5d465952Smartinh * otherwise the tree is invalid. 2301*5d465952Smartinh */ 2302*5d465952Smartinh assert(NUMKEYS(parent) > 1); 2303*5d465952Smartinh 2304*5d465952Smartinh /* Leaf page fill factor is below the threshold. 2305*5d465952Smartinh * Try to move keys from left or right neighbor, or 2306*5d465952Smartinh * merge with a neighbor page. 2307*5d465952Smartinh */ 2308*5d465952Smartinh 2309*5d465952Smartinh /* Find neighbors. 2310*5d465952Smartinh */ 2311*5d465952Smartinh if (mp->parent_index == 0) { 2312*5d465952Smartinh /* We're the leftmost leaf in our parent. 2313*5d465952Smartinh */ 2314*5d465952Smartinh DPRINTF("reading right neighbor"); 2315*5d465952Smartinh if (btree_get_mpage(bt, 2316*5d465952Smartinh NODEPGNO(NODEPTR(parent, mp->parent_index + 1)), 2317*5d465952Smartinh &neighbor) != BT_SUCCESS) { 2318*5d465952Smartinh return BT_FAIL; 2319*5d465952Smartinh } 2320*5d465952Smartinh neighbor->parent_index = mp->parent_index + 1; 2321*5d465952Smartinh si = 0; 2322*5d465952Smartinh di = NUMKEYS(mp); 2323*5d465952Smartinh } else { 2324*5d465952Smartinh /* There is at least one neighbor to the left. 2325*5d465952Smartinh */ 2326*5d465952Smartinh DPRINTF("reading left neighbor"); 2327*5d465952Smartinh if (btree_get_mpage(bt, 2328*5d465952Smartinh NODEPGNO(NODEPTR(parent, mp->parent_index - 1)), 2329*5d465952Smartinh &neighbor) != BT_SUCCESS) { 2330*5d465952Smartinh return BT_FAIL; 2331*5d465952Smartinh } 2332*5d465952Smartinh neighbor->parent_index = mp->parent_index - 1; 2333*5d465952Smartinh si = NUMKEYS(neighbor) - 1; 2334*5d465952Smartinh di = 0; 2335*5d465952Smartinh } 2336*5d465952Smartinh neighbor->parent = parent; 2337*5d465952Smartinh 2338*5d465952Smartinh DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)", 2339*5d465952Smartinh neighbor->pgno, NUMKEYS(neighbor), PAGEFILL(neighbor) * 100); 2340*5d465952Smartinh 2341*5d465952Smartinh /* If the neighbor page is above threshold and has at least two 2342*5d465952Smartinh * keys, move one key from it. 2343*5d465952Smartinh * 2344*5d465952Smartinh * Otherwise we should try to merge them, but that might not be 2345*5d465952Smartinh * possible, even if both are below threshold, as prefix expansion 2346*5d465952Smartinh * might make keys larger. FIXME: detect this 2347*5d465952Smartinh */ 2348*5d465952Smartinh if (PAGEFILL(neighbor) >= FILL_THRESHOLD && 2349*5d465952Smartinh NUMKEYS(neighbor) >= NUMKEYS(mp) + 2) 2350*5d465952Smartinh return btree_move_node(bt, neighbor, si, mp, di); 2351*5d465952Smartinh else { /* FIXME: if (has_enough_room()) */ 2352*5d465952Smartinh if (mp->parent_index == 0) 2353*5d465952Smartinh return btree_merge(bt, neighbor, mp); 2354*5d465952Smartinh else 2355*5d465952Smartinh return btree_merge(bt, mp, neighbor); 2356*5d465952Smartinh } 2357*5d465952Smartinh } 2358*5d465952Smartinh 2359*5d465952Smartinh int 2360*5d465952Smartinh btree_txn_del(struct btree *bt, struct btree_txn *txn, 2361*5d465952Smartinh struct btval *key, struct btval *data) 2362*5d465952Smartinh { 2363*5d465952Smartinh int rc, exact, close_txn = 0; 2364*5d465952Smartinh unsigned int ki; 2365*5d465952Smartinh struct node *leaf; 2366*5d465952Smartinh struct mpage *mp; 2367*5d465952Smartinh 2368*5d465952Smartinh DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data); 2369*5d465952Smartinh 2370*5d465952Smartinh assert(bt != NULL); 2371*5d465952Smartinh assert(key != NULL); 2372*5d465952Smartinh 2373*5d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 2374*5d465952Smartinh errno = EINVAL; 2375*5d465952Smartinh return BT_FAIL; 2376*5d465952Smartinh } 2377*5d465952Smartinh 2378*5d465952Smartinh if (txn == NULL) { 2379*5d465952Smartinh close_txn = 1; 2380*5d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 2381*5d465952Smartinh return BT_FAIL; 2382*5d465952Smartinh } 2383*5d465952Smartinh 2384*5d465952Smartinh if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS) 2385*5d465952Smartinh goto done; 2386*5d465952Smartinh 2387*5d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki); 2388*5d465952Smartinh if (leaf == NULL || !exact) { 2389*5d465952Smartinh rc = BT_NOTFOUND; 2390*5d465952Smartinh goto done; 2391*5d465952Smartinh } 2392*5d465952Smartinh 2393*5d465952Smartinh if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS) 2394*5d465952Smartinh goto done; 2395*5d465952Smartinh 2396*5d465952Smartinh btree_del_node(bt, mp, ki); 2397*5d465952Smartinh bt->meta->entries--; 2398*5d465952Smartinh rc = btree_rebalance(bt, mp); 2399*5d465952Smartinh if (rc != BT_SUCCESS) 2400*5d465952Smartinh txn->flags |= BT_TXN_ERROR; 2401*5d465952Smartinh 2402*5d465952Smartinh done: 2403*5d465952Smartinh if (close_txn) { 2404*5d465952Smartinh if (rc == BT_SUCCESS) 2405*5d465952Smartinh rc = btree_txn_commit(txn); 2406*5d465952Smartinh else 2407*5d465952Smartinh btree_txn_abort(txn); 2408*5d465952Smartinh } 2409*5d465952Smartinh mpage_prune(bt); 2410*5d465952Smartinh return rc; 2411*5d465952Smartinh } 2412*5d465952Smartinh 2413*5d465952Smartinh /* Reduce the length of the prefix separator <sep> to the minimum length that 2414*5d465952Smartinh * still makes it uniquely distinguishable from <min>. 2415*5d465952Smartinh * 2416*5d465952Smartinh * <min> is guaranteed to be sorted less than <sep> 2417*5d465952Smartinh * 2418*5d465952Smartinh * On return, <sep> is modified to the minimum length. 2419*5d465952Smartinh */ 2420*5d465952Smartinh static void 2421*5d465952Smartinh bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep) 2422*5d465952Smartinh { 2423*5d465952Smartinh size_t n = 0; 2424*5d465952Smartinh char *p1; 2425*5d465952Smartinh char *p2; 2426*5d465952Smartinh 2427*5d465952Smartinh if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2428*5d465952Smartinh 2429*5d465952Smartinh assert(sep->size > 0); 2430*5d465952Smartinh 2431*5d465952Smartinh p1 = (char *)NODEKEY(min) + min->ksize - 1; 2432*5d465952Smartinh p2 = (char *)sep->data + sep->size - 1; 2433*5d465952Smartinh 2434*5d465952Smartinh while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) { 2435*5d465952Smartinh assert(p2 > (char *)sep->data); 2436*5d465952Smartinh p1--; 2437*5d465952Smartinh p2--; 2438*5d465952Smartinh n++; 2439*5d465952Smartinh } 2440*5d465952Smartinh 2441*5d465952Smartinh sep->data = p2; 2442*5d465952Smartinh sep->size = n + 1; 2443*5d465952Smartinh } else { 2444*5d465952Smartinh 2445*5d465952Smartinh assert(min->ksize > 0); 2446*5d465952Smartinh assert(sep->size > 0); 2447*5d465952Smartinh 2448*5d465952Smartinh p1 = (char *)NODEKEY(min); 2449*5d465952Smartinh p2 = (char *)sep->data; 2450*5d465952Smartinh 2451*5d465952Smartinh while (*p1 == *p2) { 2452*5d465952Smartinh p1++; 2453*5d465952Smartinh p2++; 2454*5d465952Smartinh n++; 2455*5d465952Smartinh if (n == min->ksize || n == sep->size) 2456*5d465952Smartinh break; 2457*5d465952Smartinh } 2458*5d465952Smartinh 2459*5d465952Smartinh sep->size = n + 1; 2460*5d465952Smartinh } 2461*5d465952Smartinh 2462*5d465952Smartinh DPRINTF("reduced separator to [%.*s] > [%.*s]", 2463*5d465952Smartinh (int)sep->size, (char *)sep->data, 2464*5d465952Smartinh (int)min->ksize, (char *)NODEKEY(min)); 2465*5d465952Smartinh } 2466*5d465952Smartinh 2467*5d465952Smartinh /* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or 2468*5d465952Smartinh * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and 2469*5d465952Smartinh * *newindxp with the actual values after split, ie if *mpp and *newindxp 2470*5d465952Smartinh * refer to a node in the new right sibling page. 2471*5d465952Smartinh */ 2472*5d465952Smartinh static int 2473*5d465952Smartinh btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp, 2474*5d465952Smartinh struct btval *newkey, struct btval *newdata, pgno_t newpgno) 2475*5d465952Smartinh { 2476*5d465952Smartinh uint8_t flags; 2477*5d465952Smartinh int rc = BT_SUCCESS, ins_new = 0; 2478*5d465952Smartinh indx_t newindx; 2479*5d465952Smartinh pgno_t pgno = 0; 2480*5d465952Smartinh size_t orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff; 2481*5d465952Smartinh unsigned int i, j, split_indx; 2482*5d465952Smartinh struct node *node; 2483*5d465952Smartinh struct mpage *pright, *p, *mp; 2484*5d465952Smartinh struct btval sepkey, rkey, rdata; 2485*5d465952Smartinh struct btkey tmpkey; 2486*5d465952Smartinh struct page copy; 2487*5d465952Smartinh 2488*5d465952Smartinh assert(bt != NULL); 2489*5d465952Smartinh assert(bt->txn != NULL); 2490*5d465952Smartinh 2491*5d465952Smartinh mp = *mpp; 2492*5d465952Smartinh newindx = *newindxp; 2493*5d465952Smartinh 2494*5d465952Smartinh DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i", 2495*5d465952Smartinh IS_LEAF(mp) ? "leaf" : "branch", mp->pgno, 2496*5d465952Smartinh (int)newkey->size, (char *)newkey->data, *newindxp); 2497*5d465952Smartinh DPRINTF("page %u has prefix [%.*s]", mp->pgno, 2498*5d465952Smartinh (int)mp->prefix.len, (char *)mp->prefix.str); 2499*5d465952Smartinh orig_pfx_len = mp->prefix.len; 2500*5d465952Smartinh 2501*5d465952Smartinh if (mp->parent == NULL) { 2502*5d465952Smartinh if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL) 2503*5d465952Smartinh return BT_FAIL; 2504*5d465952Smartinh mp->parent_index = 0; 2505*5d465952Smartinh bt->txn->root = mp->parent->pgno; 2506*5d465952Smartinh DPRINTF("root split! new root = %u", mp->parent->pgno); 2507*5d465952Smartinh bt->meta->depth++; 2508*5d465952Smartinh 2509*5d465952Smartinh /* Add left (implicit) pointer. */ 2510*5d465952Smartinh if (btree_add_node(bt, mp->parent, 0, NULL, NULL, 2511*5d465952Smartinh mp->pgno, 0) != BT_SUCCESS) 2512*5d465952Smartinh return BT_FAIL; 2513*5d465952Smartinh } else { 2514*5d465952Smartinh DPRINTF("parent branch page is %u", mp->parent->pgno); 2515*5d465952Smartinh } 2516*5d465952Smartinh 2517*5d465952Smartinh /* Create a right sibling. */ 2518*5d465952Smartinh if ((pright = btree_new_page(bt, mp->page.flags)) == NULL) 2519*5d465952Smartinh return BT_FAIL; 2520*5d465952Smartinh pright->parent = mp->parent; 2521*5d465952Smartinh pright->parent_index = mp->parent_index + 1; 2522*5d465952Smartinh DPRINTF("new right sibling: page %u", pright->pgno); 2523*5d465952Smartinh 2524*5d465952Smartinh /* Move half of the keys to the right sibling. */ 2525*5d465952Smartinh bcopy(&mp->page, ©, PAGESIZE); 2526*5d465952Smartinh assert(mp->ref == 0); /* XXX */ 2527*5d465952Smartinh bzero(&mp->page.ptrs, PAGESIZE - PAGEHDRSZ); 2528*5d465952Smartinh mp->page.lower = PAGEHDRSZ; 2529*5d465952Smartinh mp->page.upper = PAGESIZE; 2530*5d465952Smartinh 2531*5d465952Smartinh split_indx = NUMKEYSP(©) / 2 + 1; 2532*5d465952Smartinh 2533*5d465952Smartinh /* First find the separating key between the split pages. 2534*5d465952Smartinh */ 2535*5d465952Smartinh bzero(&sepkey, sizeof(sepkey)); 2536*5d465952Smartinh if (newindx == split_indx) { 2537*5d465952Smartinh sepkey.size = newkey->size; 2538*5d465952Smartinh sepkey.data = newkey->data; 2539*5d465952Smartinh remove_prefix(bt, &sepkey, mp->prefix.len); 2540*5d465952Smartinh } else { 2541*5d465952Smartinh node = NODEPTRP(©, split_indx); 2542*5d465952Smartinh sepkey.size = node->ksize; 2543*5d465952Smartinh sepkey.data = NODEKEY(node); 2544*5d465952Smartinh } 2545*5d465952Smartinh 2546*5d465952Smartinh if (IS_LEAF(mp) && bt->cmp == NULL) { 2547*5d465952Smartinh /* Find the smallest separator. */ 2548*5d465952Smartinh /* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */ 2549*5d465952Smartinh node = NODEPTRP(©, split_indx - 1); 2550*5d465952Smartinh bt_reduce_separator(bt, node, &sepkey); 2551*5d465952Smartinh } 2552*5d465952Smartinh 2553*5d465952Smartinh /* Fix separator wrt parent prefix. */ 2554*5d465952Smartinh if (bt->cmp == NULL) { 2555*5d465952Smartinh tmpkey.len = sizeof(tmpkey.str); 2556*5d465952Smartinh concat_prefix(bt, mp->prefix.str, mp->prefix.len, 2557*5d465952Smartinh sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len); 2558*5d465952Smartinh sepkey.data = tmpkey.str; 2559*5d465952Smartinh sepkey.size = tmpkey.len; 2560*5d465952Smartinh } 2561*5d465952Smartinh 2562*5d465952Smartinh DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data); 2563*5d465952Smartinh 2564*5d465952Smartinh /* Copy separator key to the parent. 2565*5d465952Smartinh */ 2566*5d465952Smartinh if (SIZELEFT(pright->parent) < bt_branch_size(&sepkey)) { 2567*5d465952Smartinh rc = btree_split(bt, &pright->parent, &pright->parent_index, 2568*5d465952Smartinh &sepkey, NULL, pright->pgno); 2569*5d465952Smartinh 2570*5d465952Smartinh /* Right page might now have changed parent. 2571*5d465952Smartinh * Check if left page also changed parent. 2572*5d465952Smartinh */ 2573*5d465952Smartinh if (pright->parent != mp->parent && 2574*5d465952Smartinh mp->parent_index >= NUMKEYS(mp->parent)) { 2575*5d465952Smartinh mp->parent = pright->parent; 2576*5d465952Smartinh mp->parent_index = pright->parent_index - 1; 2577*5d465952Smartinh } 2578*5d465952Smartinh } else { 2579*5d465952Smartinh remove_prefix(bt, &sepkey, pright->parent->prefix.len); 2580*5d465952Smartinh rc = btree_add_node(bt, pright->parent, pright->parent_index, 2581*5d465952Smartinh &sepkey, NULL, pright->pgno, 0); 2582*5d465952Smartinh } 2583*5d465952Smartinh if (rc != BT_SUCCESS) 2584*5d465952Smartinh return BT_FAIL; 2585*5d465952Smartinh 2586*5d465952Smartinh /* Update prefix for right and left page, if the parent was split. 2587*5d465952Smartinh */ 2588*5d465952Smartinh find_common_prefix(bt, pright); 2589*5d465952Smartinh assert(orig_pfx_len <= pright->prefix.len); 2590*5d465952Smartinh right_pfx_diff = pright->prefix.len - orig_pfx_len; 2591*5d465952Smartinh 2592*5d465952Smartinh find_common_prefix(bt, mp); 2593*5d465952Smartinh assert(orig_pfx_len <= mp->prefix.len); 2594*5d465952Smartinh left_pfx_diff = mp->prefix.len - orig_pfx_len; 2595*5d465952Smartinh 2596*5d465952Smartinh for (i = j = 0; i <= NUMKEYSP(©); j++) { 2597*5d465952Smartinh if (i < split_indx) { 2598*5d465952Smartinh /* Re-insert in left sibling. */ 2599*5d465952Smartinh p = mp; 2600*5d465952Smartinh pfx_diff = left_pfx_diff; 2601*5d465952Smartinh } else { 2602*5d465952Smartinh /* Insert in right sibling. */ 2603*5d465952Smartinh if (i == split_indx) 2604*5d465952Smartinh /* Reset insert index for right sibling. */ 2605*5d465952Smartinh j = (i == newindx && ins_new); 2606*5d465952Smartinh p = pright; 2607*5d465952Smartinh pfx_diff = right_pfx_diff; 2608*5d465952Smartinh } 2609*5d465952Smartinh 2610*5d465952Smartinh if (i == newindx && !ins_new) { 2611*5d465952Smartinh /* Insert the original entry that caused the split. */ 2612*5d465952Smartinh rkey.data = newkey->data; 2613*5d465952Smartinh rkey.size = newkey->size; 2614*5d465952Smartinh if (IS_LEAF(mp)) { 2615*5d465952Smartinh rdata.data = newdata->data; 2616*5d465952Smartinh rdata.size = newdata->size; 2617*5d465952Smartinh } else 2618*5d465952Smartinh pgno = newpgno; 2619*5d465952Smartinh flags = 0; 2620*5d465952Smartinh pfx_diff = p->prefix.len; 2621*5d465952Smartinh 2622*5d465952Smartinh ins_new = 1; 2623*5d465952Smartinh 2624*5d465952Smartinh /* Update page and index for the new key. */ 2625*5d465952Smartinh *newindxp = j; 2626*5d465952Smartinh *mpp = p; 2627*5d465952Smartinh } else if (i == NUMKEYSP(©)) { 2628*5d465952Smartinh break; 2629*5d465952Smartinh } else { 2630*5d465952Smartinh node = NODEPTRP(©, i); 2631*5d465952Smartinh rkey.data = NODEKEY(node); 2632*5d465952Smartinh rkey.size = node->ksize; 2633*5d465952Smartinh if (IS_LEAF(mp)) { 2634*5d465952Smartinh rdata.data = NODEDATA(node); 2635*5d465952Smartinh rdata.size = node->n_dsize; 2636*5d465952Smartinh } else 2637*5d465952Smartinh pgno = node->n_pgno; 2638*5d465952Smartinh flags = node->flags; 2639*5d465952Smartinh 2640*5d465952Smartinh i++; 2641*5d465952Smartinh } 2642*5d465952Smartinh 2643*5d465952Smartinh if (!IS_LEAF(mp) && j == 0) { 2644*5d465952Smartinh /* First branch index doesn't need key data. */ 2645*5d465952Smartinh rkey.size = 0; 2646*5d465952Smartinh } else 2647*5d465952Smartinh remove_prefix(bt, &rkey, pfx_diff); 2648*5d465952Smartinh 2649*5d465952Smartinh rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags); 2650*5d465952Smartinh if (rc != BT_SUCCESS) 2651*5d465952Smartinh return BT_FAIL; 2652*5d465952Smartinh } 2653*5d465952Smartinh 2654*5d465952Smartinh return rc; 2655*5d465952Smartinh } 2656*5d465952Smartinh 2657*5d465952Smartinh int 2658*5d465952Smartinh btree_txn_put(struct btree *bt, struct btree_txn *txn, 2659*5d465952Smartinh struct btval *key, struct btval *data, unsigned int flags) 2660*5d465952Smartinh { 2661*5d465952Smartinh int rc = BT_SUCCESS, exact, close_txn = 0; 2662*5d465952Smartinh unsigned int ki; 2663*5d465952Smartinh struct node *leaf; 2664*5d465952Smartinh struct mpage *mp; 2665*5d465952Smartinh struct btval xkey; 2666*5d465952Smartinh 2667*5d465952Smartinh assert(bt != NULL); 2668*5d465952Smartinh assert(key != NULL); 2669*5d465952Smartinh assert(data != NULL); 2670*5d465952Smartinh 2671*5d465952Smartinh if (key->size == 0 || key->size > MAXKEYSIZE) { 2672*5d465952Smartinh errno = EINVAL; 2673*5d465952Smartinh return BT_FAIL; 2674*5d465952Smartinh } 2675*5d465952Smartinh 2676*5d465952Smartinh DPRINTF("==> put key %.*s, size %zu, data size %zu", 2677*5d465952Smartinh (int)key->size, (char *)key->data, key->size, data->size); 2678*5d465952Smartinh 2679*5d465952Smartinh if (txn == NULL) { 2680*5d465952Smartinh close_txn = 1; 2681*5d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 2682*5d465952Smartinh return BT_FAIL; 2683*5d465952Smartinh } 2684*5d465952Smartinh 2685*5d465952Smartinh rc = btree_search_page(bt, txn, key, NULL, 1, &mp); 2686*5d465952Smartinh if (rc == BT_SUCCESS) { 2687*5d465952Smartinh leaf = btree_search_node(bt, mp, key, &exact, &ki); 2688*5d465952Smartinh if (leaf && exact) { 2689*5d465952Smartinh if (F_ISSET(flags, BT_NOOVERWRITE)) { 2690*5d465952Smartinh DPRINTF("duplicate key %.*s", 2691*5d465952Smartinh (int)key->size, (char *)key->data); 2692*5d465952Smartinh rc = BT_EXISTS; 2693*5d465952Smartinh goto done; 2694*5d465952Smartinh } 2695*5d465952Smartinh btree_del_node(bt, mp, ki); 2696*5d465952Smartinh } 2697*5d465952Smartinh if (leaf == NULL) { /* append if not found */ 2698*5d465952Smartinh ki = NUMKEYS(mp); 2699*5d465952Smartinh DPRINTF("appending key at index %i", ki); 2700*5d465952Smartinh } 2701*5d465952Smartinh } else if (rc == BT_NOTFOUND) { 2702*5d465952Smartinh /* new file, just write a root leaf page */ 2703*5d465952Smartinh DPRINTF("allocating new root leaf page"); 2704*5d465952Smartinh if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { 2705*5d465952Smartinh rc = BT_FAIL; 2706*5d465952Smartinh goto done; 2707*5d465952Smartinh } 2708*5d465952Smartinh txn->root = mp->pgno; 2709*5d465952Smartinh bt->meta->depth++; 2710*5d465952Smartinh ki = 0; 2711*5d465952Smartinh } 2712*5d465952Smartinh else 2713*5d465952Smartinh goto done; 2714*5d465952Smartinh 2715*5d465952Smartinh assert(IS_LEAF(mp)); 2716*5d465952Smartinh DPRINTF("there are %lu keys, should insert new key at index %i", 2717*5d465952Smartinh NUMKEYS(mp), ki); 2718*5d465952Smartinh 2719*5d465952Smartinh /* Copy the key pointer as it is modified by the prefix code. The 2720*5d465952Smartinh * caller might have malloc'ed the data. 2721*5d465952Smartinh */ 2722*5d465952Smartinh xkey.data = key->data; 2723*5d465952Smartinh xkey.size = key->size; 2724*5d465952Smartinh 2725*5d465952Smartinh if (SIZELEFT(mp) < bt_leaf_size(key, data)) { 2726*5d465952Smartinh rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID); 2727*5d465952Smartinh } else { 2728*5d465952Smartinh /* There is room already in this leaf page. */ 2729*5d465952Smartinh remove_prefix(bt, &xkey, mp->prefix.len); 2730*5d465952Smartinh rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0); 2731*5d465952Smartinh } 2732*5d465952Smartinh 2733*5d465952Smartinh if (rc != BT_SUCCESS) 2734*5d465952Smartinh txn->flags |= BT_TXN_ERROR; 2735*5d465952Smartinh else 2736*5d465952Smartinh bt->meta->entries++; 2737*5d465952Smartinh 2738*5d465952Smartinh done: 2739*5d465952Smartinh if (close_txn) { 2740*5d465952Smartinh if (rc == BT_SUCCESS) 2741*5d465952Smartinh rc = btree_txn_commit(txn); 2742*5d465952Smartinh else 2743*5d465952Smartinh btree_txn_abort(txn); 2744*5d465952Smartinh } 2745*5d465952Smartinh mpage_prune(bt); 2746*5d465952Smartinh return rc; 2747*5d465952Smartinh } 2748*5d465952Smartinh 2749*5d465952Smartinh static pgno_t 2750*5d465952Smartinh btree_compact_tree(struct btree *bt, pgno_t pgno, int fd) 2751*5d465952Smartinh { 2752*5d465952Smartinh indx_t i; 2753*5d465952Smartinh pgno_t *pnext; 2754*5d465952Smartinh struct node *node; 2755*5d465952Smartinh struct page *p; 2756*5d465952Smartinh struct mpage *mp; 2757*5d465952Smartinh char page[PAGESIZE]; 2758*5d465952Smartinh 2759*5d465952Smartinh p = (struct page *)page; 2760*5d465952Smartinh if ((mp = mpage_lookup(bt, pgno)) != NULL) 2761*5d465952Smartinh bcopy(&mp->page, p, PAGESIZE); 2762*5d465952Smartinh else if (btree_read_page(bt, pgno, p) != BT_SUCCESS) 2763*5d465952Smartinh return P_INVALID; 2764*5d465952Smartinh 2765*5d465952Smartinh if (F_ISSET(p->flags, P_BRANCH)) { 2766*5d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) { 2767*5d465952Smartinh node = NODEPTRP(p, i); 2768*5d465952Smartinh node->n_pgno = btree_compact_tree(bt, node->n_pgno, fd); 2769*5d465952Smartinh if (node->n_pgno == P_INVALID) 2770*5d465952Smartinh return P_INVALID; 2771*5d465952Smartinh } 2772*5d465952Smartinh } else if (F_ISSET(p->flags, P_LEAF)) { 2773*5d465952Smartinh for (i = 0; i < NUMKEYSP(p); i++) { 2774*5d465952Smartinh node = NODEPTRP(p, i); 2775*5d465952Smartinh if (F_ISSET(node->flags, F_BIGDATA)) { 2776*5d465952Smartinh pnext = NODEDATA(node); 2777*5d465952Smartinh *pnext = btree_compact_tree(bt, *pnext, fd); 2778*5d465952Smartinh if (*pnext == P_INVALID) 2779*5d465952Smartinh return P_INVALID; 2780*5d465952Smartinh } 2781*5d465952Smartinh } 2782*5d465952Smartinh } else if (F_ISSET(p->flags, P_OVERFLOW)) { 2783*5d465952Smartinh pnext = &p->p_next_pgno; 2784*5d465952Smartinh if (*pnext > 0) { 2785*5d465952Smartinh *pnext = btree_compact_tree(bt, *pnext, fd); 2786*5d465952Smartinh if (*pnext == P_INVALID) 2787*5d465952Smartinh return P_INVALID; 2788*5d465952Smartinh } 2789*5d465952Smartinh } else 2790*5d465952Smartinh assert(0); 2791*5d465952Smartinh 2792*5d465952Smartinh p->pgno = bt->txn->next_pgno++; 2793*5d465952Smartinh if (write(fd, page, PAGESIZE) != PAGESIZE) 2794*5d465952Smartinh return P_INVALID; 2795*5d465952Smartinh return p->pgno; 2796*5d465952Smartinh } 2797*5d465952Smartinh 2798*5d465952Smartinh int 2799*5d465952Smartinh btree_compact(struct btree *bt) 2800*5d465952Smartinh { 2801*5d465952Smartinh int rc, fd, old_fd; 2802*5d465952Smartinh char *compact_path = NULL; 2803*5d465952Smartinh struct btree_txn *txn; 2804*5d465952Smartinh pgno_t root; 2805*5d465952Smartinh 2806*5d465952Smartinh assert(bt != NULL); 2807*5d465952Smartinh 2808*5d465952Smartinh if (bt->path == NULL) 2809*5d465952Smartinh return BT_FAIL; 2810*5d465952Smartinh 2811*5d465952Smartinh if ((rc = btree_read_meta(bt, NULL)) == BT_FAIL) 2812*5d465952Smartinh return BT_FAIL; 2813*5d465952Smartinh else if (rc == BT_NOTFOUND) 2814*5d465952Smartinh return BT_SUCCESS; 2815*5d465952Smartinh 2816*5d465952Smartinh asprintf(&compact_path, "%s.compact.XXXXXX", bt->path); 2817*5d465952Smartinh fd = mkstemp(compact_path); 2818*5d465952Smartinh if (fd == -1) { 2819*5d465952Smartinh free(compact_path); 2820*5d465952Smartinh return BT_FAIL; 2821*5d465952Smartinh } 2822*5d465952Smartinh 2823*5d465952Smartinh old_fd = bt->fd; 2824*5d465952Smartinh 2825*5d465952Smartinh if ((txn = btree_txn_begin(bt, 0)) == NULL) 2826*5d465952Smartinh goto failed; 2827*5d465952Smartinh 2828*5d465952Smartinh bt->txn->next_pgno = 0; 2829*5d465952Smartinh root = btree_compact_tree(bt, bt->meta->root, fd); 2830*5d465952Smartinh if (root == P_INVALID) 2831*5d465952Smartinh goto failed; 2832*5d465952Smartinh bt->fd = fd; 2833*5d465952Smartinh bt->meta->revisions = 0; 2834*5d465952Smartinh if (btree_write_meta(bt, root) != BT_SUCCESS) 2835*5d465952Smartinh goto failed; 2836*5d465952Smartinh 2837*5d465952Smartinh fsync(fd); 2838*5d465952Smartinh 2839*5d465952Smartinh DPRINTF("renaming %s to %s", compact_path, bt->path); 2840*5d465952Smartinh if (rename(compact_path, bt->path) != 0) 2841*5d465952Smartinh goto failed; 2842*5d465952Smartinh 2843*5d465952Smartinh // XXX: write a "reopen me" meta page for other processes to see 2844*5d465952Smartinh btree_txn_abort(txn); 2845*5d465952Smartinh close(old_fd); 2846*5d465952Smartinh 2847*5d465952Smartinh free(compact_path); 2848*5d465952Smartinh return BT_SUCCESS; 2849*5d465952Smartinh 2850*5d465952Smartinh failed: 2851*5d465952Smartinh bt->fd = old_fd; 2852*5d465952Smartinh btree_txn_abort(txn); 2853*5d465952Smartinh unlink(compact_path); 2854*5d465952Smartinh free(compact_path); 2855*5d465952Smartinh return BT_FAIL; 2856*5d465952Smartinh } 2857*5d465952Smartinh 2858*5d465952Smartinh /* Reverts the last change. Truncates the file at the last root page. 2859*5d465952Smartinh */ 2860*5d465952Smartinh int 2861*5d465952Smartinh btree_revert(struct btree *bt) 2862*5d465952Smartinh { 2863*5d465952Smartinh if (btree_read_meta(bt, NULL) == BT_FAIL) 2864*5d465952Smartinh return BT_FAIL; 2865*5d465952Smartinh 2866*5d465952Smartinh if (bt->meta == NULL) 2867*5d465952Smartinh return BT_SUCCESS; 2868*5d465952Smartinh 2869*5d465952Smartinh DPRINTF("truncating file at page %u", bt->meta->root); 2870*5d465952Smartinh return ftruncate(bt->fd, PAGESIZE * bt->meta->root); 2871*5d465952Smartinh } 2872*5d465952Smartinh 2873*5d465952Smartinh void 2874*5d465952Smartinh btree_set_cache_size(struct btree *bt, unsigned int cache_size) 2875*5d465952Smartinh { 2876*5d465952Smartinh bt->stat.max_cache = cache_size; 2877*5d465952Smartinh } 2878*5d465952Smartinh 2879*5d465952Smartinh unsigned int 2880*5d465952Smartinh btree_get_flags(struct btree *bt) 2881*5d465952Smartinh { 2882*5d465952Smartinh return (bt->flags & ~BT_FIXPADDING); 2883*5d465952Smartinh } 2884*5d465952Smartinh 2885*5d465952Smartinh const char * 2886*5d465952Smartinh btree_get_path(struct btree *bt) 2887*5d465952Smartinh { 2888*5d465952Smartinh return bt->path; 2889*5d465952Smartinh } 2890*5d465952Smartinh 2891*5d465952Smartinh const struct btree_stat * 2892*5d465952Smartinh btree_stat(struct btree *bt) 2893*5d465952Smartinh { 2894*5d465952Smartinh bt->stat.branch_pages = bt->meta->branch_pages; 2895*5d465952Smartinh bt->stat.leaf_pages = bt->meta->leaf_pages; 2896*5d465952Smartinh bt->stat.overflow_pages = bt->meta->overflow_pages; 2897*5d465952Smartinh bt->stat.revisions = bt->meta->revisions; 2898*5d465952Smartinh bt->stat.depth = bt->meta->depth; 2899*5d465952Smartinh bt->stat.entries = bt->meta->entries; 2900*5d465952Smartinh bt->stat.psize = bt->meta->psize; 2901*5d465952Smartinh bt->stat.created_at = bt->meta->created_at; 2902*5d465952Smartinh 2903*5d465952Smartinh return &bt->stat; 2904*5d465952Smartinh } 2905*5d465952Smartinh 2906