1 /*- 2 * Copyright (c) 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)bt_seq.c 8.7 (Berkeley) 7/20/94 33 * $DragonFly: src/lib/libc/db/btree/bt_seq.c,v 1.7 2005/11/12 23:01:54 swildner Exp $ 34 */ 35 36 #include <sys/types.h> 37 38 #include <errno.h> 39 #include <stddef.h> 40 #include <stdio.h> 41 #include <stdlib.h> 42 43 #include <db.h> 44 #include "btree.h" 45 46 static int __bt_first (BTREE *, const DBT *, EPG *, int *); 47 static int __bt_seqadv (BTREE *, EPG *, int); 48 static int __bt_seqset (BTREE *, EPG *, DBT *, int); 49 50 /* 51 * Sequential scan support. 52 * 53 * The tree can be scanned sequentially, starting from either end of the 54 * tree or from any specific key. A scan request before any scanning is 55 * done is initialized as starting from the least node. 56 */ 57 58 /* 59 * __bt_seq -- 60 * Btree sequential scan interface. 61 * 62 * Parameters: 63 * dbp: pointer to access method 64 * key: key for positioning and return value 65 * data: data return value 66 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. 67 * 68 * Returns: 69 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 70 */ 71 int 72 __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags) 73 { 74 BTREE *t; 75 EPG e; 76 int status; 77 78 t = dbp->internal; 79 80 /* Toss any page pinned across calls. */ 81 if (t->bt_pinned != NULL) { 82 mpool_put(t->bt_mp, t->bt_pinned, 0); 83 t->bt_pinned = NULL; 84 } 85 86 /* 87 * If scan unitialized as yet, or starting at a specific record, set 88 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin 89 * the page the cursor references if they're successful. 90 */ 91 switch (flags) { 92 case R_NEXT: 93 case R_PREV: 94 if (F_ISSET(&t->bt_cursor, CURS_INIT)) { 95 status = __bt_seqadv(t, &e, flags); 96 break; 97 } 98 /* FALLTHROUGH */ 99 case R_FIRST: 100 case R_LAST: 101 case R_CURSOR: 102 status = __bt_seqset(t, &e, key, flags); 103 break; 104 default: 105 errno = EINVAL; 106 return (RET_ERROR); 107 } 108 109 if (status == RET_SUCCESS) { 110 __bt_setcur(t, e.page->pgno, e.index); 111 112 status = 113 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 114 115 /* 116 * If the user is doing concurrent access, we copied the 117 * key/data, toss the page. 118 */ 119 if (F_ISSET(t, B_DB_LOCK)) 120 mpool_put(t->bt_mp, e.page, 0); 121 else 122 t->bt_pinned = e.page; 123 } 124 return (status); 125 } 126 127 /* 128 * __bt_seqset -- 129 * Set the sequential scan to a specific key. 130 * 131 * Parameters: 132 * t: tree 133 * ep: storage for returned key 134 * key: key for initial scan position 135 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 136 * 137 * Side effects: 138 * Pins the page the cursor references. 139 * 140 * Returns: 141 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 142 */ 143 static int 144 __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags) 145 { 146 PAGE *h; 147 pgno_t pg; 148 int exact; 149 150 /* 151 * Find the first, last or specific key in the tree and point the 152 * cursor at it. The cursor may not be moved until a new key has 153 * been found. 154 */ 155 switch (flags) { 156 case R_CURSOR: /* Keyed scan. */ 157 /* 158 * Find the first instance of the key or the smallest key 159 * which is greater than or equal to the specified key. 160 */ 161 if (key->data == NULL || key->size == 0) { 162 errno = EINVAL; 163 return (RET_ERROR); 164 } 165 return (__bt_first(t, key, ep, &exact)); 166 case R_FIRST: /* First record. */ 167 case R_NEXT: 168 /* Walk down the left-hand side of the tree. */ 169 for (pg = P_ROOT;;) { 170 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 171 return (RET_ERROR); 172 173 /* Check for an empty tree. */ 174 if (NEXTINDEX(h) == 0) { 175 mpool_put(t->bt_mp, h, 0); 176 return (RET_SPECIAL); 177 } 178 179 if (h->flags & (P_BLEAF | P_RLEAF)) 180 break; 181 pg = GETBINTERNAL(h, 0)->pgno; 182 mpool_put(t->bt_mp, h, 0); 183 } 184 ep->page = h; 185 ep->index = 0; 186 break; 187 case R_LAST: /* Last record. */ 188 case R_PREV: 189 /* Walk down the right-hand side of the tree. */ 190 for (pg = P_ROOT;;) { 191 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 192 return (RET_ERROR); 193 194 /* Check for an empty tree. */ 195 if (NEXTINDEX(h) == 0) { 196 mpool_put(t->bt_mp, h, 0); 197 return (RET_SPECIAL); 198 } 199 200 if (h->flags & (P_BLEAF | P_RLEAF)) 201 break; 202 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 203 mpool_put(t->bt_mp, h, 0); 204 } 205 206 ep->page = h; 207 ep->index = NEXTINDEX(h) - 1; 208 break; 209 } 210 return (RET_SUCCESS); 211 } 212 213 /* 214 * __bt_seqadvance -- 215 * Advance the sequential scan. 216 * 217 * Parameters: 218 * t: tree 219 * flags: R_NEXT, R_PREV 220 * 221 * Side effects: 222 * Pins the page the new key/data record is on. 223 * 224 * Returns: 225 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 226 */ 227 static int 228 __bt_seqadv(BTREE *t, EPG *ep, int flags) 229 { 230 CURSOR *c; 231 PAGE *h; 232 indx_t idx; 233 pgno_t pg; 234 int exact; 235 236 /* 237 * There are a couple of states that we can be in. The cursor has 238 * been initialized by the time we get here, but that's all we know. 239 */ 240 c = &t->bt_cursor; 241 242 /* 243 * The cursor was deleted where there weren't any duplicate records, 244 * so the key was saved. Find out where that key would go in the 245 * current tree. It doesn't matter if the returned key is an exact 246 * match or not -- if it's an exact match, the record was added after 247 * the delete so we can just return it. If not, as long as there's 248 * a record there, return it. 249 */ 250 if (F_ISSET(c, CURS_ACQUIRE)) 251 return (__bt_first(t, &c->key, ep, &exact)); 252 253 /* Get the page referenced by the cursor. */ 254 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 255 return (RET_ERROR); 256 257 /* 258 * Find the next/previous record in the tree and point the cursor at 259 * it. The cursor may not be moved until a new key has been found. 260 */ 261 switch (flags) { 262 case R_NEXT: /* Next record. */ 263 /* 264 * The cursor was deleted in duplicate records, and moved 265 * forward to a record that has yet to be returned. Clear 266 * that flag, and return the record. 267 */ 268 if (F_ISSET(c, CURS_AFTER)) 269 goto usecurrent; 270 idx = c->pg.index; 271 if (++idx == NEXTINDEX(h)) { 272 pg = h->nextpg; 273 mpool_put(t->bt_mp, h, 0); 274 if (pg == P_INVALID) 275 return (RET_SPECIAL); 276 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 277 return (RET_ERROR); 278 idx = 0; 279 } 280 break; 281 case R_PREV: /* Previous record. */ 282 /* 283 * The cursor was deleted in duplicate records, and moved 284 * backward to a record that has yet to be returned. Clear 285 * that flag, and return the record. 286 */ 287 if (F_ISSET(c, CURS_BEFORE)) { 288 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); 289 ep->page = h; 290 ep->index = c->pg.index; 291 return (RET_SUCCESS); 292 } 293 idx = c->pg.index; 294 if (idx == 0) { 295 pg = h->prevpg; 296 mpool_put(t->bt_mp, h, 0); 297 if (pg == P_INVALID) 298 return (RET_SPECIAL); 299 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 300 return (RET_ERROR); 301 idx = NEXTINDEX(h) - 1; 302 } else 303 --idx; 304 break; 305 default: 306 return (RET_ERROR); 307 } 308 309 ep->page = h; 310 ep->index = idx; 311 return (RET_SUCCESS); 312 } 313 314 /* 315 * __bt_first -- 316 * Find the first entry. 317 * 318 * Parameters: 319 * t: the tree 320 * key: the key 321 * erval: return EPG 322 * exactp: pointer to exact match flag 323 * 324 * Returns: 325 * The first entry in the tree greater than or equal to key, 326 * or RET_SPECIAL if no such key exists. 327 */ 328 static int 329 __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp) 330 { 331 PAGE *h; 332 EPG *ep, save; 333 pgno_t pg; 334 335 /* 336 * Find any matching record; __bt_search pins the page. 337 * 338 * If it's an exact match and duplicates are possible, walk backwards 339 * in the tree until we find the first one. Otherwise, make sure it's 340 * a valid key (__bt_search may return an index just past the end of a 341 * page) and return it. 342 */ 343 if ((ep = __bt_search(t, key, exactp)) == NULL) 344 return (0); 345 if (*exactp) { 346 if (F_ISSET(t, B_NODUPS)) { 347 *erval = *ep; 348 return (RET_SUCCESS); 349 } 350 351 /* 352 * Walk backwards, as long as the entry matches and there are 353 * keys left in the tree. Save a copy of each match in case 354 * we go too far. 355 */ 356 save = *ep; 357 h = ep->page; 358 do { 359 if (save.page->pgno != ep->page->pgno) { 360 mpool_put(t->bt_mp, save.page, 0); 361 save = *ep; 362 } else 363 save.index = ep->index; 364 365 /* 366 * Don't unpin the page the last (or original) match 367 * was on, but make sure it's unpinned if an error 368 * occurs. 369 */ 370 if (ep->index == 0) { 371 if (h->prevpg == P_INVALID) 372 break; 373 if (h->pgno != save.page->pgno) 374 mpool_put(t->bt_mp, h, 0); 375 if ((h = mpool_get(t->bt_mp, 376 h->prevpg, 0)) == NULL) { 377 if (h->pgno == save.page->pgno) 378 mpool_put(t->bt_mp, 379 save.page, 0); 380 return (RET_ERROR); 381 } 382 ep->page = h; 383 ep->index = NEXTINDEX(h); 384 } 385 --ep->index; 386 } while (__bt_cmp(t, key, ep) == 0); 387 388 /* 389 * Reach here with the last page that was looked at pinned, 390 * which may or may not be the same as the last (or original) 391 * match page. If it's not useful, release it. 392 */ 393 if (h->pgno != save.page->pgno) 394 mpool_put(t->bt_mp, h, 0); 395 396 *erval = save; 397 return (RET_SUCCESS); 398 } 399 400 /* If at the end of a page, find the next entry. */ 401 if (ep->index == NEXTINDEX(ep->page)) { 402 h = ep->page; 403 pg = h->nextpg; 404 mpool_put(t->bt_mp, h, 0); 405 if (pg == P_INVALID) 406 return (RET_SPECIAL); 407 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 408 return (RET_ERROR); 409 ep->index = 0; 410 ep->page = h; 411 } 412 *erval = *ep; 413 return (RET_SUCCESS); 414 } 415 416 /* 417 * __bt_setcur -- 418 * Set the cursor to an entry in the tree. 419 * 420 * Parameters: 421 * t: the tree 422 * pgno: page number 423 * index: page index 424 */ 425 void 426 __bt_setcur(BTREE *t, pgno_t pgno, u_int index) 427 { 428 /* Lose any already deleted key. */ 429 if (t->bt_cursor.key.data != NULL) { 430 free(t->bt_cursor.key.data); 431 t->bt_cursor.key.size = 0; 432 t->bt_cursor.key.data = NULL; 433 } 434 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 435 436 /* Update the cursor. */ 437 t->bt_cursor.pg.pgno = pgno; 438 t->bt_cursor.pg.index = index; 439 F_SET(&t->bt_cursor, CURS_INIT); 440 } 441