1 /*- 2 * Copyright (c) 1990 The Regents of the University of California. 3 * All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * %sccs.include.redist.c% 9 */ 10 11 #if defined(LIBC_SCCS) && !defined(lint) 12 static char sccsid[] = "@(#)bt_get.c 5.2 (Berkeley) 02/22/91"; 13 #endif /* LIBC_SCCS and not lint */ 14 15 #include <sys/param.h> 16 #include <db.h> 17 #include <errno.h> 18 #include <stdlib.h> 19 #include <string.h> 20 #include <unistd.h> 21 #include "btree.h" 22 23 /* 24 * BT_GETPAGE -- Make pgno the current page of the btree. 25 * 26 * This routine is just a wrapper that decides whether to call the 27 * memory or disk-based routine to do the work. 28 * 29 * Parameters: 30 * t -- btree in which to get page 31 * pgno -- page number to get 32 * 33 * Returns: 34 * RET_SUCCESS or RET_ERROR. 35 */ 36 37 int 38 _bt_getpage(t, pgno) 39 BTREE_P t; 40 pgno_t pgno; 41 { 42 #ifdef DEBUG 43 if (pgno == P_NONE) 44 _punt(); 45 #endif /* DEBUG */ 46 47 /* see if we can get away without doing any work */ 48 if (t->bt_curpage != (BTHEADER *) NULL) { 49 if (t->bt_curpage->h_pgno == pgno) 50 return (RET_SUCCESS); 51 } 52 53 if (t->bt_fname == (char *) NULL) 54 return (_bt_getmpage(t, pgno)); 55 else 56 return (_bt_getdpage(t, pgno)); 57 } 58 59 /* 60 * _BT_GETMPAGE -- Make pgno the current page of the btree. 61 * 62 * This routine gets pages for in-memory btrees. 63 * 64 * Parameters: 65 * t -- btree in which to get page 66 * pgno -- page number to get 67 * 68 * Returns: 69 * RET_SUCCESS or RET_ERROR. 70 */ 71 72 int 73 _bt_getmpage(t, pgno) 74 register BTREE_P t; 75 pgno_t pgno; 76 { 77 int htindex; 78 BTHEADER *h; 79 HTBUCKET *b; 80 81 if (t->bt_curpage == (BTHEADER *) NULL) { 82 if (pgno != P_ROOT) { 83 errno = EBADF; 84 return (RET_ERROR); 85 } 86 87 t->bt_npages++; 88 h = (BTHEADER *) malloc((unsigned) t->bt_psize); 89 if (h == (BTHEADER *) NULL) 90 return (RET_ERROR); 91 92 h->h_pgno = P_ROOT; 93 h->h_flags = F_LEAF; 94 h->h_lower = (index_t) 95 (((char *) &(h->h_linp[0])) - ((char *) h)); 96 h->h_upper = t->bt_psize; 97 h->h_prevpg = h->h_nextpg = P_NONE; 98 99 t->bt_curpage = h; 100 101 /* get the root page into the hash table */ 102 if (_bt_write(t, h, RELEASE) == RET_ERROR) 103 return (RET_ERROR); 104 } 105 106 htindex = HASHKEY(pgno); 107 108 for (b = t->bt_s.bt_ht[htindex]; 109 b != (HTBUCKET *) NULL; 110 b = b->ht_next) { 111 if (b->ht_pgno == pgno) { 112 t->bt_curpage = b->ht_page; 113 return (RET_SUCCESS); 114 } 115 } 116 return (RET_ERROR); 117 } 118 119 /* 120 * _BT_GETDPAGE -- Make pgno the current page of the btree. 121 * 122 * This routine gets pages for disk btrees. 123 * 124 * Because disk btree pages must be readable across machine architectures, 125 * the btree code writes integers out in network format. This routine 126 * converts them back to host format before returning the page. 127 * 128 * Parameters: 129 * t -- btree in which to get page 130 * pgno -- page number to get 131 * 132 * Returns: 133 * RET_SUCCESS, RET_ERROR. 134 */ 135 136 int 137 _bt_getdpage(t, pgno) 138 register BTREE_P t; 139 pgno_t pgno; 140 { 141 BTHEADER *h; 142 char *cache; 143 long pos; 144 int n, nbytes; 145 146 /* if we have an lru cache, let the cache code do the work */ 147 if (ISCACHE(t)) { 148 cache = t->bt_s.bt_d.d_cache; 149 150 /* release the old page */ 151 if (t->bt_curpage != (BTHEADER *) NULL) { 152 pgno_t opgno = t->bt_curpage->h_pgno; 153 t->bt_curpage->h_flags &= ~F_DIRTY; 154 155 if (lruwrite(cache, (int) opgno) < 0) 156 return (RET_ERROR); 157 158 if (lrurelease(cache, (int) opgno) < 0) 159 return (RET_ERROR); 160 } 161 162 if (pgno > t->bt_npages) { 163 if ((h = (BTHEADER *) lrugetnew(cache, (int)pgno, &nbytes)) 164 == (BTHEADER *) NULL) 165 return (RET_ERROR); 166 t->bt_npages = pgno; 167 } else { 168 if ((h = (BTHEADER *) lruget(cache, (int)pgno, &nbytes)) 169 == (BTHEADER *) NULL) 170 return (RET_ERROR); 171 } 172 173 /* init this page, if necessary */ 174 if (nbytes == 0) { 175 h->h_pgno = pgno; 176 h->h_flags = F_LEAF; 177 h->h_lower = (index_t) 178 (((char *) &(h->h_linp[0])) - ((char *) h)); 179 h->h_upper = t->bt_psize; 180 h->h_prevpg = h->h_nextpg = P_NONE; 181 } 182 183 t->bt_curpage = h; 184 185 return (RET_SUCCESS); 186 } 187 188 /* sync the current page, if necessary */ 189 if (t->bt_curpage != (BTHEADER *) NULL) { 190 if (t->bt_curpage->h_flags & F_DIRTY) 191 if (_bt_write(t, t->bt_curpage, RELEASE) == RET_ERROR) 192 return (RET_ERROR); 193 } else { 194 if (t->bt_npages == 0) 195 t->bt_npages = 1; 196 197 /* if no current page, get space for one */ 198 if ((t->bt_curpage = (BTHEADER *) malloc((unsigned) t->bt_psize)) 199 == (BTHEADER *) NULL) { 200 return (RET_ERROR); 201 } 202 } 203 204 n = t->bt_psize; 205 pos = (long) (pgno * n); 206 207 /* seek to correct location in file */ 208 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) { 209 return (RET_ERROR); 210 } 211 212 /* read the page */ 213 if ((nbytes = read(t->bt_s.bt_d.d_fd, t->bt_curpage, n)) < n) { 214 215 /* 216 * If we didn't get a full page, we must have gotten no 217 * data at all -- in which case we're trying to read a 218 * root page that doesn't exist yet. This is the only 219 * case in which this is okay. If this happens, construct 220 * an empty root page by hand. 221 */ 222 if (nbytes != 0 || pgno != P_ROOT) { 223 errno = EBADF; 224 return (RET_ERROR); 225 } 226 227 h = (BTHEADER *) t->bt_curpage; 228 h->h_pgno = pgno; 229 h->h_flags = F_LEAF; 230 h->h_lower = (index_t) 231 (((char *) &(h->h_linp[0])) - ((char *) h)); 232 h->h_upper = t->bt_psize; 233 h->h_prevpg = h->h_nextpg = P_NONE; 234 } else 235 (void) _bt_pgin(t->bt_curpage, (char *) t->bt_lorder); 236 237 return (RET_SUCCESS); 238 } 239 240 /* 241 * _BT_PGOUT, _BT_PGIN -- Convert host-specific number layout to/from 242 * the host-independent format stored on disk. 243 * 244 * Parameters: 245 * h -- page to convert 246 * _lorder -- byte order for pages (stored as a char * in the 247 * cache, and passed around as a magic cookie). 248 * 249 * Returns: 250 * RET_SUCCESS (lru code requires a return value). 251 * 252 * Side Effects: 253 * Layout of tree metadata on the page is changed in place. 254 * 255 * Warnings: 256 * Everywhere else in the code, the types pgno_t and index_t 257 * are opaque. These two routines know what they really 258 * are. 259 */ 260 261 int 262 _bt_pgout(h, _lorder) 263 BTHEADER *h; 264 char *_lorder; 265 { 266 int i; 267 int top; 268 int lorder; 269 DATUM *d; 270 IDATUM *id; 271 size_t chain; 272 273 lorder = (int) _lorder; 274 if (lorder == BYTE_ORDER) 275 return (RET_SUCCESS); 276 277 if (h->h_flags & F_LEAF) { 278 if (h->h_flags & F_CONT) { 279 if (h->h_prevpg == P_NONE) { 280 size_t longsz; 281 282 (void) bcopy((char *) &(h->h_linp[0]), 283 (char *) &longsz, 284 sizeof(longsz)); 285 BLSWAP(longsz); 286 (void) bcopy((char *) &longsz, 287 (char *) &(h->h_linp[0]), 288 sizeof(longsz)); 289 } 290 } else { 291 top = NEXTINDEX(h); 292 for (i = 0; i < top; i++) { 293 d = (DATUM *) GETDATUM(h, i); 294 if (d->d_flags & D_BIGKEY) { 295 (void) bcopy((char *) &(d->d_bytes[0]), 296 (char *) &chain, 297 sizeof(chain)); 298 BLSWAP(chain); 299 (void) bcopy((char *) &chain, 300 (char *) &(d->d_bytes[0]), 301 sizeof(chain)); 302 } 303 if (d->d_flags & D_BIGDATA) { 304 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 305 (char *) &chain, 306 sizeof(chain)); 307 BLSWAP(chain); 308 (void) bcopy((char *) &chain, 309 (char *) &(d->d_bytes[d->d_ksize]), 310 sizeof(chain)); 311 } 312 BLSWAP(d->d_dsize); 313 BLSWAP(d->d_ksize); 314 BLSWAP(d->d_flags); 315 BLSWAP(h->h_linp[i]); 316 } 317 } 318 } else { 319 top = NEXTINDEX(h); 320 for (i = 0; i < top; i++) { 321 id = (IDATUM *) GETDATUM(h, i); 322 BLSWAP(id->i_size); 323 BLSWAP(id->i_pgno); 324 BLSWAP(id->i_flags); 325 if (id->i_flags & D_BIGKEY) { 326 (void) bcopy((char *) &(id->i_bytes[0]), 327 (char *) &chain, 328 sizeof(chain)); 329 BLSWAP(chain); 330 (void) bcopy((char *) &chain, 331 (char *) &(id->i_bytes[0]), 332 sizeof(chain)); 333 } 334 BLSWAP(h->h_linp[i]); 335 } 336 } 337 BLSWAP(h->h_flags); 338 BLSWAP(h->h_pgno); 339 BLSWAP(h->h_prevpg); 340 BLSWAP(h->h_nextpg); 341 BLSWAP(h->h_lower); 342 BLSWAP(h->h_upper); 343 344 return (RET_SUCCESS); 345 } 346 347 int 348 _bt_pgin(h, _lorder) 349 BTHEADER *h; 350 char *_lorder; 351 { 352 int i; 353 int top; 354 int lorder; 355 DATUM *d; 356 IDATUM *id; 357 size_t chain; 358 359 /* 360 * If btree pages are to be stored in the host byte order, don't 361 * bother swapping. 362 */ 363 lorder = (int) _lorder; 364 if (lorder == BYTE_ORDER) 365 return (RET_SUCCESS); 366 367 BLSWAP(h->h_upper); 368 BLSWAP(h->h_lower); 369 BLSWAP(h->h_nextpg); 370 BLSWAP(h->h_prevpg); 371 BLSWAP(h->h_pgno); 372 BLSWAP(h->h_flags); 373 374 if (h->h_flags & F_LEAF) { 375 if (h->h_flags & F_CONT) { 376 if (h->h_prevpg == P_NONE) { 377 size_t longsz; 378 379 (void) bcopy((char *) &(h->h_linp[0]), 380 (char *) &longsz, 381 sizeof(longsz)); 382 BLSWAP(longsz); 383 (void) bcopy((char *) &longsz, 384 (char *) &(h->h_linp[0]), 385 sizeof(longsz)); 386 } 387 } else { 388 top = NEXTINDEX(h); 389 for (i = 0; i < top; i++) { 390 BLSWAP(h->h_linp[i]); 391 d = (DATUM *) GETDATUM(h, i); 392 BLSWAP(d->d_dsize); 393 BLSWAP(d->d_ksize); 394 BLSWAP(d->d_flags); 395 if (d->d_flags & D_BIGKEY) { 396 (void) bcopy((char *) &(d->d_bytes[0]), 397 (char *) &chain, 398 sizeof(chain)); 399 BLSWAP(chain); 400 (void) bcopy((char *) &chain, 401 (char *) &(d->d_bytes[0]), 402 sizeof(chain)); 403 } 404 if (d->d_flags & D_BIGDATA) { 405 (void) bcopy((char *) &(d->d_bytes[d->d_ksize]), 406 (char *) &chain, 407 sizeof(chain)); 408 BLSWAP(chain); 409 (void) bcopy((char *) &chain, 410 (char *) &(d->d_bytes[d->d_ksize]), 411 sizeof(chain)); 412 } 413 } 414 } 415 } else { 416 top = NEXTINDEX(h); 417 for (i = 0; i < top; i++) { 418 BLSWAP(h->h_linp[i]); 419 id = (IDATUM *) GETDATUM(h, i); 420 BLSWAP(id->i_size); 421 BLSWAP(id->i_pgno); 422 BLSWAP(id->i_flags); 423 if (id->i_flags & D_BIGKEY) { 424 (void) bcopy((char *) &(id->i_bytes[0]), 425 (char *) &chain, 426 sizeof(chain)); 427 BLSWAP(chain); 428 (void) bcopy((char *) &chain, 429 (char *) &(id->i_bytes[0]), 430 sizeof(chain)); 431 } 432 } 433 } 434 return (RET_SUCCESS); 435 } 436 437 /* 438 * _BT_ALLOCPG -- allocate a new page in the btree. 439 * 440 * This is called when we split a page, to get space to do the split. 441 * For disk btrees, these pages are released when the split is done. 442 * For memory btrees, they are not. 443 * 444 * Parameters: 445 * t -- tree in which to do split 446 * 447 * Returns: 448 * Pointer to the newly-allocated page 449 */ 450 451 BTHEADER * 452 _bt_allocpg(t) 453 BTREE_P t; 454 { 455 BTHEADER *h = t->bt_curpage; 456 BTHEADER *nh; 457 int nbytes; 458 459 /* if we have a cache, let the cache code do the work */ 460 if (ISDISK(t) && ISCACHE(t)) { 461 nh = (BTHEADER *) lrugetnew(t->bt_s.bt_d.d_cache, 462 (int) (t->bt_npages + 1), 463 &nbytes); 464 } else { 465 nh = (BTHEADER *) malloc((unsigned) t->bt_psize); 466 } 467 468 if (nh != (BTHEADER *) NULL) { 469 nh->h_pgno = nh->h_prevpg = nh->h_nextpg = P_NONE; 470 nh->h_flags = h->h_flags; 471 nh->h_lower = (index_t) 472 (((char *) &(nh->h_linp[0])) - ((char *) nh)); 473 nh->h_upper = t->bt_psize; 474 } 475 476 return (nh); 477 } 478 479 /* 480 * _BT_WRITE -- Write a specific page to a btree file. 481 * 482 * Parameters: 483 * t -- btree to get the page 484 * h -- page to write 485 * relflag -- (int) this page may/may not be released 486 * 487 * Returns: 488 * RET_SUCCESS, RET_ERROR. 489 * 490 * Side Effects: 491 * Writes a metadata page if none has been written yet. 492 */ 493 494 int 495 _bt_write(t, h, relflag) 496 BTREE_P t; 497 BTHEADER *h; 498 int relflag; 499 { 500 long pos; 501 int htindex; 502 HTBUCKET *b; 503 char *cache; 504 pgno_t pgno; 505 506 h->h_flags &= ~F_DIRTY; 507 if (ISDISK(t)) { 508 509 /* if we haven't done so yet, write the metadata */ 510 if (!(t->bt_flags & BTF_METAOK)) { 511 if (_bt_wrtmeta(t) == RET_ERROR) 512 return (RET_ERROR); 513 } 514 515 pgno = h->h_pgno; 516 517 518 /* if we have a cache, let the cache code do the work */ 519 if ((cache = t->bt_s.bt_d.d_cache) != (char *) NULL) { 520 if (lruwrite(cache, (int) pgno) < 0) 521 return (RET_ERROR); 522 if (relflag && lrurelease(cache, (int) pgno) < 0) 523 return (RET_ERROR); 524 525 } else { 526 (void) _bt_pgout(h, (char *) t->bt_lorder); 527 /* now write the current page */ 528 pos = (long) (pgno * t->bt_psize); 529 if (lseek(t->bt_s.bt_d.d_fd, pos, L_SET) != pos) 530 return (RET_ERROR); 531 if (write(t->bt_s.bt_d.d_fd, (char *) h, (int) t->bt_psize) 532 < t->bt_psize) 533 return (RET_ERROR); 534 if (!relflag) 535 (void) _bt_pgin(h, (char *) t->bt_lorder); 536 } 537 } else { 538 /* in-memory btree */ 539 htindex = HASHKEY(h->h_pgno); 540 541 /* see if we need to overwrite existing entry */ 542 for (b = t->bt_s.bt_ht[htindex]; 543 b != (HTBUCKET *) NULL; 544 b = b->ht_next) { 545 if (b->ht_pgno == h->h_pgno) { 546 b->ht_page = h; 547 return (RET_SUCCESS); 548 } 549 } 550 551 /* new entry, write it */ 552 b = (HTBUCKET *) malloc((unsigned) sizeof (HTBUCKET)); 553 if (b == (HTBUCKET *) NULL) 554 return (RET_ERROR); 555 556 b->ht_pgno = h->h_pgno; 557 b->ht_page = h; 558 b->ht_next = t->bt_s.bt_ht[htindex]; 559 t->bt_s.bt_ht[htindex] = b; 560 } 561 return (RET_SUCCESS); 562 } 563 564 /* 565 * _BT_RELEASE -- Release a locked-down cache page 566 * 567 * During page splits, we want to force pages out to the cache 568 * before we actually release them, in some cases. This routine 569 * releases such a page when it is no longer needed. 570 * 571 * Parameters: 572 * t -- btree in which to release page 573 * h -- page to release 574 * 575 * Returns: 576 * RET_SUCCESS, RET_ERROR. 577 * 578 * Side Effects: 579 * None. 580 */ 581 582 int 583 _bt_release(t, h) 584 BTREE_P t; 585 BTHEADER *h; 586 { 587 if (ISDISK(t) && ISCACHE(t)) { 588 if (lrurelease(t->bt_s.bt_d.d_cache, (int) (h->h_pgno)) < 0) 589 return (RET_ERROR); 590 } 591 return (RET_SUCCESS); 592 } 593 594 /* 595 * _BT_WRTMETA -- Write metadata to the btree. 596 * 597 * Parameters: 598 * t -- tree to which to write 599 * 600 * Returns: 601 * RET_SUCCESS, RET_ERROR. 602 */ 603 604 int 605 _bt_wrtmeta(t) 606 BTREE_P t; 607 { 608 BTMETA m; 609 610 if (lseek(t->bt_s.bt_d.d_fd, 0l, L_SET) != 0l) 611 return (RET_ERROR); 612 613 /* lorder has to be in host-independent format */ 614 m.m_lorder = (u_long) htonl((long) t->bt_lorder); 615 616 m.m_magic = BTREEMAGIC; 617 m.m_version = BTREEVERSION; 618 m.m_psize = t->bt_psize; 619 m.m_free = t->bt_free; 620 m.m_flags = t->bt_flags & BTF_NODUPS; 621 622 if (t->bt_lorder != BYTE_ORDER) { 623 BLSWAP(m.m_magic); 624 BLSWAP(m.m_version); 625 BLSWAP(m.m_psize); 626 BLSWAP(m.m_free); 627 BLSWAP(m.m_flags); 628 } 629 630 if (write(t->bt_s.bt_d.d_fd, (char *) &m, sizeof(m)) 631 != sizeof(m)) { 632 return (RET_ERROR); 633 } 634 635 t->bt_flags |= BTF_METAOK; 636 637 return (RET_SUCCESS); 638 } 639