1 /* 2 * Copyright (c) 2007 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.9 2007/12/14 08:05:39 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_transaction_t trans,
			hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);

/*
 * Red-black tree support.  In-memory records are indexed in a per-inode
 * RB tree ordered by (rec_type, key, create_tid).
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
		return(-1);
	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
		return(1);

	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
		return(-1);
	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
		return(1);

	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
		return(-1);
	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
		return(1);
	return(0);
}

/*
 * Compare a lookup key (info) against an in-memory record.  Returns 0 on
 * a match.  The magnitude of a non-zero return encodes which field
 * mismatched (+/-3 rec_type, +/-2 key, +/-1 TID visibility) and the sign
 * encodes the direction, which the range-scan comparator below relies on.
 */
static int
hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
{
	/*
	 * A key1->rec_type of 0 matches any record type.
	 */
	if (info->rec_type) {
		if (info->rec_type < rec->rec.base.base.rec_type)
			return(-3);
		if (info->rec_type > rec->rec.base.base.rec_type)
			return(3);
	}

	/*
	 * There is no special case for key.  0 means 0.
	 */
	if (info->key < rec->rec.base.base.key)
		return(-2);
	if (info->key > rec->rec.base.base.key)
		return(2);

	/*
	 * This test has a number of special cases.  create_tid in key1 is
	 * the as-of transaction id, and delete_tid in key1 is NOT USED.
	 *
	 * A key1->create_tid of 0 matches any record regardless of when
	 * it was created or destroyed.  0xFFFFFFFFFFFFFFFFULL should be
	 * used to search for the most current state of the object.
	 *
	 * key2->create_tid is a HAMMER record and will never be
	 * 0.   key2->delete_tid is the deletion transaction id or 0 if
	 * the record has not yet been deleted.
	 */
	if (info->create_tid) {
		if (info->create_tid < rec->rec.base.base.create_tid)
			return(-1);
		if (rec->rec.base.base.delete_tid &&
		    info->create_tid >= rec->rec.base.base.delete_tid) {
			return(1);
		}
	}
	return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
 * These do not stop the scan (only the +/-2 and +/-3 key/rec_type
 * mismatches do); as-of visibility is filtered by the scan callback.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	r = hammer_rec_compare(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	if (r == 0)
		return(0);
	r = hammer_rec_compare(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);

/*
 * Allocate a zero'd in-memory record for the caller to finish filling in.
 * The record is associated with the supplied inode (record->ip).
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
	hammer_record_t record;

	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
	record->ip = ip;
	return (record);
}

/*
 * Release a memory record.  If the record is marked for deferred deletion,
 * destroy the record when the last reference goes away.
 */
void
hammer_rel_mem_record(struct hammer_record **recordp)
{
	hammer_record_t rec;

	if ((rec = *recordp) != NULL) {
		if (hammer_islastref(&rec->lock)) {
			hammer_unref(&rec->lock);
			/* last ref gone: honor any deferred deletion */
			if (rec->flags & HAMMER_RECF_DELETED)
				hammer_free_mem_record(rec);
		} else {
			hammer_unref(&rec->lock);
		}
		/* always NULL the caller's pointer, even if not destroyed */
		*recordp = NULL;
	}
}

/*
 * Free a record.  Clean the structure up even though we are throwing it
 * away as a sanity check.  The actual free operation is delayed while
 * the record is referenced (HAMMER_RECF_DELETED is set and the free is
 * performed by hammer_rel_mem_record() on last release).  However, the
 * record is removed from the RB tree immediately.
 */
void
hammer_free_mem_record(hammer_record_t record)
{
	if (record->flags & HAMMER_RECF_ONRBTREE) {
		RB_REMOVE(hammer_rec_rb_tree, &record->ip->rec_tree, record);
		record->flags &= ~HAMMER_RECF_ONRBTREE;
	}
	if (record->lock.refs) {
		/* still referenced: defer the actual free */
		record->flags |= HAMMER_RECF_DELETED;
		return;
	}
	if (record->flags & HAMMER_RECF_ALLOCDATA) {
		kfree(record->data, M_HAMMER);
		record->flags &= ~HAMMER_RECF_ALLOCDATA;
	}
	record->data = NULL;
	kfree(record, M_HAMMER);
}

/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion
 * (such records are removed from the RB tree immediately, see
 * hammer_free_mem_record(), so the RB lookup cannot return them).
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/*
	 * Release any previous in-memory record and unlink the cursor's
	 * scan info from a previously scanned inode before re-targeting.
	 */
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	cursor->iprec = hammer_rec_rb_tree_RB_LOOKUP_INFO(
				&ip->rec_tree, &cursor->key_beg);
	if (cursor->iprec == NULL) {
		error = ENOENT;
	} else {
		/* hold a reference for the cursor */
		hammer_ref(&cursor->iprec->lock);
		error = 0;
	}
	return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor.
 *
 * The RB_SCAN function we use is designed as a callback.  We terminate it
 * (return -1) as soon as we get a match.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * Skip if not visible due to our as-of TID
	 */
	if (cursor->key_beg.create_tid) {
		if (cursor->key_beg.create_tid < rec->rec.base.base.create_tid)
			return(0);
		if (rec->rec.base.base.delete_tid &&
		    cursor->key_beg.create_tid >=
		     rec->rec.base.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * Return the first matching record (referenced for the cursor)
	 * and stop the scan.
	 */
	if (cursor->iprec == NULL) {
		cursor->iprec = rec;
		hammer_ref(&rec->lock);
		return(-1);
	}
	return(0);
}

static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/*
	 * Release state left over from a prior lookup/scan, then attach
	 * the cursor's scan info to the new inode's record tree.
	 */
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
	}
	cursor->ip = ip;
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
	cursor->scan.node = NULL;
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
		return(0);
	}
	return(ENOENT);
}

/*
 * Release the scan-info linkage and any referenced in-memory record
 * held by the cursor.
 */
void
hammer_mem_done(hammer_cursor_t cursor)
{
	if (cursor->ip) {
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
		cursor->ip = NULL;
	}
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);
}

/************************************************************************
 *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/* namekey_iterator of 0 is reserved; skip it on wrap */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->rec.entry.base.base.obj_id = dip->obj_id;
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.create_tid = trans->tid;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	/*
	 * Short names are embedded directly in the record, longer names
	 * get a separately allocated data buffer.
	 */
	if (bytes <= sizeof(record->rec.entry.den_name)) {
		record->data = (void *)record->rec.entry.den_name;
		record->flags |= HAMMER_RECF_EMBEDDED_DATA;
	} else {
		record->data = kmalloc(bytes, M_HAMMER, M_WAITOK);
		record->flags |= HAMMER_RECF_ALLOCDATA;
	}
	bcopy(ncp->nc_name, record->data, bytes);
	record->rec.entry.base.data_len = bytes;
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip,
			    HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
	error = hammer_mem_add(trans, record);
	return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * NOTE: HAMMER_CURSOR_DELETE may not have been set.  XXX remove flag.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	int error;

	error = hammer_ip_delete_record(cursor, trans->tid);

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we don't destroy the inode's data
	 * here.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip,
				    HAMMER_INODE_RDIRTY | HAMMER_INODE_TID);
		/* sync now if the vnode is gone or inactive */
		if (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))
			hammer_sync_inode(ip, MNT_NOWAIT, 1);

	}
	return(error);
}

/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy code from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * NOTE: the B-Tree key for a data record is (base offset + data length),
 * i.e. the offset one past the last byte, hence key = offset + bytes.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	void *bdata;
	int error;

	error = hammer_init_cursor_ip(&cursor, ip);
	if (error)
		return(error);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.flags = HAMMER_CURSOR_INSERT;

	/*
	 * Issue a lookup to position the cursor and locate the cluster.
	 * An exact match would be a duplicate, which is an error here.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		kprintf("hammer_ip_sync_data: duplicate data at (%lld,%d)\n",
			offset, bytes);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space now that we know which cluster
	 * the B-Tree node ended up in.
	 */
	bdata = hammer_alloc_data(cursor.node->cluster, bytes, &error,
				  &cursor.data_buffer);
	if (bdata == NULL)
		goto done;
	rec = hammer_alloc_record(cursor.node->cluster, &error,
				  &cursor.record_buffer);
	if (rec == NULL)
		goto fail1;

	/*
	 * Fill everything in and insert our B-Tree node.
	 */
	rec->base.base = cursor.key_beg;
	rec->base.data_crc = crc32(data, bytes);
	rec->base.rec_id = 0;	/* XXX */
	rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer, bdata);
	rec->base.data_len = bytes;
	hammer_modify_buffer(cursor.record_buffer);

	bcopy(data, bdata, bytes);
	hammer_modify_buffer(cursor.data_buffer);

	elm.leaf.base = cursor.key_beg;
	elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec);
	elm.leaf.data_offset = rec->base.data_offset;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/* unwind allocations on insertion failure */
	hammer_free_record_ptr(cursor.record_buffer, rec);
fail1:
	hammer_free_data_ptr(cursor.data_buffer, bdata, bytes);
done:
	hammer_done_cursor(&cursor);
	return(error);
}

/*
 * Sync an in-memory record to the disk.  This is typically called via fsync
 * from a cached record source.  This code is responsible for actually
 * writing a record out to the disk.
484 */ 485 int 486 hammer_ip_sync_record(hammer_record_t record) 487 { 488 struct hammer_cursor cursor; 489 hammer_record_ondisk_t rec; 490 union hammer_btree_elm elm; 491 void *bdata; 492 int error; 493 494 error = hammer_init_cursor_ip(&cursor, record->ip); 495 if (error) 496 return(error); 497 cursor.key_beg = record->rec.base.base; 498 cursor.flags = HAMMER_CURSOR_INSERT; 499 500 /* 501 * Issue a lookup to position the cursor and locate the cluster 502 */ 503 error = hammer_btree_lookup(&cursor); 504 if (error == 0) { 505 kprintf("hammer_ip_sync_record: duplicate rec at (%016llx)\n", 506 record->rec.base.base.key); 507 error = EIO; 508 } 509 if (error != ENOENT) 510 goto done; 511 512 /* 513 * Allocate record and data space now that we know which cluster 514 * the B-Tree node ended up in. 515 */ 516 if (record->data == NULL || 517 (record->flags & HAMMER_RECF_EMBEDDED_DATA)) { 518 bdata = record->data; 519 } else { 520 bdata = hammer_alloc_data(cursor.node->cluster, 521 record->rec.base.data_len, &error, 522 &cursor.data_buffer); 523 if (bdata == NULL) 524 goto done; 525 } 526 rec = hammer_alloc_record(cursor.node->cluster, &error, 527 &cursor.record_buffer); 528 if (rec == NULL) 529 goto fail1; 530 531 /* 532 * Fill everything in and insert our B-Tree node. 
533 * 534 * XXX assign rec_id here 535 */ 536 *rec = record->rec; 537 if (bdata) { 538 rec->base.data_crc = crc32(record->data, 539 record->rec.base.data_len); 540 if (record->flags & HAMMER_RECF_EMBEDDED_DATA) { 541 /* 542 * Data embedded in record 543 */ 544 rec->base.data_offset = ((char *)bdata - 545 (char *)&record->rec); 546 KKASSERT(rec->base.data_offset >= 0 && 547 rec->base.data_offset + rec->base.data_len < 548 sizeof(*rec)); 549 rec->base.data_offset += hammer_bclu_offset(cursor.record_buffer, rec); 550 } else { 551 /* 552 * Data separate from record 553 */ 554 rec->base.data_offset = hammer_bclu_offset(cursor.data_buffer,bdata); 555 bcopy(record->data, bdata, rec->base.data_len); 556 hammer_modify_buffer(cursor.data_buffer); 557 } 558 } 559 rec->base.rec_id = 0; /* XXX */ 560 561 hammer_modify_buffer(cursor.record_buffer); 562 563 elm.leaf.base = cursor.key_beg; 564 elm.leaf.rec_offset = hammer_bclu_offset(cursor.record_buffer, rec); 565 elm.leaf.data_offset = rec->base.data_offset; 566 elm.leaf.data_len = rec->base.data_len; 567 elm.leaf.data_crc = rec->base.data_crc; 568 569 error = hammer_btree_insert(&cursor, &elm); 570 if (error == 0) 571 goto done; 572 573 hammer_free_record_ptr(cursor.record_buffer, rec); 574 fail1: 575 if (record->data && (record->flags & HAMMER_RECF_EMBEDDED_DATA) == 0) { 576 hammer_free_data_ptr(cursor.data_buffer, bdata, 577 rec->base.data_len); 578 } 579 done: 580 hammer_done_cursor(&cursor); 581 return(error); 582 } 583 584 585 /* 586 * Add the record to the inode's rec_tree. The low 32 bits of a directory 587 * entry's key is used to deal with hash collisions in the upper 32 bits. 588 * A unique 64 bit key is generated in-memory and may be regenerated a 589 * second time when the directory record is flushed to the on-disk B-Tree. 
 */
static
int
hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record)
{
	/*
	 * RB_INSERT returns non-NULL on a key collision.  For directory
	 * entries we regenerate the low 32 bits of the key from the
	 * namekey iterator and retry; anything else is a hard EEXIST.
	 */
	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
			hammer_free_mem_record(record);
			return (EEXIST);
		}
		if (++trans->hmp->namekey_iterator == 0)
			++trans->hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= trans->hmp->namekey_iterator;
	}
	record->flags |= HAMMER_RECF_ONRBTREE;
	return(0);
}

/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	error = hammer_mem_lookup(cursor, ip);
	if (error == 0) {
		cursor->record = &cursor->iprec->rec;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((ip->flags & HAMMER_INODE_ONDISK) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Start pessimistic:
	 * both sources marked consumed (ATE*) and at EOF; the searches
	 * below clear the flags for whichever sources produce a record.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec)
		hammer_rel_mem_record(&cursor->iprec);

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.
	 */
	if (ip->flags & HAMMER_INODE_ONDISK) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.
This call may be made multiple times after the cursor has been 718 * initially searched with hammer_ip_first(). 719 * 720 * 0 is returned on success, ENOENT if no further records match the 721 * requested range, or some other error code is returned. 722 */ 723 int 724 hammer_ip_next(hammer_cursor_t cursor) 725 { 726 hammer_btree_elm_t elm; 727 hammer_record_t rec; 728 int error; 729 int r; 730 731 /* 732 * Load the current on-disk and in-memory record. If we ate any 733 * records we have to get the next one. 734 * 735 * If we deleted the last on-disk record we had scanned ATEDISK will 736 * be clear and DELBTREE will be set, forcing a call to iterate. The 737 * fact that ATEDISK is clear causes iterate to re-test the 'current' 738 * element. If ATEDISK is set, iterate will skip the 'current' 739 * element. 740 * 741 * Get the next on-disk record 742 */ 743 if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) { 744 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 745 error = hammer_btree_iterate(cursor); 746 if (error == 0) 747 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 748 else 749 cursor->flags |= HAMMER_CURSOR_DISKEOF | 750 HAMMER_CURSOR_ATEDISK; 751 } 752 } 753 754 /* 755 * Get the next in-memory record. The record can be ripped out 756 * of the RB tree so we maintain a scan_info structure to track 757 * the next node. 758 * 759 * hammer_rec_scan_cmp: Is the record still in our general range, 760 * (non-inclusive of snapshot exclusions)? 761 * hammer_rec_scan_callback: Is the record in our snapshot? 
762 */ 763 if (cursor->flags & HAMMER_CURSOR_ATEMEM) { 764 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { 765 hammer_rel_mem_record(&cursor->iprec); 766 rec = cursor->scan.node; /* next node */ 767 while (rec) { 768 if (hammer_rec_scan_cmp(rec, cursor) != 0) 769 break; 770 if (hammer_rec_scan_callback(rec, cursor) != 0) 771 break; 772 rec = hammer_rec_rb_tree_RB_NEXT(rec); 773 } 774 if (cursor->iprec) { 775 cursor->flags &= ~HAMMER_CURSOR_ATEMEM; 776 hammer_ref(&cursor->iprec->lock); 777 cursor->scan.node = 778 hammer_rec_rb_tree_RB_NEXT(rec); 779 } else { 780 cursor->flags |= HAMMER_CURSOR_MEMEOF; 781 } 782 } 783 } 784 785 /* 786 * Extract either the disk or memory record depending on their 787 * relative position. 788 */ 789 error = 0; 790 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) { 791 case 0: 792 /* 793 * Both entries valid 794 */ 795 elm = &cursor->node->ondisk->elms[cursor->index]; 796 r = hammer_btree_cmp(&elm->base, 797 &cursor->iprec->rec.base.base); 798 if (r < 0) { 799 error = hammer_btree_extract(cursor, 800 HAMMER_CURSOR_GET_RECORD); 801 cursor->flags |= HAMMER_CURSOR_ATEDISK; 802 break; 803 } 804 /* fall through to the memory entry */ 805 case HAMMER_CURSOR_ATEDISK: 806 /* 807 * Only the memory entry is valid 808 */ 809 cursor->record = &cursor->iprec->rec; 810 cursor->flags |= HAMMER_CURSOR_ATEMEM; 811 break; 812 case HAMMER_CURSOR_ATEMEM: 813 /* 814 * Only the disk entry is valid 815 */ 816 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD); 817 cursor->flags |= HAMMER_CURSOR_ATEDISK; 818 break; 819 default: 820 /* 821 * Neither entry is valid 822 * 823 * XXX error not set properly 824 */ 825 cursor->record = NULL; 826 error = ENOENT; 827 break; 828 } 829 return(error); 830 } 831 832 /* 833 * Resolve the cursor->data pointer for the current cursor position in 834 * a merged iteration. 
835 */ 836 int 837 hammer_ip_resolve_data(hammer_cursor_t cursor) 838 { 839 int error; 840 841 if (cursor->iprec && cursor->record == &cursor->iprec->rec) { 842 cursor->data = cursor->iprec->data; 843 error = 0; 844 } else { 845 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA); 846 } 847 return(error); 848 } 849 850 /* 851 * Delete all records within the specified range for inode ip. 852 * 853 * NOTE: An unaligned range will cause new records to be added to cover 854 * the edge cases. (XXX not implemented yet). 855 * 856 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024). 857 * 858 * NOTE: Record keys for regular file data have to be special-cased since 859 * they indicate the end of the range (key = base + bytes). 860 */ 861 int 862 hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip, 863 int64_t ran_beg, int64_t ran_end) 864 { 865 struct hammer_cursor cursor; 866 hammer_record_ondisk_t rec; 867 hammer_base_elm_t base; 868 int error; 869 int isregfile; 870 int64_t off; 871 872 hammer_init_cursor_ip(&cursor, ip); 873 874 cursor.key_beg.obj_id = ip->obj_id; 875 cursor.key_beg.create_tid = ip->obj_asof; 876 cursor.key_beg.delete_tid = 0; 877 cursor.key_beg.obj_type = 0; 878 879 cursor.key_end = cursor.key_beg; 880 if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) { 881 cursor.key_beg.key = ran_beg; 882 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB; 883 cursor.key_end.rec_type = HAMMER_RECTYPE_DB; 884 cursor.key_end.key = ran_end; 885 isregfile = 0; 886 } else { 887 /* 888 * The key in the B-Tree is (base+bytes), so the first possible 889 * matching key is ran_beg + 1. 
890 */ 891 int64_t tmp64; 892 893 cursor.key_beg.key = ran_beg + 1; 894 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA; 895 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA; 896 897 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */ 898 if (tmp64 < ran_end) 899 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL; 900 else 901 cursor.key_end.key = ran_end + MAXPHYS + 1; 902 isregfile = 1; 903 } 904 905 error = hammer_ip_first(&cursor, ip); 906 907 /* 908 * Iterate through matching records and mark them as deleted. 909 */ 910 while (error == 0) { 911 rec = cursor.record; 912 base = &rec->base.base; 913 914 KKASSERT(base->delete_tid == 0); 915 916 /* 917 * There may be overlap cases for regular file data. Also 918 * remember the key for a regular file record is the offset 919 * of the last byte of the record (base + len - 1), NOT the 920 * base offset. 921 */ 922 #if 0 923 kprintf("delete_range rec_type %02x\n", base->rec_type); 924 #endif 925 if (base->rec_type == HAMMER_RECTYPE_DATA) { 926 #if 0 927 kprintf("delete_range loop key %016llx\n", 928 base->key - rec->base.data_len); 929 #endif 930 off = base->key - rec->base.data_len; 931 /* 932 * Check the left edge case. We currently do not 933 * split existing records. 934 */ 935 if (off < ran_beg) { 936 panic("hammer left edge case %016llx %d\n", 937 base->key, rec->base.data_len); 938 } 939 940 /* 941 * Check the right edge case. Note that the 942 * record can be completely out of bounds, which 943 * terminates the search. 944 * 945 * base->key is exclusive of the right edge while 946 * ran_end is inclusive of the right edge. The 947 * (key - data_len) left boundary is inclusive. 948 * 949 * XXX theory-check this test at some point, are 950 * we missing a + 1 somewhere? Note that ran_end 951 * could overflow. 
952 */ 953 if (base->key > ran_end) { 954 if (base->key - rec->base.data_len > ran_end) { 955 kprintf("right edge OOB\n"); 956 break; 957 } 958 panic("hammer right edge case\n"); 959 } 960 } 961 962 /* 963 * Mark the record and B-Tree entry as deleted. This will 964 * also physically delete the B-Tree entry, record, and 965 * data if the retention policy dictates. The function 966 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next() 967 * uses to perform a fixup. 968 */ 969 error = hammer_ip_delete_record(&cursor, trans->tid); 970 if (error) 971 break; 972 error = hammer_ip_next(&cursor); 973 } 974 hammer_done_cursor(&cursor); 975 if (error == ENOENT) 976 error = 0; 977 return(error); 978 } 979 980 /* 981 * Delete the record at the current cursor 982 */ 983 int 984 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid) 985 { 986 hammer_btree_elm_t elm; 987 hammer_mount_t hmp; 988 int error; 989 990 /* 991 * In-memory (unsynchronized) records can simply be freed. 992 */ 993 cursor->flags &= ~HAMMER_CURSOR_DELBTREE; 994 if (cursor->record == &cursor->iprec->rec) { 995 hammer_free_mem_record(cursor->iprec); 996 return(0); 997 } 998 999 /* 1000 * On-disk records are marked as deleted by updating their delete_tid. 1001 */ 1002 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD); 1003 elm = NULL; 1004 hmp = cursor->node->cluster->volume->hmp; 1005 1006 if (error == 0) { 1007 elm = &cursor->node->ondisk->elms[cursor->index]; 1008 cursor->record->base.base.delete_tid = tid; 1009 elm->leaf.base.delete_tid = tid; 1010 hammer_modify_buffer(cursor->record_buffer); 1011 hammer_modify_node(cursor->node); 1012 } 1013 1014 /* 1015 * If we were mounted with the nohistory option, we physically 1016 * delete the record. 
1017 */ 1018 if (error == 0 && (hmp->hflags & HMNT_NOHISTORY)) { 1019 int32_t rec_offset; 1020 int32_t data_offset; 1021 int32_t data_len; 1022 hammer_cluster_t cluster; 1023 1024 rec_offset = elm->leaf.rec_offset; 1025 data_offset = elm->leaf.data_offset; 1026 data_len = elm->leaf.data_len; 1027 #if 0 1028 kprintf("hammer_ip_delete_record: %08x %08x/%d\n", 1029 rec_offset, data_offset, data_len); 1030 #endif 1031 cluster = cursor->node->cluster; 1032 hammer_ref_cluster(cluster); 1033 1034 error = hammer_btree_delete(cursor); 1035 if (error == 0) { 1036 /* 1037 * This forces a fixup for the iteration because 1038 * the cursor is now either sitting at the 'next' 1039 * element or sitting at the end of a leaf. 1040 */ 1041 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 1042 cursor->flags |= HAMMER_CURSOR_DELBTREE; 1043 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1044 } 1045 hammer_free_record(cluster, rec_offset); 1046 if (data_offset - rec_offset < 0 || 1047 data_offset - rec_offset >= HAMMER_RECORD_SIZE) { 1048 hammer_free_data(cluster, data_offset,data_len); 1049 } 1050 } 1051 hammer_rel_cluster(cluster, 0); 1052 if (error) { 1053 kprintf("hammer_ip_delete_record: unable to physically delete the record!\n"); 1054 error = 0; 1055 } 1056 } 1057 return(error); 1058 } 1059 1060