1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $ 35 */ 36 37 #include "hammer.h" 38 39 static int hammer_mem_lookup(hammer_cursor_t cursor); 40 static void hammer_mem_first(hammer_cursor_t cursor); 41 static int hammer_frontend_trunc_callback(hammer_record_t record, 42 void *data __unused); 43 static int hammer_bulk_scan_callback(hammer_record_t record, void *data); 44 static int hammer_record_needs_overwrite_delete(hammer_record_t record); 45 static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 46 hammer_btree_leaf_elm_t leaf); 47 static int hammer_cursor_localize_data(hammer_data_ondisk_t data, 48 hammer_btree_leaf_elm_t leaf); 49 50 struct rec_trunc_info { 51 u_int16_t rec_type; 52 int64_t trunc_off; 53 }; 54 55 struct hammer_bulk_info { 56 hammer_record_t record; 57 hammer_record_t conflict; 58 }; 59 60 /* 61 * Red-black tree support. Comparison code for insertion. 62 */ 63 static int 64 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) 65 { 66 if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type) 67 return(-1); 68 if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type) 69 return(1); 70 71 if (rec1->leaf.base.key < rec2->leaf.base.key) 72 return(-1); 73 if (rec1->leaf.base.key > rec2->leaf.base.key) 74 return(1); 75 76 /* 77 * For search & insertion purposes records deleted by the 78 * frontend or deleted/committed by the backend are silently 79 * ignored. Otherwise pipelined insertions will get messed 80 * up. 81 * 82 * rec1 is greater then rec2 if rec1 is marked deleted. 83 * rec1 is less then rec2 if rec2 is marked deleted. 84 * 85 * Multiple deleted records may be present, do not return 0 86 * if both are marked deleted. 
87 */ 88 if (rec1->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 89 HAMMER_RECF_COMMITTED)) { 90 return(1); 91 } 92 if (rec2->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 93 HAMMER_RECF_COMMITTED)) { 94 return(-1); 95 } 96 97 return(0); 98 } 99 100 /* 101 * Basic record comparison code similar to hammer_btree_cmp(). 102 * 103 * obj_id is not compared and may not yet be assigned in the record. 104 */ 105 static int 106 hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) 107 { 108 if (elm->rec_type < rec->leaf.base.rec_type) 109 return(-3); 110 if (elm->rec_type > rec->leaf.base.rec_type) 111 return(3); 112 113 if (elm->key < rec->leaf.base.key) 114 return(-2); 115 if (elm->key > rec->leaf.base.key) 116 return(2); 117 118 /* 119 * Never match against an item deleted by the frontend 120 * or backend, or committed by the backend. 121 * 122 * elm is less then rec if rec is marked deleted. 123 */ 124 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 125 HAMMER_RECF_COMMITTED)) { 126 return(-1); 127 } 128 return(0); 129 } 130 131 /* 132 * Ranged scan to locate overlapping record(s). This is used by 133 * hammer_ip_get_bulk() to locate an overlapping record. We have 134 * to use a ranged scan because the keys for data records with the 135 * same file base offset can be different due to differing data_len's. 136 * 137 * NOTE: The base file offset of a data record is (key - data_len), not (key). 
138 */ 139 static int 140 hammer_rec_overlap_cmp(hammer_record_t rec, void *data) 141 { 142 struct hammer_bulk_info *info = data; 143 hammer_btree_leaf_elm_t leaf = &info->record->leaf; 144 145 if (rec->leaf.base.rec_type < leaf->base.rec_type) 146 return(-3); 147 if (rec->leaf.base.rec_type > leaf->base.rec_type) 148 return(3); 149 150 /* 151 * Overlap compare 152 */ 153 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 154 /* rec_beg >= leaf_end */ 155 if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key) 156 return(2); 157 /* rec_end <= leaf_beg */ 158 if (rec->leaf.base.key <= leaf->base.key - leaf->data_len) 159 return(-2); 160 } else { 161 if (rec->leaf.base.key < leaf->base.key) 162 return(-2); 163 if (rec->leaf.base.key > leaf->base.key) 164 return(2); 165 } 166 167 /* 168 * We have to return 0 at this point, even if DELETED_FE is set, 169 * because returning anything else will cause the scan to ignore 170 * one of the branches when we really want it to check both. 171 */ 172 return(0); 173 } 174 175 /* 176 * RB_SCAN comparison code for hammer_mem_first(). The argument order 177 * is reversed so the comparison result has to be negated. key_beg and 178 * key_end are both range-inclusive. 179 * 180 * Localized deletions are not cached in-memory. 181 */ 182 static 183 int 184 hammer_rec_scan_cmp(hammer_record_t rec, void *data) 185 { 186 hammer_cursor_t cursor = data; 187 int r; 188 189 r = hammer_rec_cmp(&cursor->key_beg, rec); 190 if (r > 1) 191 return(-1); 192 r = hammer_rec_cmp(&cursor->key_end, rec); 193 if (r < -1) 194 return(1); 195 return(0); 196 } 197 198 /* 199 * This compare function is used when simply looking up key_beg. 
200 */ 201 static 202 int 203 hammer_rec_find_cmp(hammer_record_t rec, void *data) 204 { 205 hammer_cursor_t cursor = data; 206 int r; 207 208 r = hammer_rec_cmp(&cursor->key_beg, rec); 209 if (r > 1) 210 return(-1); 211 if (r < -1) 212 return(1); 213 return(0); 214 } 215 216 /* 217 * Locate blocks within the truncation range. Partial blocks do not count. 218 */ 219 static 220 int 221 hammer_rec_trunc_cmp(hammer_record_t rec, void *data) 222 { 223 struct rec_trunc_info *info = data; 224 225 if (rec->leaf.base.rec_type < info->rec_type) 226 return(-1); 227 if (rec->leaf.base.rec_type > info->rec_type) 228 return(1); 229 230 switch(rec->leaf.base.rec_type) { 231 case HAMMER_RECTYPE_DB: 232 /* 233 * DB record key is not beyond the truncation point, retain. 234 */ 235 if (rec->leaf.base.key < info->trunc_off) 236 return(-1); 237 break; 238 case HAMMER_RECTYPE_DATA: 239 /* 240 * DATA record offset start is not beyond the truncation point, 241 * retain. 242 */ 243 if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off) 244 return(-1); 245 break; 246 default: 247 panic("hammer_rec_trunc_cmp: unexpected record type"); 248 } 249 250 /* 251 * The record start is >= the truncation point, return match, 252 * the record should be destroyed. 253 */ 254 return(0); 255 } 256 257 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); 258 259 /* 260 * Allocate a record for the caller to finish filling in. The record is 261 * returned referenced. 
262 */ 263 hammer_record_t 264 hammer_alloc_mem_record(hammer_inode_t ip, int data_len) 265 { 266 hammer_record_t record; 267 hammer_mount_t hmp; 268 269 hmp = ip->hmp; 270 ++hammer_count_records; 271 record = kmalloc(sizeof(*record), hmp->m_misc, 272 M_WAITOK | M_ZERO | M_USE_RESERVE); 273 record->flush_state = HAMMER_FST_IDLE; 274 record->ip = ip; 275 record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; 276 record->leaf.data_len = data_len; 277 hammer_ref(&record->lock); 278 279 if (data_len) { 280 record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO); 281 record->flags |= HAMMER_RECF_ALLOCDATA; 282 ++hammer_count_record_datas; 283 } 284 285 return (record); 286 } 287 288 void 289 hammer_wait_mem_record_ident(hammer_record_t record, const char *ident) 290 { 291 while (record->flush_state == HAMMER_FST_FLUSH) { 292 record->flags |= HAMMER_RECF_WANTED; 293 tsleep(record, 0, ident, 0); 294 } 295 } 296 297 /* 298 * Called from the backend, hammer_inode.c, after a record has been 299 * flushed to disk. The record has been exclusively locked by the 300 * caller and interlocked with BE. 301 * 302 * We clean up the state, unlock, and release the record (the record 303 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state). 304 */ 305 void 306 hammer_flush_record_done(hammer_record_t record, int error) 307 { 308 hammer_inode_t target_ip; 309 310 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 311 KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); 312 313 /* 314 * If an error occured, the backend was unable to sync the 315 * record to its media. Leave the record intact. 316 */ 317 if (error) { 318 hammer_critical_error(record->ip->hmp, record->ip, error, 319 "while flushing record"); 320 } 321 322 --record->flush_group->refs; 323 record->flush_group = NULL; 324 325 /* 326 * Adjust the flush state and dependancy based on success or 327 * failure. 
328 */ 329 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { 330 if ((target_ip = record->target_ip) != NULL) { 331 TAILQ_REMOVE(&target_ip->target_list, record, 332 target_entry); 333 record->target_ip = NULL; 334 hammer_test_inode(target_ip); 335 } 336 record->flush_state = HAMMER_FST_IDLE; 337 } else { 338 if (record->target_ip) { 339 record->flush_state = HAMMER_FST_SETUP; 340 hammer_test_inode(record->ip); 341 hammer_test_inode(record->target_ip); 342 } else { 343 record->flush_state = HAMMER_FST_IDLE; 344 } 345 } 346 record->flags &= ~HAMMER_RECF_INTERLOCK_BE; 347 348 /* 349 * Cleanup 350 */ 351 if (record->flags & HAMMER_RECF_WANTED) { 352 record->flags &= ~HAMMER_RECF_WANTED; 353 wakeup(record); 354 } 355 hammer_rel_mem_record(record); 356 } 357 358 /* 359 * Release a memory record. Records marked for deletion are immediately 360 * removed from the RB-Tree but otherwise left intact until the last ref 361 * goes away. 362 */ 363 void 364 hammer_rel_mem_record(struct hammer_record *record) 365 { 366 hammer_mount_t hmp; 367 hammer_reserve_t resv; 368 hammer_inode_t ip; 369 hammer_inode_t target_ip; 370 int diddrop; 371 372 hammer_rel(&record->lock); 373 374 if (hammer_norefs(&record->lock)) { 375 /* 376 * Upon release of the last reference wakeup any waiters. 377 * The record structure may get destroyed so callers will 378 * loop up and do a relookup. 379 * 380 * WARNING! Record must be removed from RB-TREE before we 381 * might possibly block. hammer_test_inode() can block! 382 */ 383 ip = record->ip; 384 hmp = ip->hmp; 385 386 /* 387 * Upon release of the last reference a record marked deleted 388 * by the front or backend, or committed by the backend, 389 * is destroyed. 
390 */ 391 if (record->flags & (HAMMER_RECF_DELETED_FE | 392 HAMMER_RECF_DELETED_BE | 393 HAMMER_RECF_COMMITTED)) { 394 KKASSERT(hammer_isactive(&ip->lock) > 0); 395 KKASSERT(record->flush_state != HAMMER_FST_FLUSH); 396 397 /* 398 * target_ip may have zero refs, we have to ref it 399 * to prevent it from being ripped out from under 400 * us. 401 */ 402 if ((target_ip = record->target_ip) != NULL) { 403 TAILQ_REMOVE(&target_ip->target_list, 404 record, target_entry); 405 record->target_ip = NULL; 406 hammer_ref(&target_ip->lock); 407 } 408 409 /* 410 * Remove the record from the B-Tree 411 */ 412 if (record->flags & HAMMER_RECF_ONRBTREE) { 413 RB_REMOVE(hammer_rec_rb_tree, 414 &record->ip->rec_tree, 415 record); 416 record->flags &= ~HAMMER_RECF_ONRBTREE; 417 KKASSERT(ip->rsv_recs > 0); 418 diddrop = 1; 419 } else { 420 diddrop = 0; 421 } 422 423 /* 424 * We must wait for any direct-IO to complete before 425 * we can destroy the record because the bio may 426 * have a reference to it. 427 */ 428 if (record->flags & 429 (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) { 430 hammer_io_direct_wait(record); 431 } 432 433 /* 434 * Account for the completion after the direct IO 435 * has completed. 436 */ 437 if (diddrop) { 438 --hmp->rsv_recs; 439 --ip->rsv_recs; 440 hmp->rsv_databytes -= record->leaf.data_len; 441 442 if (RB_EMPTY(&record->ip->rec_tree)) { 443 record->ip->flags &= ~HAMMER_INODE_XDIRTY; 444 record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY; 445 hammer_test_inode(record->ip); 446 } 447 if (ip->rsv_recs == hammer_limit_inode_recs - 1) 448 wakeup(&ip->rsv_recs); 449 } 450 451 /* 452 * Do this test after removing record from the B-Tree. 453 */ 454 if (target_ip) { 455 hammer_test_inode(target_ip); 456 hammer_rel_inode(target_ip, 0); 457 } 458 459 if (record->flags & HAMMER_RECF_ALLOCDATA) { 460 --hammer_count_record_datas; 461 kfree(record->data, hmp->m_misc); 462 record->flags &= ~HAMMER_RECF_ALLOCDATA; 463 } 464 465 /* 466 * Release the reservation. 
467 * 468 * If the record was not committed we can theoretically 469 * undo the reservation. However, doing so might 470 * create weird edge cases with the ordering of 471 * direct writes because the related buffer cache 472 * elements are per-vnode. So we don't try. 473 */ 474 if ((resv = record->resv) != NULL) { 475 /* XXX undo leaf.data_offset,leaf.data_len */ 476 hammer_blockmap_reserve_complete(hmp, resv); 477 record->resv = NULL; 478 } 479 record->data = NULL; 480 --hammer_count_records; 481 kfree(record, hmp->m_misc); 482 } 483 } 484 } 485 486 /* 487 * Record visibility depends on whether the record is being accessed by 488 * the backend or the frontend. Backend tests ignore the frontend delete 489 * flag. Frontend tests do NOT ignore the backend delete/commit flags and 490 * must also check for commit races. 491 * 492 * Return non-zero if the record is visible, zero if it isn't or if it is 493 * deleted. Returns 0 if the record has been comitted (unless the special 494 * delete-visibility flag is set). A committed record must be located 495 * via the media B-Tree. Returns non-zero if the record is good. 496 * 497 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory 498 * records to be returned. This is so pending deletions are detected 499 * when using an iterator to locate an unused hash key, or when we need 500 * to locate historical records on-disk to destroy. 
501 */ 502 static __inline 503 int 504 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record) 505 { 506 if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) 507 return(1); 508 if (cursor->flags & HAMMER_CURSOR_BACKEND) { 509 if (record->flags & (HAMMER_RECF_DELETED_BE | 510 HAMMER_RECF_COMMITTED)) { 511 return(0); 512 } 513 } else { 514 if (record->flags & (HAMMER_RECF_DELETED_FE | 515 HAMMER_RECF_DELETED_BE | 516 HAMMER_RECF_COMMITTED)) { 517 return(0); 518 } 519 } 520 return(1); 521 } 522 523 /* 524 * This callback is used as part of the RB_SCAN function for in-memory 525 * records. We terminate it (return -1) as soon as we get a match. 526 * 527 * This routine is used by frontend code. 528 * 529 * The primary compare code does not account for ASOF lookups. This 530 * code handles that case as well as a few others. 531 */ 532 static 533 int 534 hammer_rec_scan_callback(hammer_record_t rec, void *data) 535 { 536 hammer_cursor_t cursor = data; 537 538 /* 539 * We terminate on success, so this should be NULL on entry. 540 */ 541 KKASSERT(cursor->iprec == NULL); 542 543 /* 544 * Skip if the record was marked deleted or committed. 545 */ 546 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) 547 return(0); 548 549 /* 550 * Skip if not visible due to our as-of TID 551 */ 552 if (cursor->flags & HAMMER_CURSOR_ASOF) { 553 if (cursor->asof < rec->leaf.base.create_tid) 554 return(0); 555 if (rec->leaf.base.delete_tid && 556 cursor->asof >= rec->leaf.base.delete_tid) { 557 return(0); 558 } 559 } 560 561 /* 562 * ref the record. The record is protected from backend B-Tree 563 * interactions by virtue of the cursor's IP lock. 564 */ 565 hammer_ref(&rec->lock); 566 567 /* 568 * The record may have been deleted or committed while we 569 * were blocked. XXX remove? 570 */ 571 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) { 572 hammer_rel_mem_record(rec); 573 return(0); 574 } 575 576 /* 577 * Set the matching record and stop the scan. 
578 */ 579 cursor->iprec = rec; 580 return(-1); 581 } 582 583 584 /* 585 * Lookup an in-memory record given the key specified in the cursor. Works 586 * just like hammer_btree_lookup() but operates on an inode's in-memory 587 * record list. 588 * 589 * The lookup must fail if the record is marked for deferred deletion. 590 * 591 * The API for mem/btree_lookup() does not mess with the ATE/EOF bits. 592 */ 593 static 594 int 595 hammer_mem_lookup(hammer_cursor_t cursor) 596 { 597 KKASSERT(cursor->ip); 598 if (cursor->iprec) { 599 hammer_rel_mem_record(cursor->iprec); 600 cursor->iprec = NULL; 601 } 602 hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp, 603 hammer_rec_scan_callback, cursor); 604 605 return (cursor->iprec ? 0 : ENOENT); 606 } 607 608 /* 609 * hammer_mem_first() - locate the first in-memory record matching the 610 * cursor within the bounds of the key range. 611 * 612 * WARNING! API is slightly different from btree_first(). hammer_mem_first() 613 * will set ATEMEM the same as MEMEOF, and does not return any error. 614 */ 615 static 616 void 617 hammer_mem_first(hammer_cursor_t cursor) 618 { 619 hammer_inode_t ip; 620 621 ip = cursor->ip; 622 KKASSERT(ip != NULL); 623 624 if (cursor->iprec) { 625 hammer_rel_mem_record(cursor->iprec); 626 cursor->iprec = NULL; 627 } 628 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp, 629 hammer_rec_scan_callback, cursor); 630 631 if (cursor->iprec) 632 cursor->flags &= ~(HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM); 633 else 634 cursor->flags |= HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM; 635 } 636 637 /************************************************************************ 638 * HAMMER IN-MEMORY RECORD FUNCTIONS * 639 ************************************************************************ 640 * 641 * These functions manipulate in-memory records. Such records typically 642 * exist prior to being committed to disk or indexed via the on-disk B-Tree. 
643 */ 644 645 /* 646 * Add a directory entry (dip,ncp) which references inode (ip). 647 * 648 * Note that the low 32 bits of the namekey are set temporarily to create 649 * a unique in-memory record, and may be modified a second time when the 650 * record is synchronized to disk. In particular, the low 32 bits cannot be 651 * all 0's when synching to disk, which is not handled here. 652 * 653 * NOTE: bytes does not include any terminating \0 on name, and name might 654 * not be terminated. 655 */ 656 int 657 hammer_ip_add_directory(struct hammer_transaction *trans, 658 struct hammer_inode *dip, const char *name, int bytes, 659 struct hammer_inode *ip) 660 { 661 struct hammer_cursor cursor; 662 hammer_record_t record; 663 int error; 664 u_int32_t max_iterations; 665 666 record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes)); 667 668 record->type = HAMMER_MEM_RECORD_ADD; 669 record->leaf.base.localization = dip->obj_localization + 670 hammer_dir_localization(dip); 671 record->leaf.base.obj_id = dip->obj_id; 672 record->leaf.base.key = hammer_directory_namekey(dip, name, bytes, 673 &max_iterations); 674 record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY; 675 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 676 record->data->entry.obj_id = ip->obj_id; 677 record->data->entry.localization = ip->obj_localization; 678 bcopy(name, record->data->entry.name, bytes); 679 680 ++ip->ino_data.nlinks; 681 ip->ino_data.ctime = trans->time; 682 hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); 683 684 /* 685 * Find an unused namekey. Both the in-memory record tree and 686 * the B-Tree are checked. We do not want historically deleted 687 * names to create a collision as our iteration space may be limited, 688 * and since create_tid wouldn't match anyway an ASOF search 689 * must be used to locate collisions. 690 * 691 * delete-visibility is set so pending deletions do not give us 692 * a false-negative on our ability to use an iterator. 
693 * 694 * The iterator must not rollover the key. Directory keys only 695 * use the positive key space. 696 */ 697 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 698 cursor.key_beg = record->leaf.base; 699 cursor.flags |= HAMMER_CURSOR_ASOF; 700 cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 701 cursor.asof = ip->obj_asof; 702 703 while (hammer_ip_lookup(&cursor) == 0) { 704 ++record->leaf.base.key; 705 KKASSERT(record->leaf.base.key > 0); 706 cursor.key_beg.key = record->leaf.base.key; 707 if (--max_iterations == 0) { 708 hammer_rel_mem_record(record); 709 error = ENOSPC; 710 goto failed; 711 } 712 } 713 714 /* 715 * The target inode and the directory entry are bound together. 716 */ 717 record->target_ip = ip; 718 record->flush_state = HAMMER_FST_SETUP; 719 TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry); 720 721 /* 722 * The inode now has a dependancy and must be taken out of the idle 723 * state. An inode not in an idle state is given an extra reference. 724 * 725 * When transitioning to a SETUP state flag for an automatic reflush 726 * when the dependancies are disposed of if someone is waiting on 727 * the inode. 728 */ 729 if (ip->flush_state == HAMMER_FST_IDLE) { 730 hammer_ref(&ip->lock); 731 ip->flush_state = HAMMER_FST_SETUP; 732 if (ip->flags & HAMMER_INODE_FLUSHW) 733 ip->flags |= HAMMER_INODE_REFLUSH; 734 } 735 error = hammer_mem_add(record); 736 if (error == 0) { 737 dip->ino_data.mtime = trans->time; 738 hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); 739 } 740 failed: 741 hammer_done_cursor(&cursor); 742 return(error); 743 } 744 745 /* 746 * Delete the directory entry and update the inode link count. The 747 * cursor must be seeked to the directory entry record being deleted. 748 * 749 * The related inode should be share-locked by the caller. The caller is 750 * on the frontend. It could also be NULL indicating that the directory 751 * entry being removed has no related inode. 
752 * 753 * This function can return EDEADLK requiring the caller to terminate 754 * the cursor, any locks, wait on the returned record, and retry. 755 */ 756 int 757 hammer_ip_del_directory(struct hammer_transaction *trans, 758 hammer_cursor_t cursor, struct hammer_inode *dip, 759 struct hammer_inode *ip) 760 { 761 hammer_record_t record; 762 int error; 763 764 if (hammer_cursor_inmem(cursor)) { 765 /* 766 * In-memory (unsynchronized) records can simply be freed. 767 * 768 * Even though the HAMMER_RECF_DELETED_FE flag is ignored 769 * by the backend, we must still avoid races against the 770 * backend potentially syncing the record to the media. 771 * 772 * We cannot call hammer_ip_delete_record(), that routine may 773 * only be called from the backend. 774 */ 775 record = cursor->iprec; 776 if (record->flags & (HAMMER_RECF_INTERLOCK_BE | 777 HAMMER_RECF_DELETED_BE | 778 HAMMER_RECF_COMMITTED)) { 779 KKASSERT(cursor->deadlk_rec == NULL); 780 hammer_ref(&record->lock); 781 cursor->deadlk_rec = record; 782 error = EDEADLK; 783 } else { 784 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); 785 record->flags |= HAMMER_RECF_DELETED_FE; 786 error = 0; 787 } 788 } else { 789 /* 790 * If the record is on-disk we have to queue the deletion by 791 * the record's key. This also causes lookups to skip the 792 * record (lookups for the purposes of finding an unused 793 * directory key do not skip the record). 794 */ 795 KKASSERT(dip->flags & 796 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK)); 797 record = hammer_alloc_mem_record(dip, 0); 798 record->type = HAMMER_MEM_RECORD_DEL; 799 record->leaf.base = cursor->leaf->base; 800 KKASSERT(dip->obj_id == record->leaf.base.obj_id); 801 802 /* 803 * ip may be NULL, indicating the deletion of a directory 804 * entry which has no related inode. 
805 */ 806 record->target_ip = ip; 807 if (ip) { 808 record->flush_state = HAMMER_FST_SETUP; 809 TAILQ_INSERT_TAIL(&ip->target_list, record, 810 target_entry); 811 } else { 812 record->flush_state = HAMMER_FST_IDLE; 813 } 814 815 /* 816 * The inode now has a dependancy and must be taken out of 817 * the idle state. An inode not in an idle state is given 818 * an extra reference. 819 * 820 * When transitioning to a SETUP state flag for an automatic 821 * reflush when the dependancies are disposed of if someone 822 * is waiting on the inode. 823 */ 824 if (ip && ip->flush_state == HAMMER_FST_IDLE) { 825 hammer_ref(&ip->lock); 826 ip->flush_state = HAMMER_FST_SETUP; 827 if (ip->flags & HAMMER_INODE_FLUSHW) 828 ip->flags |= HAMMER_INODE_REFLUSH; 829 } 830 831 error = hammer_mem_add(record); 832 } 833 834 /* 835 * One less link. The file may still be open in the OS even after 836 * all links have gone away. 837 * 838 * We have to terminate the cursor before syncing the inode to 839 * avoid deadlocking against ourselves. XXX this may no longer 840 * be true. 841 * 842 * If nlinks drops to zero and the vnode is inactive (or there is 843 * no vnode), call hammer_inode_unloadable_check() to zonk the 844 * inode. If we don't do this here the inode will not be destroyed 845 * on-media until we unmount. 846 */ 847 if (error == 0) { 848 if (ip) { 849 --ip->ino_data.nlinks; /* do before we might block */ 850 ip->ino_data.ctime = trans->time; 851 } 852 dip->ino_data.mtime = trans->time; 853 hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); 854 if (ip) { 855 hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); 856 if (ip->ino_data.nlinks == 0 && 857 (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { 858 hammer_done_cursor(cursor); 859 hammer_inode_unloadable_check(ip, 1); 860 hammer_flush_inode(ip, 0); 861 } 862 } 863 864 } 865 return(error); 866 } 867 868 /* 869 * Add a record to an inode. 
870 * 871 * The caller must allocate the record with hammer_alloc_mem_record(ip) and 872 * initialize the following additional fields: 873 * 874 * The related inode should be share-locked by the caller. The caller is 875 * on the frontend. 876 * 877 * record->rec.entry.base.base.key 878 * record->rec.entry.base.base.rec_type 879 * record->rec.entry.base.base.data_len 880 * record->data (a copy will be kmalloc'd if it cannot be embedded) 881 */ 882 int 883 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) 884 { 885 hammer_inode_t ip = record->ip; 886 int error; 887 888 KKASSERT(record->leaf.base.localization != 0); 889 record->leaf.base.obj_id = ip->obj_id; 890 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 891 error = hammer_mem_add(record); 892 return(error); 893 } 894 895 /* 896 * Locate a pre-existing bulk record in memory. The caller wishes to 897 * replace the record with a new one. The existing record may have a 898 * different length (and thus a different key) so we have to use an 899 * overlap check function. 900 */ 901 static hammer_record_t 902 hammer_ip_get_bulk(hammer_record_t record) 903 { 904 struct hammer_bulk_info info; 905 hammer_inode_t ip = record->ip; 906 907 info.record = record; 908 info.conflict = NULL; 909 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp, 910 hammer_bulk_scan_callback, &info); 911 912 return(info.conflict); /* may be NULL */ 913 } 914 915 /* 916 * Take records vetted by overlap_cmp. The first non-deleted record 917 * (if any) stops the scan. 
918 */ 919 static int 920 hammer_bulk_scan_callback(hammer_record_t record, void *data) 921 { 922 struct hammer_bulk_info *info = data; 923 924 if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 925 HAMMER_RECF_COMMITTED)) { 926 return(0); 927 } 928 hammer_ref(&record->lock); 929 info->conflict = record; 930 return(-1); /* stop scan */ 931 } 932 933 /* 934 * Reserve blockmap space placemarked with an in-memory record. 935 * 936 * This routine is called by the frontend in order to be able to directly 937 * flush a buffer cache buffer. The frontend has locked the related buffer 938 * cache buffers and we should be able to manipulate any overlapping 939 * in-memory records. 940 * 941 * The caller is responsible for adding the returned record and deleting 942 * the returned conflicting record (if any), typically by calling 943 * hammer_ip_replace_bulk() (via hammer_io_direct_write()). 944 */ 945 hammer_record_t 946 hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, 947 int *errorp) 948 { 949 hammer_record_t record; 950 int zone; 951 952 /* 953 * Create a record to cover the direct write. The record cannot 954 * be added to the in-memory RB tree here as it might conflict 955 * with an existing memory record. See hammer_io_direct_write(). 956 * 957 * The backend is responsible for finalizing the space reserved in 958 * this record. 959 * 960 * XXX bytes not aligned, depend on the reservation code to 961 * align the reservation. 962 */ 963 record = hammer_alloc_mem_record(ip, 0); 964 zone = (bytes >= HAMMER_BUFSIZE) ? 
HAMMER_ZONE_LARGE_DATA_INDEX : 965 HAMMER_ZONE_SMALL_DATA_INDEX; 966 record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, 967 &record->leaf.data_offset, 968 errorp); 969 if (record->resv == NULL) { 970 kprintf("hammer_ip_add_bulk: reservation failed\n"); 971 hammer_rel_mem_record(record); 972 return(NULL); 973 } 974 record->type = HAMMER_MEM_RECORD_DATA; 975 record->leaf.base.rec_type = HAMMER_RECTYPE_DATA; 976 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 977 record->leaf.base.obj_id = ip->obj_id; 978 record->leaf.base.key = file_offset + bytes; 979 record->leaf.base.localization = ip->obj_localization + 980 HAMMER_LOCALIZE_MISC; 981 record->leaf.data_len = bytes; 982 hammer_crc_set_leaf(data, &record->leaf); 983 KKASSERT(*errorp == 0); 984 985 return(record); 986 } 987 988 /* 989 * Called by hammer_io_direct_write() prior to any possible completion 990 * of the BIO to emplace the memory record associated with the I/O and 991 * to replace any prior memory record which might still be active. 992 * 993 * Setting the FE deleted flag on the old record (if any) avoids any RB 994 * tree insertion conflict, amoung other things. 995 * 996 * This has to be done prior to the caller completing any related buffer 997 * cache I/O or a reinstantiation of the buffer may load data from the 998 * old media location instead of the new media location. The holding 999 * of the locked buffer cache buffer serves to interlock the record 1000 * replacement operation. 
1001 */ 1002 void 1003 hammer_ip_replace_bulk(hammer_mount_t hmp, hammer_record_t record) 1004 { 1005 hammer_record_t conflict; 1006 int error; 1007 1008 while ((conflict = hammer_ip_get_bulk(record)) != NULL) { 1009 if ((conflict->flags & HAMMER_RECF_INTERLOCK_BE) == 0) { 1010 conflict->flags |= HAMMER_RECF_DELETED_FE; 1011 break; 1012 } 1013 conflict->flags |= HAMMER_RECF_WANTED; 1014 tsleep(conflict, 0, "hmrrc3", 0); 1015 hammer_rel_mem_record(conflict); 1016 } 1017 error = hammer_mem_add(record); 1018 if (conflict) 1019 hammer_rel_mem_record(conflict); 1020 KKASSERT(error == 0); 1021 } 1022 1023 /* 1024 * Frontend truncation code. Scan in-memory records only. On-disk records 1025 * and records in a flushing state are handled by the backend. The vnops 1026 * setattr code will handle the block containing the truncation point. 1027 * 1028 * Partial blocks are not deleted. 1029 * 1030 * This code is only called on regular files. 1031 */ 1032 int 1033 hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) 1034 { 1035 struct rec_trunc_info info; 1036 1037 switch(ip->ino_data.obj_type) { 1038 case HAMMER_OBJTYPE_REGFILE: 1039 info.rec_type = HAMMER_RECTYPE_DATA; 1040 break; 1041 case HAMMER_OBJTYPE_DBFILE: 1042 info.rec_type = HAMMER_RECTYPE_DB; 1043 break; 1044 default: 1045 return(EINVAL); 1046 } 1047 info.trunc_off = file_size; 1048 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp, 1049 hammer_frontend_trunc_callback, &info); 1050 return(0); 1051 } 1052 1053 /* 1054 * Scan callback for frontend records to destroy during a truncation. 1055 * We must ensure that DELETED_FE is set on the record or the frontend 1056 * will get confused in future read() calls. 1057 * 1058 * NOTE: DELETED_FE cannot be set while the record interlock (BE) is held. 1059 * In this rare case we must wait for the interlock to be cleared. 1060 * 1061 * NOTE: This function is only called on regular files. 
There are further 1062 * restrictions to the setting of DELETED_FE on directory records 1063 * undergoing a flush due to sensitive inode link count calculations. 1064 */ 1065 static int 1066 hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused) 1067 { 1068 if (record->flags & HAMMER_RECF_DELETED_FE) 1069 return(0); 1070 #if 0 1071 if (record->flush_state == HAMMER_FST_FLUSH) 1072 return(0); 1073 #endif 1074 hammer_ref(&record->lock); 1075 while (record->flags & HAMMER_RECF_INTERLOCK_BE) 1076 hammer_wait_mem_record_ident(record, "hmmtrr"); 1077 record->flags |= HAMMER_RECF_DELETED_FE; 1078 hammer_rel_mem_record(record); 1079 return(0); 1080 } 1081 1082 /* 1083 * Return 1 if the caller must check for and delete existing records 1084 * before writing out a new data record. 1085 * 1086 * Return 0 if the caller can just insert the record into the B-Tree without 1087 * checking. 1088 */ 1089 static int 1090 hammer_record_needs_overwrite_delete(hammer_record_t record) 1091 { 1092 hammer_inode_t ip = record->ip; 1093 int64_t file_offset; 1094 int r; 1095 1096 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) 1097 file_offset = record->leaf.base.key; 1098 else 1099 file_offset = record->leaf.base.key - record->leaf.data_len; 1100 r = (file_offset < ip->save_trunc_off); 1101 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1102 if (ip->save_trunc_off <= record->leaf.base.key) 1103 ip->save_trunc_off = record->leaf.base.key + 1; 1104 } else { 1105 if (ip->save_trunc_off < record->leaf.base.key) 1106 ip->save_trunc_off = record->leaf.base.key; 1107 } 1108 return(r); 1109 } 1110 1111 /* 1112 * Backend code. Sync a record to the media. 
*/
/*
 * hammer_ip_sync_record_cursor() - flush one in-memory record.
 *
 * The record must be in the HAMMER_FST_FLUSH state with the backend
 * interlock held and a non-zero localization (all three are asserted).
 *
 * A HAMMER_MEM_RECORD_DEL record deletes the matching on-disk element.
 * Any other record type is inserted into the B-Tree, after first removing
 * on-disk records it overwrites (DATA and GENERAL records).
 *
 * Returns 0 on success.  EIO is returned if the key to be inserted is
 * already present on-disk, otherwise the error from the failing lookup,
 * delete, allocation or insert is returned.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
	hammer_transaction_t trans = cursor->trans;
	int64_t file_offset;
	int bytes;
	void *bdata;
	int error;
	int doprop;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
	KKASSERT(record->leaf.base.localization != 0);

	/*
	 * Any direct-write related to the record must complete before we
	 * can sync the record to the on-disk media.
	 */
	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
		hammer_io_direct_wait(record);

	/*
	 * If this is a bulk-data record placemarker there may be an existing
	 * record on-disk, indicating a data overwrite.  If there is, the
	 * on-disk record must be deleted before we can insert our new record.
	 *
	 * We've synthesized this record and do not know what the create_tid
	 * on-disk is, nor how much data it represents.
	 *
	 * Keep in mind that (key) for data records is (base_offset + len),
	 * not (base_offset).  Also, we only want to get rid of on-disk
	 * records since we are trying to sync our in-memory record, call
	 * hammer_ip_delete_range() with truncating set to 1 to make sure
	 * it skips in-memory records.
	 *
	 * It is ok for the lookup to return ENOENT.
	 *
	 * NOTE OPTIMIZATION: hammer_record_needs_overwrite_delete() decides
	 * whether hammer_ip_delete_range() has to be called, and updates
	 * ip->save_trunc_off as records are written.
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA &&
	    hammer_record_needs_overwrite_delete(record)) {
		file_offset = record->leaf.base.key - record->leaf.data_len;
		/* round the length up to a whole buffer */
		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
			~HAMMER_BUFMASK;
		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
		error = hammer_ip_delete_range(
				cursor, record->ip,
				file_offset, file_offset + bytes - 1,
				1);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * If this is a general record there may be an on-disk version
	 * that must be deleted before we can insert the new record.
	 */
	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
		error = hammer_delete_general(cursor, record->ip,
					      &record->leaf);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * Setup the cursor.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg = record->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Records can wind up on-media before the inode itself is on-media.
	 * Flag the case.
	 */
	record->ip->flags |= HAMMER_INODE_DONDISK;

	/*
	 * If we are deleting a directory entry an exact match must be
	 * found on-disk.
	 *
	 * This path does not take the sync lock, so it exits through
	 * 'done' rather than 'done_unlock'.
	 */
	if (record->type == HAMMER_MEM_RECORD_DEL) {
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			KKASSERT(cursor->iprec == NULL);
			error = hammer_ip_delete_record(cursor, record->ip,
							trans->tid);
			if (error == 0) {
				record->flags |= HAMMER_RECF_DELETED_BE |
						 HAMMER_RECF_COMMITTED;
				++record->ip->rec_generation;
			}
		}
		goto done;
	}

	/*
	 * We are inserting.
	 *
	 * Issue a lookup to position the cursor and locate the insertion
	 * point.  The target key should not exist.  If we are creating a
	 * directory entry we may have to iterate the low 32 bits of the
	 * key to find an unused key.
	 *
	 * A successful lookup means the key is already present; that is
	 * reported and turned into EIO.  ENOENT is the expected result.
	 */
	hammer_sync_lock_sh(trans);
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (hammer_debug_inode)
		kprintf("DOINSERT LOOKUP %d\n", error);
	if (error == 0) {
		kprintf("hammer_ip_sync_record: duplicate rec "
			"at (%016llx)\n", (long long)record->leaf.base.key);
		if (hammer_debug_critical)
			Debugger("duplicate record1");
		error = EIO;
	}
#if 0
	if (record->type == HAMMER_MEM_RECORD_DATA)
		kprintf("sync_record %016llx ---------------- %016llx %d\n",
			record->leaf.base.key - record->leaf.data_len,
			record->leaf.data_offset, error);
#endif

	if (error != ENOENT)
		goto done_unlock;

	/*
	 * Allocate the record and data.  The result buffers will be
	 * marked as being modified and further calls to
	 * hammer_modify_buffer() will result in unneeded UNDO records.
	 *
	 * Support zero-fill records (data == NULL and data_len != 0)
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA) {
		/*
		 * The data portion of a bulk-data record has already been
		 * committed to disk, we need only adjust the layer2
		 * statistics in the same transaction as our B-Tree insert.
		 *
		 * NOTE(review): the value returned here is overwritten by
		 * the hammer_btree_insert() call below without being
		 * tested -- confirm that this is intended.
		 */
		KKASSERT(record->leaf.data_offset != 0);
		error = hammer_blockmap_finalize(trans,
						 record->resv,
						 record->leaf.data_offset,
						 record->leaf.data_len);
	} else if (record->data && record->leaf.data_len) {
		/*
		 * Wholly cached record, with data.  Allocate the data,
		 * set the CRC in the leaf and copy the data into the
		 * media buffer.
		 */
		bdata = hammer_alloc_data(trans, record->leaf.data_len,
					  record->leaf.base.rec_type,
					  &record->leaf.data_offset,
					  &cursor->data_buffer,
					  0, &error);
		if (bdata == NULL)
			goto done_unlock;
		hammer_crc_set_leaf(record->data, &record->leaf);
		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
		bcopy(record->data, bdata, record->leaf.data_len);
		hammer_modify_buffer_done(cursor->data_buffer);
	} else {
		/*
		 * Wholly cached record, without data.
		 */
		record->leaf.data_offset = 0;
		record->leaf.data_crc = 0;
	}

	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
	if (hammer_debug_inode && error) {
		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
			error,
			(long long)cursor->node->node_offset,
			cursor->index,
			(long long)record->leaf.base.key);
	}

	/*
	 * Our record is on-disk and we normally mark the in-memory version
	 * as having been committed (and not BE-deleted).
	 *
	 * If the record represented a directory deletion but we had to
	 * sync a valid directory entry to disk due to dependencies,
	 * we must convert the record to a covering delete so the
	 * frontend does not have visibility on the synced entry.
	 *
	 * WARNING: cursor's leaf pointer may have changed after do_propagation
	 *	    returns!
	 */
	if (error == 0) {
		if (doprop) {
			hammer_btree_do_propagation(cursor,
						    record->ip->pfsm,
						    &record->leaf);
		}
		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
			/*
			 * Must convert deleted directory entry add
			 * to a directory entry delete.
			 */
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags &= ~HAMMER_RECF_DELETED_FE;
			record->type = HAMMER_MEM_RECORD_DEL;
			KKASSERT(record->ip->obj_id == record->leaf.base.obj_id);
			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
			KKASSERT((record->flags & (HAMMER_RECF_COMMITTED |
						 HAMMER_RECF_DELETED_BE)) == 0);
			/* converted record is not yet committed */
			/* hammer_flush_record_done takes care of the rest */
		} else {
			/*
			 * Everything went fine and we are now done with
			 * this record.
			 */
			record->flags |= HAMMER_RECF_COMMITTED;
			++record->ip->rec_generation;
		}
	} else {
		/*
		 * The insert failed, give back any data space the leaf
		 * refers to.
		 */
		if (record->leaf.data_offset) {
			hammer_blockmap_free(trans, record->leaf.data_offset,
					     record->leaf.data_len);
		}
	}
done_unlock:
	hammer_sync_unlock(trans);
done:
	return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If the insertion collides with an existing record
 * the new record is marked DELETED_FE, released, and EEXIST is returned.
 *
 * No copy of record->data is made here.  If record->data is set the
 * record is asserted to carry HAMMER_RECF_ALLOCDATA.
 *
 * Returns 0 on success or EEXIST on a key collision.
 */
int
hammer_mem_add(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Any data attached to the record must be flagged ALLOCDATA.
	 */
	if (record->data)
		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

	/*
	 * Insert into the RB tree.  A unique key should have already
	 * been selected if this is a directory entry.
	 */
	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
		return (EEXIST);
	}

	/*
	 * Account for the new record on the mount and the inode, flag it
	 * as being on the RB tree, and mark the inode XDIRTY.
	 */
	++hmp->count_newrecords;
	++hmp->rsv_recs;
	++record->ip->rsv_recs;
	record->ip->hmp->rsv_databytes += record->leaf.data_len;
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(NULL, record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}

/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * Returns 0 with cursor->leaf resolved on success, ENOENT if the record
 * was not found, or another error from the lookup/extract.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	KKASSERT(cursor->ip);
	error = hammer_mem_lookup(cursor);
	if (error == 0) {
		cursor->leaf = &cursor->iprec->leaf;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 * Otherwise the ENOENT from the memory lookup is returned.
	 */
	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	return(error);
}

/*
 * Helper for hammer_ip_first()/hammer_ip_next()
 *
 * NOTE: Both ATEDISK and DISKEOF will be set the same.
This sets up 1436 * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek 1437 * state if hammer_ip_next() needs to re-seek. 1438 */ 1439 static __inline 1440 int 1441 _hammer_ip_seek_btree(hammer_cursor_t cursor) 1442 { 1443 hammer_inode_t ip = cursor->ip; 1444 int error; 1445 1446 if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { 1447 error = hammer_btree_lookup(cursor); 1448 if (error == ENOENT || error == EDEADLK) { 1449 if (hammer_debug_general & 0x2000) { 1450 kprintf("error %d node %p %016llx index %d\n", 1451 error, cursor->node, 1452 (long long)cursor->node->node_offset, 1453 cursor->index); 1454 } 1455 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1456 error = hammer_btree_iterate(cursor); 1457 } 1458 if (error == 0) { 1459 cursor->flags &= ~(HAMMER_CURSOR_DISKEOF | 1460 HAMMER_CURSOR_ATEDISK); 1461 } else { 1462 cursor->flags |= HAMMER_CURSOR_DISKEOF | 1463 HAMMER_CURSOR_ATEDISK; 1464 if (error == ENOENT) 1465 error = 0; 1466 } 1467 } else { 1468 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; 1469 error = 0; 1470 } 1471 return(error); 1472 } 1473 1474 /* 1475 * Helper for hammer_ip_next() 1476 * 1477 * The caller has determined that the media cursor is further along than the 1478 * memory cursor and must be reseeked after a generation number change. 1479 */ 1480 static 1481 int 1482 _hammer_ip_reseek(hammer_cursor_t cursor) 1483 { 1484 struct hammer_base_elm save; 1485 hammer_btree_elm_t elm; 1486 int error; 1487 int r; 1488 int again = 0; 1489 1490 /* 1491 * Do the re-seek. 
1492 */ 1493 kprintf("HAMMER: Debug: re-seeked during scan @ino=%016llx\n", 1494 (long long)cursor->ip->obj_id); 1495 save = cursor->key_beg; 1496 cursor->key_beg = cursor->iprec->leaf.base; 1497 error = _hammer_ip_seek_btree(cursor); 1498 KKASSERT(error == 0); 1499 cursor->key_beg = save; 1500 1501 /* 1502 * If the memory record was previous returned to 1503 * the caller and the media record matches 1504 * (-1/+1: only create_tid differs), then iterate 1505 * the media record to avoid a double result. 1506 */ 1507 if ((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0 && 1508 (cursor->flags & HAMMER_CURSOR_LASTWASMEM)) { 1509 elm = &cursor->node->ondisk->elms[cursor->index]; 1510 r = hammer_btree_cmp(&elm->base, 1511 &cursor->iprec->leaf.base); 1512 if (cursor->flags & HAMMER_CURSOR_ASOF) { 1513 if (r >= -1 && r <= 1) { 1514 kprintf("HAMMER: Debug: iterated after " 1515 "re-seek (asof r=%d)\n", r); 1516 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1517 again = 1; 1518 } 1519 } else { 1520 if (r == 0) { 1521 kprintf("HAMMER: Debug: iterated after " 1522 "re-seek\n"); 1523 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1524 again = 1; 1525 } 1526 } 1527 } 1528 return(again); 1529 } 1530 1531 /* 1532 * Locate the first record within the cursor's key_beg/key_end range, 1533 * restricted to a particular inode. 0 is returned on success, ENOENT 1534 * if no records matched the requested range, or some other error. 1535 * 1536 * When 0 is returned hammer_ip_next() may be used to iterate additional 1537 * records within the requested range. 1538 * 1539 * This function can return EDEADLK, requiring the caller to terminate 1540 * the cursor and try again. 
*/

int
hammer_ip_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip __debugvar = cursor->ip;
	int error;

	KKASSERT(ip != NULL);

	/*
	 * Clean up fields and setup for merged scan
	 */
	cursor->flags &= ~HAMMER_CURSOR_RETEST;

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 *
	 * This function will setup both ATEMEM and MEMEOF properly for
	 * the ip iteration.  ATEMEM will be set if MEMEOF is set.
	 */
	hammer_mem_first(cursor);

	/*
	 * Detect generation changes during blockages, including
	 * blockages which occur on the initial btree search.
	 */
	cursor->rec_generation = cursor->ip->rec_generation;

	/*
	 * Initial search and result
	 */
	error = _hammer_ip_seek_btree(cursor);
	if (error == 0)
		error = hammer_ip_next(cursor);

	return (error);
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * There are numerous special cases in this code to deal with races between
 * in-memory records and on-media records.
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 *
 * On success cursor->leaf refers to the selected element.  The ATEDISK /
 * ATEMEM flags record which side was consumed and LASTWASMEM records
 * whether the element came from the in-memory tree.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	hammer_record_t tmprec;
	int error;
	int r;

again:
	/*
	 * Get the next on-disk record
	 *
	 * NOTE: If we deleted the last on-disk record we had scanned
	 *	 ATEDISK will be clear and RETEST will be set, forcing
	 *	 a call to iterate.  The fact that ATEDISK is clear causes
	 *	 iterate to re-test the 'current' element.  If ATEDISK is
	 *	 set, iterate will skip the 'current' element.
	 */
	error = 0;
	if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
		if (cursor->flags & (HAMMER_CURSOR_ATEDISK |
				     HAMMER_CURSOR_RETEST)) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_RETEST;
			if (error == 0) {
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
				hammer_cache_node(&cursor->ip->cache[1],
						  cursor->node);
			} else if (error == ENOENT) {
				/* media side exhausted, not an error */
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
				error = 0;
			}
		}
	}

	/*
	 * If the generation changed the backend has deleted or committed
	 * one or more memory records since our last check.
	 *
	 * When this case occurs if the disk cursor is > current memory record
	 * or the disk cursor is at EOF, we must re-seek the disk-cursor.
	 * Since the cursor is ahead it must have not yet been eaten (if
	 * not at eof anyway). (XXX data offset case?)
	 *
	 * NOTE: we are not doing a full check here.  That will be handled
	 * later on.
	 *
	 * If we have exhausted all memory records we do not have to do any
	 * further seeks.
	 */
	while (cursor->rec_generation != cursor->ip->rec_generation &&
	       error == 0
	) {
		kprintf("HAMMER: Debug: generation changed during scan @ino=%016llx\n", (long long)cursor->ip->obj_id);
		cursor->rec_generation = cursor->ip->rec_generation;
		if (cursor->flags & HAMMER_CURSOR_MEMEOF)
			break;
		if (cursor->flags & HAMMER_CURSOR_DISKEOF) {
			/* treat EOF as being ahead of the memory record */
			r = 1;
		} else {
			KKASSERT((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0);
			elm = &cursor->node->ondisk->elms[cursor->index];
			r = hammer_btree_cmp(&elm->base,
					     &cursor->iprec->leaf.base);
		}

		/*
		 * Do we re-seek the media cursor?
		 */
		if (r > 0) {
			if (_hammer_ip_reseek(cursor))
				goto again;
		}
	}

	/*
	 * We can now safely get the next in-memory record.  We cannot
	 * block here.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	tmprec = NULL;
	if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		/*
		 * If the current memory record was eaten then get the next
		 * one.  Stale records are skipped.
		 *
		 * The previous record is parked in tmprec and released
		 * further below.  The scan stops as soon as either helper
		 * returns non-zero; cursor->iprec is then tested to tell
		 * whether a new record was selected.
		 */
		if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
			tmprec = cursor->iprec;
			cursor->iprec = NULL;
			rec = hammer_rec_rb_tree_RB_NEXT(tmprec);
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		}
	}

	/*
	 * MEMORY RECORD VALIDITY TEST
	 *
	 * (We still can't block, which is why tmprec is being held so
	 * long).
	 *
	 * If the memory record is no longer valid we skip it.  It may
	 * have been deleted by the frontend.  If it was deleted or
	 * committed by the backend the generation change re-seeked the
	 * disk cursor and the record will be present there.
	 */
	if (error == 0 && (cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		KKASSERT(cursor->iprec);
		KKASSERT((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0);
		if (!hammer_ip_iterate_mem_good(cursor, cursor->iprec)) {
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
			if (tmprec)
				hammer_rel_mem_record(tmprec);
			goto again;
		}
	}
	if (tmprec)
		hammer_rel_mem_record(tmprec);

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 *
	 * NOTE(review): error is reset to 0 here, so a non-ENOENT error
	 * from hammer_btree_iterate() above does not reach the caller
	 * through this variable -- confirm that this is intended.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.  Compare the entries and nominally
		 * return the first one in the sort order.  Numerous cases
		 * require special attention, however.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);

		/*
		 * If the two entries differ only by their key (-2/2) or
		 * create_tid (-1/1), and are DATA records, we may have a
		 * nominal match.  We have to calculate the base file
		 * offset of the data.
		 */
		if (r <= 2 && r >= -2 && r != 0 &&
		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
			int64_t base2 = cursor->iprec->leaf.base.key -
					cursor->iprec->leaf.data_len;
			if (base1 == base2)
				r = 0;
		}

		/*
		 * The media element sorts first, return it.
		 */
		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
			break;
		}

		/*
		 * If the entries match exactly the memory entry is either
		 * an on-disk directory entry deletion or a bulk data
		 * overwrite.  If it is a directory entry deletion we eat
		 * both entries.
		 *
		 * For the bulk-data overwrite case it is possible to have
		 * visibility into both, which simply means the syncer
		 * hasn't gotten around to doing the delete+insert sequence
		 * on the B-Tree.  Use the memory entry and throw away the
		 * on-disk entry.
		 *
		 * If the in-memory record is not either of these we
		 * probably caught the syncer while it was syncing it to
		 * the media.  Since we hold a shared lock on the cursor,
		 * the in-memory record had better be marked deleted at
		 * this point.
		 */
		if (r == 0) {
			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
					cursor->flags |= HAMMER_CURSOR_ATEMEM;
					goto again;
				}
			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
				}
				/* fall through to memory entry */
			} else {
				/*
				 * NOTE(review): the two statements after
				 * panic() only execute if panic() returns.
				 */
				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags);
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				goto again;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.
		 */
		cursor->leaf = &cursor->iprec->leaf;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		cursor->flags |= HAMMER_CURSOR_LASTWASMEM;

		/*
		 * If the memory entry is an on-disk deletion we should have
		 * also had found a B-Tree record.  If the backend beat us
		 * to it it would have interlocked the cursor and we should
		 * have seen the in-memory record marked DELETED_FE.
		 */
		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags);
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		cursor->leaf = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 *
 * Returns 0 on success or the error from reading/extracting the data.
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * The data associated with an in-memory record is usually
		 * kmalloced, but reserve-ahead data records will have an
		 * on-disk reference.
		 *
		 * NOTE: Reserve-ahead data records must be handled in the
		 * context of the related high level buffer cache buffer
		 * to interlock against async writes.
		 */
		record = cursor->iprec;
		cursor->data = record->data;
		error = 0;
		if (cursor->data == NULL) {
			/*
			 * No in-memory copy, read the data from the media
			 * location recorded in the leaf.  Only DATA
			 * records are expected here (asserted).
			 */
			KKASSERT(record->leaf.base.rec_type ==
				 HAMMER_RECTYPE_DATA);
			cursor->data = hammer_bread_ext(cursor->trans->hmp,
						    record->leaf.data_offset,
						    record->leaf.data_len,
						    &error,
						    &cursor->data_buffer);
		}
	} else {
		/*
		 * On-media element, point the leaf at the current B-Tree
		 * element and extract its data.
		 */
		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
	}
	return(error);
}

/*
 * Backend truncation / record replacement - delete records in range.
 *
 * Delete all records within the specified range for inode ip.  In-memory
 * records still associated with the frontend are ignored.
 *
 * If truncating is non-zero in-memory records associated with the back-end
 * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
 *
 * NOTES:
 *
 *	* An unaligned range will cause new records to be added to cover
 *	  the edge cases. (XXX not implemented yet).
 *
 *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
 *	  also do not deal with unaligned ranges.
 *
 *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 *	* Record keys for regular file data have to be special-cased since
 *	  they indicate the end of the range (key = base + bytes).
 *
 *	* This function may be asked to delete ridiculously huge ranges, for
 *	  example if someone truncates or removes a 1TB regular file.  We
 *	  must be very careful on restarts and we may have to stop w/
 *	  EWOULDBLOCK to avoid blowing out the buffer cache.
1910 */ 1911 int 1912 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip, 1913 int64_t ran_beg, int64_t ran_end, int truncating) 1914 { 1915 hammer_transaction_t trans = cursor->trans; 1916 hammer_btree_leaf_elm_t leaf; 1917 int error; 1918 int64_t off; 1919 int64_t tmp64; 1920 1921 #if 0 1922 kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end); 1923 #endif 1924 1925 KKASSERT(trans->type == HAMMER_TRANS_FLS); 1926 retry: 1927 hammer_normalize_cursor(cursor); 1928 cursor->key_beg.localization = ip->obj_localization + 1929 HAMMER_LOCALIZE_MISC; 1930 cursor->key_beg.obj_id = ip->obj_id; 1931 cursor->key_beg.create_tid = 0; 1932 cursor->key_beg.delete_tid = 0; 1933 cursor->key_beg.obj_type = 0; 1934 1935 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1936 cursor->key_beg.key = ran_beg; 1937 cursor->key_beg.rec_type = HAMMER_RECTYPE_DB; 1938 } else { 1939 /* 1940 * The key in the B-Tree is (base+bytes), so the first possible 1941 * matching key is ran_beg + 1. 1942 */ 1943 cursor->key_beg.key = ran_beg + 1; 1944 cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA; 1945 } 1946 1947 cursor->key_end = cursor->key_beg; 1948 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1949 cursor->key_end.key = ran_end; 1950 } else { 1951 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */ 1952 if (tmp64 < ran_end) 1953 cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL; 1954 else 1955 cursor->key_end.key = ran_end + MAXPHYS + 1; 1956 } 1957 1958 cursor->asof = ip->obj_asof; 1959 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1960 cursor->flags |= HAMMER_CURSOR_ASOF; 1961 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 1962 cursor->flags |= HAMMER_CURSOR_BACKEND; 1963 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE; 1964 1965 error = hammer_ip_first(cursor); 1966 1967 /* 1968 * Iterate through matching records and mark them as deleted. 
1969 */ 1970 while (error == 0) { 1971 leaf = cursor->leaf; 1972 1973 KKASSERT(leaf->base.delete_tid == 0); 1974 KKASSERT(leaf->base.obj_id == ip->obj_id); 1975 1976 /* 1977 * There may be overlap cases for regular file data. Also 1978 * remember the key for a regular file record is (base + len), 1979 * NOT (base). 1980 * 1981 * Note that do to duplicates (mem & media) allowed by 1982 * DELETE_VISIBILITY, off can wind up less then ran_beg. 1983 */ 1984 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 1985 off = leaf->base.key - leaf->data_len; 1986 /* 1987 * Check the left edge case. We currently do not 1988 * split existing records. 1989 */ 1990 if (off < ran_beg && leaf->base.key > ran_beg) { 1991 panic("hammer left edge case %016llx %d\n", 1992 (long long)leaf->base.key, 1993 leaf->data_len); 1994 } 1995 1996 /* 1997 * Check the right edge case. Note that the 1998 * record can be completely out of bounds, which 1999 * terminates the search. 2000 * 2001 * base->key is exclusive of the right edge while 2002 * ran_end is inclusive of the right edge. The 2003 * (key - data_len) left boundary is inclusive. 2004 * 2005 * XXX theory-check this test at some point, are 2006 * we missing a + 1 somewhere? Note that ran_end 2007 * could overflow. 2008 */ 2009 if (leaf->base.key - 1 > ran_end) { 2010 if (leaf->base.key - leaf->data_len > ran_end) 2011 break; 2012 panic("hammer right edge case\n"); 2013 } 2014 } else { 2015 off = leaf->base.key; 2016 } 2017 2018 /* 2019 * Delete the record. When truncating we do not delete 2020 * in-memory (data) records because they represent data 2021 * written after the truncation. 2022 * 2023 * This will also physically destroy the B-Tree entry and 2024 * data if the retention policy dictates. The function 2025 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2026 * to retest the new 'current' element. 
2027 */ 2028 if (truncating == 0 || hammer_cursor_ondisk(cursor)) { 2029 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2030 /* 2031 * If we have built up too many meta-buffers we risk 2032 * deadlocking the kernel and must stop. This can 2033 * occur when deleting ridiculously huge files. 2034 * sync_trunc_off is updated so the next cycle does 2035 * not re-iterate records we have already deleted. 2036 * 2037 * This is only done with formal truncations. 2038 */ 2039 if (truncating > 1 && error == 0 && 2040 hammer_flusher_meta_limit(ip->hmp)) { 2041 ip->sync_trunc_off = off; 2042 error = EWOULDBLOCK; 2043 } 2044 } 2045 if (error) 2046 break; 2047 ran_beg = off; /* for restart */ 2048 error = hammer_ip_next(cursor); 2049 } 2050 if (cursor->node) 2051 hammer_cache_node(&ip->cache[1], cursor->node); 2052 2053 if (error == EDEADLK) { 2054 hammer_done_cursor(cursor); 2055 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2056 if (error == 0) 2057 goto retry; 2058 } 2059 if (error == ENOENT) 2060 error = 0; 2061 return(error); 2062 } 2063 2064 /* 2065 * This backend function deletes the specified record on-disk, similar to 2066 * delete_range but for a specific record. Unlike the exact deletions 2067 * used when deleting a directory entry this function uses an ASOF search 2068 * like delete_range. 2069 * 2070 * This function may be called with ip->obj_asof set for a slave snapshot, 2071 * so don't use it. We always delete non-historical records only. 
2072 */ 2073 static int 2074 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 2075 hammer_btree_leaf_elm_t leaf) 2076 { 2077 hammer_transaction_t trans = cursor->trans; 2078 int error; 2079 2080 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2081 retry: 2082 hammer_normalize_cursor(cursor); 2083 cursor->key_beg = leaf->base; 2084 cursor->asof = HAMMER_MAX_TID; 2085 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2086 cursor->flags |= HAMMER_CURSOR_ASOF; 2087 cursor->flags |= HAMMER_CURSOR_BACKEND; 2088 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2089 2090 error = hammer_btree_lookup(cursor); 2091 if (error == 0) { 2092 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2093 } 2094 if (error == EDEADLK) { 2095 hammer_done_cursor(cursor); 2096 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2097 if (error == 0) 2098 goto retry; 2099 } 2100 return(error); 2101 } 2102 2103 /* 2104 * This function deletes remaining auxillary records when an inode is 2105 * being deleted. This function explicitly does not delete the 2106 * inode record, directory entry, data, or db records. Those must be 2107 * properly disposed of prior to this call. 
2108 */ 2109 int 2110 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp) 2111 { 2112 hammer_transaction_t trans = cursor->trans; 2113 hammer_btree_leaf_elm_t leaf; 2114 int error; 2115 2116 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2117 retry: 2118 hammer_normalize_cursor(cursor); 2119 cursor->key_beg.localization = ip->obj_localization + 2120 HAMMER_LOCALIZE_MISC; 2121 cursor->key_beg.obj_id = ip->obj_id; 2122 cursor->key_beg.create_tid = 0; 2123 cursor->key_beg.delete_tid = 0; 2124 cursor->key_beg.obj_type = 0; 2125 cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START; 2126 cursor->key_beg.key = HAMMER_MIN_KEY; 2127 2128 cursor->key_end = cursor->key_beg; 2129 cursor->key_end.rec_type = HAMMER_RECTYPE_MAX; 2130 cursor->key_end.key = HAMMER_MAX_KEY; 2131 2132 cursor->asof = ip->obj_asof; 2133 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2134 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2135 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 2136 cursor->flags |= HAMMER_CURSOR_BACKEND; 2137 2138 error = hammer_ip_first(cursor); 2139 2140 /* 2141 * Iterate through matching records and mark them as deleted. 2142 */ 2143 while (error == 0) { 2144 leaf = cursor->leaf; 2145 2146 KKASSERT(leaf->base.delete_tid == 0); 2147 2148 /* 2149 * Mark the record and B-Tree entry as deleted. This will 2150 * also physically delete the B-Tree entry, record, and 2151 * data if the retention policy dictates. The function 2152 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2153 * to retest the new 'current' element. 2154 * 2155 * Directory entries (and delete-on-disk directory entries) 2156 * must be synced and cannot be deleted. 
2157 */ 2158 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2159 ++*countp; 2160 if (error) 2161 break; 2162 error = hammer_ip_next(cursor); 2163 } 2164 if (cursor->node) 2165 hammer_cache_node(&ip->cache[1], cursor->node); 2166 if (error == EDEADLK) { 2167 hammer_done_cursor(cursor); 2168 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2169 if (error == 0) 2170 goto retry; 2171 } 2172 if (error == ENOENT) 2173 error = 0; 2174 return(error); 2175 } 2176 2177 /* 2178 * Delete the record at the current cursor. On success the cursor will 2179 * be positioned appropriately for an iteration but may no longer be at 2180 * a leaf node. 2181 * 2182 * This routine is only called from the backend. 2183 * 2184 * NOTE: This can return EDEADLK, requiring the caller to terminate the 2185 * cursor and retry. 2186 */ 2187 int 2188 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip, 2189 hammer_tid_t tid) 2190 { 2191 hammer_record_t iprec; 2192 hammer_mount_t hmp; 2193 int error; 2194 2195 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 2196 KKASSERT(tid != 0); 2197 hmp = cursor->node->hmp; 2198 2199 /* 2200 * In-memory (unsynchronized) records can simply be freed. This 2201 * only occurs in range iterations since all other records are 2202 * individually synchronized. Thus there should be no confusion with 2203 * the interlock. 2204 * 2205 * An in-memory record may be deleted before being committed to disk, 2206 * but could have been accessed in the mean time. The reservation 2207 * code will deal with the case. 2208 */ 2209 if (hammer_cursor_inmem(cursor)) { 2210 iprec = cursor->iprec; 2211 KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); 2212 iprec->flags |= HAMMER_RECF_DELETED_FE; 2213 iprec->flags |= HAMMER_RECF_DELETED_BE; 2214 KKASSERT(iprec->ip == ip); 2215 ++ip->rec_generation; 2216 return(0); 2217 } 2218 2219 /* 2220 * On-disk records are marked as deleted by updating their delete_tid. 
2221 * This does not effect their position in the B-Tree (which is based 2222 * on their create_tid). 2223 * 2224 * Frontend B-Tree operations track inodes so we tell 2225 * hammer_delete_at_cursor() not to. 2226 */ 2227 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); 2228 2229 if (error == 0) { 2230 error = hammer_delete_at_cursor( 2231 cursor, 2232 HAMMER_DELETE_ADJUST | hammer_nohistory(ip), 2233 cursor->trans->tid, 2234 cursor->trans->time32, 2235 0, NULL); 2236 } 2237 return(error); 2238 } 2239 2240 /* 2241 * Used to write a generic record w/optional data to the media b-tree 2242 * when no inode context is available. Used by the mirroring and 2243 * snapshot code. 2244 * 2245 * Caller must set cursor->key_beg to leaf->base. The cursor must be 2246 * flagged for backend operation and not flagged ASOF (since we are 2247 * doing an insertion). 2248 * 2249 * This function will acquire the appropriate sync lock and will set 2250 * the cursor insertion flag for the operation, do the btree lookup, 2251 * and the insertion, and clear the insertion flag and sync lock before 2252 * returning. The cursor state will be such that the caller can continue 2253 * scanning (used by the mirroring code). 2254 * 2255 * mode: HAMMER_CREATE_MODE_UMIRROR copyin data, check crc 2256 * HAMMER_CREATE_MODE_SYS bcopy data, generate crc 2257 * 2258 * NOTE: EDEADLK can be returned. The caller must do deadlock handling and 2259 * retry. 2260 * 2261 * EALREADY can be returned if the record already exists (WARNING, 2262 * because ASOF cannot be used no check is made for illegal 2263 * duplicates). 2264 * 2265 * NOTE: Do not use the function for normal inode-related records as this 2266 * functions goes directly to the media and is not integrated with 2267 * in-memory records. 
2268 */ 2269 int 2270 hammer_create_at_cursor(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf, 2271 void *udata, int mode) 2272 { 2273 hammer_transaction_t trans; 2274 hammer_buffer_t data_buffer; 2275 hammer_off_t ndata_offset; 2276 hammer_tid_t high_tid; 2277 void *ndata; 2278 int error; 2279 int doprop; 2280 2281 trans = cursor->trans; 2282 data_buffer = NULL; 2283 ndata_offset = 0; 2284 doprop = 0; 2285 2286 KKASSERT((cursor->flags & 2287 (HAMMER_CURSOR_BACKEND | HAMMER_CURSOR_ASOF)) == 2288 (HAMMER_CURSOR_BACKEND)); 2289 2290 hammer_sync_lock_sh(trans); 2291 2292 if (leaf->data_len) { 2293 ndata = hammer_alloc_data(trans, leaf->data_len, 2294 leaf->base.rec_type, 2295 &ndata_offset, &data_buffer, 2296 0, &error); 2297 if (ndata == NULL) { 2298 hammer_sync_unlock(trans); 2299 return (error); 2300 } 2301 leaf->data_offset = ndata_offset; 2302 hammer_modify_buffer(trans, data_buffer, NULL, 0); 2303 2304 switch(mode) { 2305 case HAMMER_CREATE_MODE_UMIRROR: 2306 error = copyin(udata, ndata, leaf->data_len); 2307 if (error == 0) { 2308 if (hammer_crc_test_leaf(ndata, leaf) == 0) { 2309 kprintf("data crc mismatch on pipe\n"); 2310 error = EINVAL; 2311 } else { 2312 error = hammer_cursor_localize_data( 2313 ndata, leaf); 2314 } 2315 } 2316 break; 2317 case HAMMER_CREATE_MODE_SYS: 2318 bcopy(udata, ndata, leaf->data_len); 2319 error = 0; 2320 hammer_crc_set_leaf(ndata, leaf); 2321 break; 2322 default: 2323 panic("hammer: hammer_create_at_cursor: bad mode %d", 2324 mode); 2325 break; /* NOT REACHED */ 2326 } 2327 hammer_modify_buffer_done(data_buffer); 2328 } else { 2329 leaf->data_offset = 0; 2330 error = 0; 2331 ndata = NULL; 2332 } 2333 if (error) 2334 goto failed; 2335 2336 /* 2337 * Do the insertion. 
This can fail with a EDEADLK or EALREADY 2338 */ 2339 cursor->flags |= HAMMER_CURSOR_INSERT; 2340 error = hammer_btree_lookup(cursor); 2341 if (error != ENOENT) { 2342 if (error == 0) 2343 error = EALREADY; 2344 goto failed; 2345 } 2346 error = hammer_btree_insert(cursor, leaf, &doprop); 2347 2348 /* 2349 * Cursor is left on current element, we want to skip it now. 2350 * (in case the caller is scanning) 2351 */ 2352 cursor->flags |= HAMMER_CURSOR_ATEDISK; 2353 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2354 2355 /* 2356 * If the insertion happens to be creating (and not just replacing) 2357 * an inode we have to track it. 2358 */ 2359 if (error == 0 && 2360 leaf->base.rec_type == HAMMER_RECTYPE_INODE && 2361 leaf->base.delete_tid == 0) { 2362 hammer_modify_volume_field(trans, trans->rootvol, 2363 vol0_stat_inodes); 2364 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes; 2365 hammer_modify_volume_done(trans->rootvol); 2366 } 2367 2368 /* 2369 * vol0_next_tid must track the highest TID stored in the filesystem. 2370 * We do not need to generate undo for this update. 2371 */ 2372 high_tid = leaf->base.create_tid; 2373 if (high_tid < leaf->base.delete_tid) 2374 high_tid = leaf->base.delete_tid; 2375 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) { 2376 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2377 trans->rootvol->ondisk->vol0_next_tid = high_tid; 2378 hammer_modify_volume_done(trans->rootvol); 2379 } 2380 2381 /* 2382 * WARNING! cursor's leaf pointer may have changed after 2383 * do_propagation returns. 
2384 */ 2385 if (error == 0 && doprop) 2386 hammer_btree_do_propagation(cursor, NULL, leaf); 2387 2388 failed: 2389 /* 2390 * Cleanup 2391 */ 2392 if (error && leaf->data_offset) { 2393 hammer_blockmap_free(trans, leaf->data_offset, leaf->data_len); 2394 2395 } 2396 hammer_sync_unlock(trans); 2397 if (data_buffer) 2398 hammer_rel_buffer(data_buffer, 0); 2399 return (error); 2400 } 2401 2402 /* 2403 * Delete the B-Tree element at the current cursor and do any necessary 2404 * mirror propagation. 2405 * 2406 * The cursor must be properly positioned for an iteration on return but 2407 * may be pointing at an internal element. 2408 * 2409 * An element can be un-deleted by passing a delete_tid of 0 with 2410 * HAMMER_DELETE_ADJUST. 2411 */ 2412 int 2413 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, 2414 hammer_tid_t delete_tid, u_int32_t delete_ts, 2415 int track, int64_t *stat_bytes) 2416 { 2417 struct hammer_btree_leaf_elm save_leaf; 2418 hammer_transaction_t trans; 2419 hammer_btree_leaf_elm_t leaf; 2420 hammer_node_t node; 2421 hammer_btree_elm_t elm; 2422 hammer_off_t data_offset; 2423 int32_t data_len; 2424 u_int16_t rec_type; 2425 int error; 2426 int icount; 2427 int doprop; 2428 2429 error = hammer_cursor_upgrade(cursor); 2430 if (error) 2431 return(error); 2432 2433 trans = cursor->trans; 2434 node = cursor->node; 2435 elm = &node->ondisk->elms[cursor->index]; 2436 leaf = &elm->leaf; 2437 KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD); 2438 2439 hammer_sync_lock_sh(trans); 2440 doprop = 0; 2441 icount = 0; 2442 2443 /* 2444 * Adjust the delete_tid. Update the mirror_tid propagation field 2445 * as well. delete_tid can be 0 (undelete -- used by mirroring). 
2446 */ 2447 if (delete_flags & HAMMER_DELETE_ADJUST) { 2448 if (elm->base.rec_type == HAMMER_RECTYPE_INODE) { 2449 if (elm->leaf.base.delete_tid == 0 && delete_tid) 2450 icount = -1; 2451 if (elm->leaf.base.delete_tid && delete_tid == 0) 2452 icount = 1; 2453 } 2454 2455 hammer_modify_node(trans, node, elm, sizeof(*elm)); 2456 elm->leaf.base.delete_tid = delete_tid; 2457 elm->leaf.delete_ts = delete_ts; 2458 hammer_modify_node_done(node); 2459 2460 if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) { 2461 hammer_modify_node_field(trans, node, mirror_tid); 2462 node->ondisk->mirror_tid = elm->leaf.base.delete_tid; 2463 hammer_modify_node_done(node); 2464 doprop = 1; 2465 if (hammer_debug_general & 0x0002) { 2466 kprintf("delete_at_cursor: propagate %016llx" 2467 " @%016llx\n", 2468 (long long)elm->leaf.base.delete_tid, 2469 (long long)node->node_offset); 2470 } 2471 } 2472 2473 /* 2474 * Adjust for the iteration. We have deleted the current 2475 * element and want to clear ATEDISK so the iteration does 2476 * not skip the element after, which now becomes the current 2477 * element. This element must be re-tested if doing an 2478 * iteration, which is handled by the RETEST flag. 2479 */ 2480 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2481 cursor->flags |= HAMMER_CURSOR_RETEST; 2482 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2483 } 2484 2485 /* 2486 * An on-disk record cannot have the same delete_tid 2487 * as its create_tid. In a chain of record updates 2488 * this could result in a duplicate record. 2489 */ 2490 KKASSERT(elm->leaf.base.delete_tid != 2491 elm->leaf.base.create_tid); 2492 } 2493 2494 /* 2495 * Destroy the B-Tree element if asked (typically if a nohistory 2496 * file or mount, or when called by the pruning code). 2497 * 2498 * Adjust the ATEDISK flag to properly support iterations. 
2499 */ 2500 if (delete_flags & HAMMER_DELETE_DESTROY) { 2501 data_offset = elm->leaf.data_offset; 2502 data_len = elm->leaf.data_len; 2503 rec_type = elm->leaf.base.rec_type; 2504 if (doprop) { 2505 save_leaf = elm->leaf; 2506 leaf = &save_leaf; 2507 } 2508 if (elm->base.rec_type == HAMMER_RECTYPE_INODE && 2509 elm->leaf.base.delete_tid == 0) { 2510 icount = -1; 2511 } 2512 2513 error = hammer_btree_delete(cursor); 2514 if (error == 0) { 2515 /* 2516 * The deletion moves the next element (if any) to 2517 * the current element position. We must clear 2518 * ATEDISK so this element is not skipped and we 2519 * must set RETEST to force any iteration to re-test 2520 * the element. 2521 */ 2522 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2523 cursor->flags |= HAMMER_CURSOR_RETEST; 2524 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2525 } 2526 } 2527 if (error == 0) { 2528 switch(data_offset & HAMMER_OFF_ZONE_MASK) { 2529 case HAMMER_ZONE_LARGE_DATA: 2530 case HAMMER_ZONE_SMALL_DATA: 2531 case HAMMER_ZONE_META: 2532 hammer_blockmap_free(trans, 2533 data_offset, data_len); 2534 break; 2535 default: 2536 break; 2537 } 2538 } 2539 } 2540 2541 /* 2542 * Track inode count and next_tid. This is used by the mirroring 2543 * and PFS code. icount can be negative, zero, or positive. 2544 */ 2545 if (error == 0 && track) { 2546 if (icount) { 2547 hammer_modify_volume_field(trans, trans->rootvol, 2548 vol0_stat_inodes); 2549 trans->rootvol->ondisk->vol0_stat_inodes += icount; 2550 hammer_modify_volume_done(trans->rootvol); 2551 } 2552 if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) { 2553 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2554 trans->rootvol->ondisk->vol0_next_tid = delete_tid; 2555 hammer_modify_volume_done(trans->rootvol); 2556 } 2557 } 2558 2559 /* 2560 * mirror_tid propagation occurs if the node's mirror_tid had to be 2561 * updated while adjusting the delete_tid. 
2562 * 2563 * This occurs when deleting even in nohistory mode, but does not 2564 * occur when pruning an already-deleted node. 2565 * 2566 * cursor->ip is NULL when called from the pruning, mirroring, 2567 * and pfs code. If non-NULL propagation will be conditionalized 2568 * on whether the PFS is in no-history mode or not. 2569 * 2570 * WARNING: cursor's leaf pointer may have changed after do_propagation 2571 * returns! 2572 */ 2573 if (doprop) { 2574 if (cursor->ip) 2575 hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf); 2576 else 2577 hammer_btree_do_propagation(cursor, NULL, leaf); 2578 } 2579 hammer_sync_unlock(trans); 2580 return (error); 2581 } 2582 2583 /* 2584 * Determine whether we can remove a directory. This routine checks whether 2585 * a directory is empty or not and enforces flush connectivity. 2586 * 2587 * Flush connectivity requires that we block if the target directory is 2588 * currently flushing, otherwise it may not end up in the same flush group. 2589 * 2590 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure. 
2591 */ 2592 int 2593 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) 2594 { 2595 struct hammer_cursor cursor; 2596 int error; 2597 2598 /* 2599 * Check directory empty 2600 */ 2601 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 2602 2603 cursor.key_beg.localization = ip->obj_localization + 2604 hammer_dir_localization(ip); 2605 cursor.key_beg.obj_id = ip->obj_id; 2606 cursor.key_beg.create_tid = 0; 2607 cursor.key_beg.delete_tid = 0; 2608 cursor.key_beg.obj_type = 0; 2609 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; 2610 cursor.key_beg.key = HAMMER_MIN_KEY; 2611 2612 cursor.key_end = cursor.key_beg; 2613 cursor.key_end.rec_type = 0xFFFF; 2614 cursor.key_end.key = HAMMER_MAX_KEY; 2615 2616 cursor.asof = ip->obj_asof; 2617 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2618 2619 error = hammer_ip_first(&cursor); 2620 if (error == ENOENT) 2621 error = 0; 2622 else if (error == 0) 2623 error = ENOTEMPTY; 2624 hammer_done_cursor(&cursor); 2625 return(error); 2626 } 2627 2628 /* 2629 * Localize the data payload. Directory entries may need their 2630 * localization adjusted. 2631 */ 2632 static 2633 int 2634 hammer_cursor_localize_data(hammer_data_ondisk_t data, 2635 hammer_btree_leaf_elm_t leaf) 2636 { 2637 u_int32_t localization; 2638 2639 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) { 2640 localization = leaf->base.localization & 2641 HAMMER_LOCALIZE_PSEUDOFS_MASK; 2642 if (data->entry.localization != localization) { 2643 data->entry.localization = localization; 2644 hammer_crc_set_leaf(data, leaf); 2645 } 2646 } 2647 return(0); 2648 } 2649