1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $ 35 */ 36 37 #include "hammer.h" 38 39 static int hammer_mem_lookup(hammer_cursor_t cursor); 40 static void hammer_mem_first(hammer_cursor_t cursor); 41 static int hammer_frontend_trunc_callback(hammer_record_t record, 42 void *data __unused); 43 static int hammer_bulk_scan_callback(hammer_record_t record, void *data); 44 static int hammer_record_needs_overwrite_delete(hammer_record_t record); 45 static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 46 hammer_btree_leaf_elm_t leaf); 47 static int hammer_cursor_localize_data(hammer_data_ondisk_t data, 48 hammer_btree_leaf_elm_t leaf); 49 50 struct rec_trunc_info { 51 u_int16_t rec_type; 52 int64_t trunc_off; 53 }; 54 55 struct hammer_bulk_info { 56 hammer_record_t record; 57 struct hammer_btree_leaf_elm leaf; 58 }; 59 60 /* 61 * Red-black tree support. Comparison code for insertion. 62 */ 63 static int 64 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) 65 { 66 if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type) 67 return(-1); 68 if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type) 69 return(1); 70 71 if (rec1->leaf.base.key < rec2->leaf.base.key) 72 return(-1); 73 if (rec1->leaf.base.key > rec2->leaf.base.key) 74 return(1); 75 76 /* 77 * For search & insertion purposes records deleted by the 78 * frontend or deleted/committed by the backend are silently 79 * ignored. Otherwise pipelined insertions will get messed 80 * up. 81 * 82 * rec1 is greater then rec2 if rec1 is marked deleted. 83 * rec1 is less then rec2 if rec2 is marked deleted. 84 * 85 * Multiple deleted records may be present, do not return 0 86 * if both are marked deleted. 
87 */ 88 if (rec1->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 89 HAMMER_RECF_COMMITTED)) { 90 return(1); 91 } 92 if (rec2->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 93 HAMMER_RECF_COMMITTED)) { 94 return(-1); 95 } 96 97 return(0); 98 } 99 100 /* 101 * Basic record comparison code similar to hammer_btree_cmp(). 102 * 103 * obj_id is not compared and may not yet be assigned in the record. 104 */ 105 static int 106 hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) 107 { 108 if (elm->rec_type < rec->leaf.base.rec_type) 109 return(-3); 110 if (elm->rec_type > rec->leaf.base.rec_type) 111 return(3); 112 113 if (elm->key < rec->leaf.base.key) 114 return(-2); 115 if (elm->key > rec->leaf.base.key) 116 return(2); 117 118 /* 119 * Never match against an item deleted by the frontend 120 * or backend, or committed by the backend. 121 * 122 * elm is less then rec if rec is marked deleted. 123 */ 124 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 125 HAMMER_RECF_COMMITTED)) { 126 return(-1); 127 } 128 return(0); 129 } 130 131 /* 132 * Ranged scan to locate overlapping record(s). This is used by 133 * hammer_ip_get_bulk() to locate an overlapping record. We have 134 * to use a ranged scan because the keys for data records with the 135 * same file base offset can be different due to differing data_len's. 136 * 137 * NOTE: The base file offset of a data record is (key - data_len), not (key). 
138 */ 139 static int 140 hammer_rec_overlap_cmp(hammer_record_t rec, void *data) 141 { 142 struct hammer_bulk_info *info = data; 143 hammer_btree_leaf_elm_t leaf = &info->leaf; 144 145 if (rec->leaf.base.rec_type < leaf->base.rec_type) 146 return(-3); 147 if (rec->leaf.base.rec_type > leaf->base.rec_type) 148 return(3); 149 150 /* 151 * Overlap compare 152 */ 153 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 154 /* rec_beg >= leaf_end */ 155 if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key) 156 return(2); 157 /* rec_end <= leaf_beg */ 158 if (rec->leaf.base.key <= leaf->base.key - leaf->data_len) 159 return(-2); 160 } else { 161 if (rec->leaf.base.key < leaf->base.key) 162 return(-2); 163 if (rec->leaf.base.key > leaf->base.key) 164 return(2); 165 } 166 167 /* 168 * We have to return 0 at this point, even if DELETED_FE is set, 169 * because returning anything else will cause the scan to ignore 170 * one of the branches when we really want it to check both. 171 */ 172 return(0); 173 } 174 175 /* 176 * RB_SCAN comparison code for hammer_mem_first(). The argument order 177 * is reversed so the comparison result has to be negated. key_beg and 178 * key_end are both range-inclusive. 179 * 180 * Localized deletions are not cached in-memory. 181 */ 182 static 183 int 184 hammer_rec_scan_cmp(hammer_record_t rec, void *data) 185 { 186 hammer_cursor_t cursor = data; 187 int r; 188 189 r = hammer_rec_cmp(&cursor->key_beg, rec); 190 if (r > 1) 191 return(-1); 192 r = hammer_rec_cmp(&cursor->key_end, rec); 193 if (r < -1) 194 return(1); 195 return(0); 196 } 197 198 /* 199 * This compare function is used when simply looking up key_beg. 200 */ 201 static 202 int 203 hammer_rec_find_cmp(hammer_record_t rec, void *data) 204 { 205 hammer_cursor_t cursor = data; 206 int r; 207 208 r = hammer_rec_cmp(&cursor->key_beg, rec); 209 if (r > 1) 210 return(-1); 211 if (r < -1) 212 return(1); 213 return(0); 214 } 215 216 /* 217 * Locate blocks within the truncation range. 
Partial blocks do not count. 218 */ 219 static 220 int 221 hammer_rec_trunc_cmp(hammer_record_t rec, void *data) 222 { 223 struct rec_trunc_info *info = data; 224 225 if (rec->leaf.base.rec_type < info->rec_type) 226 return(-1); 227 if (rec->leaf.base.rec_type > info->rec_type) 228 return(1); 229 230 switch(rec->leaf.base.rec_type) { 231 case HAMMER_RECTYPE_DB: 232 /* 233 * DB record key is not beyond the truncation point, retain. 234 */ 235 if (rec->leaf.base.key < info->trunc_off) 236 return(-1); 237 break; 238 case HAMMER_RECTYPE_DATA: 239 /* 240 * DATA record offset start is not beyond the truncation point, 241 * retain. 242 */ 243 if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off) 244 return(-1); 245 break; 246 default: 247 panic("hammer_rec_trunc_cmp: unexpected record type"); 248 } 249 250 /* 251 * The record start is >= the truncation point, return match, 252 * the record should be destroyed. 253 */ 254 return(0); 255 } 256 257 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); 258 259 /* 260 * Allocate a record for the caller to finish filling in. The record is 261 * returned referenced. 
262 */ 263 hammer_record_t 264 hammer_alloc_mem_record(hammer_inode_t ip, int data_len) 265 { 266 hammer_record_t record; 267 hammer_mount_t hmp; 268 269 hmp = ip->hmp; 270 ++hammer_count_records; 271 record = kmalloc(sizeof(*record), hmp->m_misc, 272 M_WAITOK | M_ZERO | M_USE_RESERVE); 273 record->flush_state = HAMMER_FST_IDLE; 274 record->ip = ip; 275 record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; 276 record->leaf.data_len = data_len; 277 hammer_ref(&record->lock); 278 279 if (data_len) { 280 record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO); 281 record->flags |= HAMMER_RECF_ALLOCDATA; 282 ++hammer_count_record_datas; 283 } 284 285 return (record); 286 } 287 288 void 289 hammer_wait_mem_record_ident(hammer_record_t record, const char *ident) 290 { 291 while (record->flush_state == HAMMER_FST_FLUSH) { 292 record->flags |= HAMMER_RECF_WANTED; 293 tsleep(record, 0, ident, 0); 294 } 295 } 296 297 /* 298 * Called from the backend, hammer_inode.c, after a record has been 299 * flushed to disk. The record has been exclusively locked by the 300 * caller and interlocked with BE. 301 * 302 * We clean up the state, unlock, and release the record (the record 303 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state). 304 */ 305 void 306 hammer_flush_record_done(hammer_record_t record, int error) 307 { 308 hammer_inode_t target_ip; 309 310 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 311 KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); 312 313 /* 314 * If an error occured, the backend was unable to sync the 315 * record to its media. Leave the record intact. 316 */ 317 if (error) { 318 hammer_critical_error(record->ip->hmp, record->ip, error, 319 "while flushing record"); 320 } 321 322 --record->flush_group->refs; 323 record->flush_group = NULL; 324 325 /* 326 * Adjust the flush state and dependancy based on success or 327 * failure. 
328 */ 329 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { 330 if ((target_ip = record->target_ip) != NULL) { 331 TAILQ_REMOVE(&target_ip->target_list, record, 332 target_entry); 333 record->target_ip = NULL; 334 hammer_test_inode(target_ip); 335 } 336 record->flush_state = HAMMER_FST_IDLE; 337 } else { 338 if (record->target_ip) { 339 record->flush_state = HAMMER_FST_SETUP; 340 hammer_test_inode(record->ip); 341 hammer_test_inode(record->target_ip); 342 } else { 343 record->flush_state = HAMMER_FST_IDLE; 344 } 345 } 346 record->flags &= ~HAMMER_RECF_INTERLOCK_BE; 347 348 /* 349 * Cleanup 350 */ 351 if (record->flags & HAMMER_RECF_WANTED) { 352 record->flags &= ~HAMMER_RECF_WANTED; 353 wakeup(record); 354 } 355 hammer_rel_mem_record(record); 356 } 357 358 /* 359 * Release a memory record. Records marked for deletion are immediately 360 * removed from the RB-Tree but otherwise left intact until the last ref 361 * goes away. 362 */ 363 void 364 hammer_rel_mem_record(struct hammer_record *record) 365 { 366 hammer_mount_t hmp; 367 hammer_reserve_t resv; 368 hammer_inode_t ip; 369 hammer_inode_t target_ip; 370 int diddrop; 371 372 hammer_unref(&record->lock); 373 374 if (record->lock.refs == 0) { 375 /* 376 * Upon release of the last reference wakeup any waiters. 377 * The record structure may get destroyed so callers will 378 * loop up and do a relookup. 379 * 380 * WARNING! Record must be removed from RB-TREE before we 381 * might possibly block. hammer_test_inode() can block! 382 */ 383 ip = record->ip; 384 hmp = ip->hmp; 385 386 /* 387 * Upon release of the last reference a record marked deleted 388 * by the front or backend, or committed by the backend, 389 * is destroyed. 
390 */ 391 if (record->flags & (HAMMER_RECF_DELETED_FE | 392 HAMMER_RECF_DELETED_BE | 393 HAMMER_RECF_COMMITTED)) { 394 KKASSERT(ip->lock.refs > 0); 395 KKASSERT(record->flush_state != HAMMER_FST_FLUSH); 396 397 /* 398 * target_ip may have zero refs, we have to ref it 399 * to prevent it from being ripped out from under 400 * us. 401 */ 402 if ((target_ip = record->target_ip) != NULL) { 403 TAILQ_REMOVE(&target_ip->target_list, 404 record, target_entry); 405 record->target_ip = NULL; 406 hammer_ref(&target_ip->lock); 407 } 408 409 /* 410 * Remove the record from the B-Tree 411 */ 412 if (record->flags & HAMMER_RECF_ONRBTREE) { 413 RB_REMOVE(hammer_rec_rb_tree, 414 &record->ip->rec_tree, 415 record); 416 record->flags &= ~HAMMER_RECF_ONRBTREE; 417 KKASSERT(ip->rsv_recs > 0); 418 diddrop = 1; 419 } else { 420 diddrop = 0; 421 } 422 423 /* 424 * We must wait for any direct-IO to complete before 425 * we can destroy the record because the bio may 426 * have a reference to it. 427 */ 428 if (record->flags & 429 (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL)) { 430 hammer_io_direct_wait(record); 431 } 432 433 /* 434 * Account for the completion after the direct IO 435 * has completed. 436 */ 437 if (diddrop) { 438 --hmp->rsv_recs; 439 --ip->rsv_recs; 440 hmp->rsv_databytes -= record->leaf.data_len; 441 442 if (RB_EMPTY(&record->ip->rec_tree)) { 443 record->ip->flags &= ~HAMMER_INODE_XDIRTY; 444 record->ip->sync_flags &= ~HAMMER_INODE_XDIRTY; 445 hammer_test_inode(record->ip); 446 } 447 if (ip->rsv_recs == hammer_limit_inode_recs - 1) 448 wakeup(&ip->rsv_recs); 449 } 450 451 /* 452 * Do this test after removing record from the B-Tree. 453 */ 454 if (target_ip) { 455 hammer_test_inode(target_ip); 456 hammer_rel_inode(target_ip, 0); 457 } 458 459 if (record->flags & HAMMER_RECF_ALLOCDATA) { 460 --hammer_count_record_datas; 461 kfree(record->data, hmp->m_misc); 462 record->flags &= ~HAMMER_RECF_ALLOCDATA; 463 } 464 465 /* 466 * Release the reservation. 
467 * 468 * If the record was not committed we can theoretically 469 * undo the reservation. However, doing so might 470 * create weird edge cases with the ordering of 471 * direct writes because the related buffer cache 472 * elements are per-vnode. So we don't try. 473 */ 474 if ((resv = record->resv) != NULL) { 475 /* XXX undo leaf.data_offset,leaf.data_len */ 476 hammer_blockmap_reserve_complete(hmp, resv); 477 record->resv = NULL; 478 } 479 record->data = NULL; 480 --hammer_count_records; 481 kfree(record, hmp->m_misc); 482 } 483 } 484 } 485 486 /* 487 * Record visibility depends on whether the record is being accessed by 488 * the backend or the frontend. Backend tests ignore the frontend delete 489 * flag. Frontend tests do NOT ignore the backend delete/commit flags and 490 * must also check for commit races. 491 * 492 * Return non-zero if the record is visible, zero if it isn't or if it is 493 * deleted. Returns 0 if the record has been comitted (unless the special 494 * delete-visibility flag is set). A committed record must be located 495 * via the media B-Tree. Returns non-zero if the record is good. 496 * 497 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory 498 * records to be returned. This is so pending deletions are detected 499 * when using an iterator to locate an unused hash key, or when we need 500 * to locate historical records on-disk to destroy. 
501 */ 502 static __inline 503 int 504 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record) 505 { 506 if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) 507 return(1); 508 if (cursor->flags & HAMMER_CURSOR_BACKEND) { 509 if (record->flags & (HAMMER_RECF_DELETED_BE | 510 HAMMER_RECF_COMMITTED)) { 511 return(0); 512 } 513 } else { 514 if (record->flags & (HAMMER_RECF_DELETED_FE | 515 HAMMER_RECF_DELETED_BE | 516 HAMMER_RECF_COMMITTED)) { 517 return(0); 518 } 519 } 520 return(1); 521 } 522 523 /* 524 * This callback is used as part of the RB_SCAN function for in-memory 525 * records. We terminate it (return -1) as soon as we get a match. 526 * 527 * This routine is used by frontend code. 528 * 529 * The primary compare code does not account for ASOF lookups. This 530 * code handles that case as well as a few others. 531 */ 532 static 533 int 534 hammer_rec_scan_callback(hammer_record_t rec, void *data) 535 { 536 hammer_cursor_t cursor = data; 537 538 /* 539 * We terminate on success, so this should be NULL on entry. 540 */ 541 KKASSERT(cursor->iprec == NULL); 542 543 /* 544 * Skip if the record was marked deleted or committed. 545 */ 546 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) 547 return(0); 548 549 /* 550 * Skip if not visible due to our as-of TID 551 */ 552 if (cursor->flags & HAMMER_CURSOR_ASOF) { 553 if (cursor->asof < rec->leaf.base.create_tid) 554 return(0); 555 if (rec->leaf.base.delete_tid && 556 cursor->asof >= rec->leaf.base.delete_tid) { 557 return(0); 558 } 559 } 560 561 /* 562 * ref the record. The record is protected from backend B-Tree 563 * interactions by virtue of the cursor's IP lock. 564 */ 565 hammer_ref(&rec->lock); 566 567 /* 568 * The record may have been deleted or committed while we 569 * were blocked. XXX remove? 570 */ 571 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) { 572 hammer_rel_mem_record(rec); 573 return(0); 574 } 575 576 /* 577 * Set the matching record and stop the scan. 
578 */ 579 cursor->iprec = rec; 580 return(-1); 581 } 582 583 584 /* 585 * Lookup an in-memory record given the key specified in the cursor. Works 586 * just like hammer_btree_lookup() but operates on an inode's in-memory 587 * record list. 588 * 589 * The lookup must fail if the record is marked for deferred deletion. 590 * 591 * The API for mem/btree_lookup() does not mess with the ATE/EOF bits. 592 */ 593 static 594 int 595 hammer_mem_lookup(hammer_cursor_t cursor) 596 { 597 KKASSERT(cursor->ip); 598 if (cursor->iprec) { 599 hammer_rel_mem_record(cursor->iprec); 600 cursor->iprec = NULL; 601 } 602 hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp, 603 hammer_rec_scan_callback, cursor); 604 605 return (cursor->iprec ? 0 : ENOENT); 606 } 607 608 /* 609 * hammer_mem_first() - locate the first in-memory record matching the 610 * cursor within the bounds of the key range. 611 * 612 * WARNING! API is slightly different from btree_first(). hammer_mem_first() 613 * will set ATEMEM the same as MEMEOF, and does not return any error. 614 */ 615 static 616 void 617 hammer_mem_first(hammer_cursor_t cursor) 618 { 619 hammer_inode_t ip; 620 621 ip = cursor->ip; 622 KKASSERT(ip != NULL); 623 624 if (cursor->iprec) { 625 hammer_rel_mem_record(cursor->iprec); 626 cursor->iprec = NULL; 627 } 628 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp, 629 hammer_rec_scan_callback, cursor); 630 631 if (cursor->iprec) 632 cursor->flags &= ~(HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM); 633 else 634 cursor->flags |= HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM; 635 } 636 637 /************************************************************************ 638 * HAMMER IN-MEMORY RECORD FUNCTIONS * 639 ************************************************************************ 640 * 641 * These functions manipulate in-memory records. Such records typically 642 * exist prior to being committed to disk or indexed via the on-disk B-Tree. 
643 */ 644 645 /* 646 * Add a directory entry (dip,ncp) which references inode (ip). 647 * 648 * Note that the low 32 bits of the namekey are set temporarily to create 649 * a unique in-memory record, and may be modified a second time when the 650 * record is synchronized to disk. In particular, the low 32 bits cannot be 651 * all 0's when synching to disk, which is not handled here. 652 * 653 * NOTE: bytes does not include any terminating \0 on name, and name might 654 * not be terminated. 655 */ 656 int 657 hammer_ip_add_directory(struct hammer_transaction *trans, 658 struct hammer_inode *dip, const char *name, int bytes, 659 struct hammer_inode *ip) 660 { 661 struct hammer_cursor cursor; 662 hammer_record_t record; 663 int error; 664 u_int32_t max_iterations; 665 666 record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes)); 667 668 record->type = HAMMER_MEM_RECORD_ADD; 669 record->leaf.base.localization = dip->obj_localization + 670 hammer_dir_localization(dip); 671 record->leaf.base.obj_id = dip->obj_id; 672 record->leaf.base.key = hammer_directory_namekey(dip, name, bytes, 673 &max_iterations); 674 record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY; 675 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 676 record->data->entry.obj_id = ip->obj_id; 677 record->data->entry.localization = ip->obj_localization; 678 bcopy(name, record->data->entry.name, bytes); 679 680 ++ip->ino_data.nlinks; 681 ip->ino_data.ctime = trans->time; 682 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); 683 684 /* 685 * Find an unused namekey. Both the in-memory record tree and 686 * the B-Tree are checked. We do not want historically deleted 687 * names to create a collision as our iteration space may be limited, 688 * and since create_tid wouldn't match anyway an ASOF search 689 * must be used to locate collisions. 690 * 691 * delete-visibility is set so pending deletions do not give us 692 * a false-negative on our ability to use an iterator. 
693 * 694 * The iterator must not rollover the key. Directory keys only 695 * use the positive key space. 696 */ 697 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 698 cursor.key_beg = record->leaf.base; 699 cursor.flags |= HAMMER_CURSOR_ASOF; 700 cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 701 cursor.asof = ip->obj_asof; 702 703 while (hammer_ip_lookup(&cursor) == 0) { 704 ++record->leaf.base.key; 705 KKASSERT(record->leaf.base.key > 0); 706 cursor.key_beg.key = record->leaf.base.key; 707 if (--max_iterations == 0) { 708 hammer_rel_mem_record(record); 709 error = ENOSPC; 710 goto failed; 711 } 712 } 713 714 /* 715 * The target inode and the directory entry are bound together. 716 */ 717 record->target_ip = ip; 718 record->flush_state = HAMMER_FST_SETUP; 719 TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry); 720 721 /* 722 * The inode now has a dependancy and must be taken out of the idle 723 * state. An inode not in an idle state is given an extra reference. 724 * 725 * When transitioning to a SETUP state flag for an automatic reflush 726 * when the dependancies are disposed of if someone is waiting on 727 * the inode. 728 */ 729 if (ip->flush_state == HAMMER_FST_IDLE) { 730 hammer_ref(&ip->lock); 731 ip->flush_state = HAMMER_FST_SETUP; 732 if (ip->flags & HAMMER_INODE_FLUSHW) 733 ip->flags |= HAMMER_INODE_REFLUSH; 734 } 735 error = hammer_mem_add(record); 736 if (error == 0) { 737 dip->ino_data.mtime = trans->time; 738 hammer_modify_inode(dip, HAMMER_INODE_MTIME); 739 } 740 failed: 741 hammer_done_cursor(&cursor); 742 return(error); 743 } 744 745 /* 746 * Delete the directory entry and update the inode link count. The 747 * cursor must be seeked to the directory entry record being deleted. 748 * 749 * The related inode should be share-locked by the caller. The caller is 750 * on the frontend. It could also be NULL indicating that the directory 751 * entry being removed has no related inode. 
752 * 753 * This function can return EDEADLK requiring the caller to terminate 754 * the cursor, any locks, wait on the returned record, and retry. 755 */ 756 int 757 hammer_ip_del_directory(struct hammer_transaction *trans, 758 hammer_cursor_t cursor, struct hammer_inode *dip, 759 struct hammer_inode *ip) 760 { 761 hammer_record_t record; 762 int error; 763 764 if (hammer_cursor_inmem(cursor)) { 765 /* 766 * In-memory (unsynchronized) records can simply be freed. 767 * 768 * Even though the HAMMER_RECF_DELETED_FE flag is ignored 769 * by the backend, we must still avoid races against the 770 * backend potentially syncing the record to the media. 771 * 772 * We cannot call hammer_ip_delete_record(), that routine may 773 * only be called from the backend. 774 */ 775 record = cursor->iprec; 776 if (record->flags & (HAMMER_RECF_INTERLOCK_BE | 777 HAMMER_RECF_DELETED_BE | 778 HAMMER_RECF_COMMITTED)) { 779 KKASSERT(cursor->deadlk_rec == NULL); 780 hammer_ref(&record->lock); 781 cursor->deadlk_rec = record; 782 error = EDEADLK; 783 } else { 784 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); 785 record->flags |= HAMMER_RECF_DELETED_FE; 786 error = 0; 787 } 788 } else { 789 /* 790 * If the record is on-disk we have to queue the deletion by 791 * the record's key. This also causes lookups to skip the 792 * record (lookups for the purposes of finding an unused 793 * directory key do not skip the record). 794 */ 795 KKASSERT(dip->flags & 796 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK)); 797 record = hammer_alloc_mem_record(dip, 0); 798 record->type = HAMMER_MEM_RECORD_DEL; 799 record->leaf.base = cursor->leaf->base; 800 KKASSERT(dip->obj_id == record->leaf.base.obj_id); 801 802 /* 803 * ip may be NULL, indicating the deletion of a directory 804 * entry which has no related inode. 
805 */ 806 record->target_ip = ip; 807 if (ip) { 808 record->flush_state = HAMMER_FST_SETUP; 809 TAILQ_INSERT_TAIL(&ip->target_list, record, 810 target_entry); 811 } else { 812 record->flush_state = HAMMER_FST_IDLE; 813 } 814 815 /* 816 * The inode now has a dependancy and must be taken out of 817 * the idle state. An inode not in an idle state is given 818 * an extra reference. 819 * 820 * When transitioning to a SETUP state flag for an automatic 821 * reflush when the dependancies are disposed of if someone 822 * is waiting on the inode. 823 */ 824 if (ip && ip->flush_state == HAMMER_FST_IDLE) { 825 hammer_ref(&ip->lock); 826 ip->flush_state = HAMMER_FST_SETUP; 827 if (ip->flags & HAMMER_INODE_FLUSHW) 828 ip->flags |= HAMMER_INODE_REFLUSH; 829 } 830 831 error = hammer_mem_add(record); 832 } 833 834 /* 835 * One less link. The file may still be open in the OS even after 836 * all links have gone away. 837 * 838 * We have to terminate the cursor before syncing the inode to 839 * avoid deadlocking against ourselves. XXX this may no longer 840 * be true. 841 * 842 * If nlinks drops to zero and the vnode is inactive (or there is 843 * no vnode), call hammer_inode_unloadable_check() to zonk the 844 * inode. If we don't do this here the inode will not be destroyed 845 * on-media until we unmount. 846 */ 847 if (error == 0) { 848 if (ip) { 849 --ip->ino_data.nlinks; /* do before we might block */ 850 ip->ino_data.ctime = trans->time; 851 } 852 dip->ino_data.mtime = trans->time; 853 hammer_modify_inode(dip, HAMMER_INODE_MTIME); 854 if (ip) { 855 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY); 856 if (ip->ino_data.nlinks == 0 && 857 (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { 858 hammer_done_cursor(cursor); 859 hammer_inode_unloadable_check(ip, 1); 860 hammer_flush_inode(ip, 0); 861 } 862 } 863 864 } 865 return(error); 866 } 867 868 /* 869 * Add a record to an inode. 
870 * 871 * The caller must allocate the record with hammer_alloc_mem_record(ip) and 872 * initialize the following additional fields: 873 * 874 * The related inode should be share-locked by the caller. The caller is 875 * on the frontend. 876 * 877 * record->rec.entry.base.base.key 878 * record->rec.entry.base.base.rec_type 879 * record->rec.entry.base.base.data_len 880 * record->data (a copy will be kmalloc'd if it cannot be embedded) 881 */ 882 int 883 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) 884 { 885 hammer_inode_t ip = record->ip; 886 int error; 887 888 KKASSERT(record->leaf.base.localization != 0); 889 record->leaf.base.obj_id = ip->obj_id; 890 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 891 error = hammer_mem_add(record); 892 return(error); 893 } 894 895 /* 896 * Locate a bulk record in-memory. Bulk records allow disk space to be 897 * reserved so the front-end can flush large data writes without having 898 * to queue the BIO to the flusher. Only the related record gets queued 899 * to the flusher. 900 */ 901 902 static hammer_record_t 903 hammer_ip_get_bulk(hammer_inode_t ip, off_t file_offset, int bytes) 904 { 905 struct hammer_bulk_info info; 906 907 bzero(&info, sizeof(info)); 908 info.leaf.base.obj_id = ip->obj_id; 909 info.leaf.base.key = file_offset + bytes; 910 info.leaf.base.create_tid = 0; 911 info.leaf.base.delete_tid = 0; 912 info.leaf.base.rec_type = HAMMER_RECTYPE_DATA; 913 info.leaf.base.obj_type = 0; /* unused */ 914 info.leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; /* unused */ 915 info.leaf.base.localization = ip->obj_localization + /* unused */ 916 HAMMER_LOCALIZE_MISC; 917 info.leaf.data_len = bytes; 918 919 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp, 920 hammer_bulk_scan_callback, &info); 921 922 return(info.record); /* may be NULL */ 923 } 924 925 /* 926 * Take records vetted by overlap_cmp. The first non-deleted record 927 * (if any) stops the scan. 
928 */ 929 static int 930 hammer_bulk_scan_callback(hammer_record_t record, void *data) 931 { 932 struct hammer_bulk_info *info = data; 933 934 if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 935 HAMMER_RECF_COMMITTED)) { 936 return(0); 937 } 938 hammer_ref(&record->lock); 939 info->record = record; 940 return(-1); /* stop scan */ 941 } 942 943 /* 944 * Reserve blockmap space placemarked with an in-memory record. 945 * 946 * This routine is called by the frontend in order to be able to directly 947 * flush a buffer cache buffer. The frontend has locked the related buffer 948 * cache buffers and we should be able to manipulate any overlapping 949 * in-memory records. 950 * 951 * The caller is responsible for adding the returned record. 952 */ 953 hammer_record_t 954 hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, 955 int *errorp) 956 { 957 hammer_record_t record; 958 hammer_record_t conflict; 959 int zone; 960 961 /* 962 * Deal with conflicting in-memory records. We cannot have multiple 963 * in-memory records for the same base offset without seriously 964 * confusing the backend, including but not limited to the backend 965 * issuing delete-create-delete or create-delete-create sequences 966 * and asserting on the delete_tid being the same as the create_tid. 967 * 968 * If we encounter a record with the backend interlock set we cannot 969 * immediately delete it without confusing the backend. 970 */ 971 while ((conflict = hammer_ip_get_bulk(ip, file_offset, bytes)) !=NULL) { 972 if (conflict->flags & HAMMER_RECF_INTERLOCK_BE) { 973 conflict->flags |= HAMMER_RECF_WANTED; 974 tsleep(conflict, 0, "hmrrc3", 0); 975 } else { 976 conflict->flags |= HAMMER_RECF_DELETED_FE; 977 } 978 hammer_rel_mem_record(conflict); 979 } 980 981 /* 982 * Create a record to cover the direct write. This is called with 983 * the related BIO locked so there should be no possible conflict. 
984 * 985 * The backend is responsible for finalizing the space reserved in 986 * this record. 987 * 988 * XXX bytes not aligned, depend on the reservation code to 989 * align the reservation. 990 */ 991 record = hammer_alloc_mem_record(ip, 0); 992 zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX : 993 HAMMER_ZONE_SMALL_DATA_INDEX; 994 record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, 995 &record->leaf.data_offset, 996 errorp); 997 if (record->resv == NULL) { 998 kprintf("hammer_ip_add_bulk: reservation failed\n"); 999 hammer_rel_mem_record(record); 1000 return(NULL); 1001 } 1002 record->type = HAMMER_MEM_RECORD_DATA; 1003 record->leaf.base.rec_type = HAMMER_RECTYPE_DATA; 1004 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 1005 record->leaf.base.obj_id = ip->obj_id; 1006 record->leaf.base.key = file_offset + bytes; 1007 record->leaf.base.localization = ip->obj_localization + 1008 HAMMER_LOCALIZE_MISC; 1009 record->leaf.data_len = bytes; 1010 hammer_crc_set_leaf(data, &record->leaf); 1011 KKASSERT(*errorp == 0); 1012 return(record); 1013 } 1014 1015 /* 1016 * Frontend truncation code. Scan in-memory records only. On-disk records 1017 * and records in a flushing state are handled by the backend. The vnops 1018 * setattr code will handle the block containing the truncation point. 1019 * 1020 * Partial blocks are not deleted. 
1021 */ 1022 int 1023 hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) 1024 { 1025 struct rec_trunc_info info; 1026 1027 switch(ip->ino_data.obj_type) { 1028 case HAMMER_OBJTYPE_REGFILE: 1029 info.rec_type = HAMMER_RECTYPE_DATA; 1030 break; 1031 case HAMMER_OBJTYPE_DBFILE: 1032 info.rec_type = HAMMER_RECTYPE_DB; 1033 break; 1034 default: 1035 return(EINVAL); 1036 } 1037 info.trunc_off = file_size; 1038 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp, 1039 hammer_frontend_trunc_callback, &info); 1040 return(0); 1041 } 1042 1043 static int 1044 hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused) 1045 { 1046 if (record->flags & HAMMER_RECF_DELETED_FE) 1047 return(0); 1048 if (record->flush_state == HAMMER_FST_FLUSH) 1049 return(0); 1050 KKASSERT((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0); 1051 hammer_ref(&record->lock); 1052 record->flags |= HAMMER_RECF_DELETED_FE; 1053 hammer_rel_mem_record(record); 1054 return(0); 1055 } 1056 1057 /* 1058 * Return 1 if the caller must check for and delete existing records 1059 * before writing out a new data record. 1060 * 1061 * Return 0 if the caller can just insert the record into the B-Tree without 1062 * checking. 1063 */ 1064 static int 1065 hammer_record_needs_overwrite_delete(hammer_record_t record) 1066 { 1067 hammer_inode_t ip = record->ip; 1068 int64_t file_offset; 1069 int r; 1070 1071 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) 1072 file_offset = record->leaf.base.key; 1073 else 1074 file_offset = record->leaf.base.key - record->leaf.data_len; 1075 r = (file_offset < ip->save_trunc_off); 1076 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1077 if (ip->save_trunc_off <= record->leaf.base.key) 1078 ip->save_trunc_off = record->leaf.base.key + 1; 1079 } else { 1080 if (ip->save_trunc_off < record->leaf.base.key) 1081 ip->save_trunc_off = record->leaf.base.key; 1082 } 1083 return(r); 1084 } 1085 1086 /* 1087 * Backend code. 
 * Sync a record to the media.
 *
 * The record must be in the HAMMER_FST_FLUSH state with
 * HAMMER_RECF_INTERLOCK_BE set and a non-zero localization (all three
 * are asserted on entry).
 *
 * Returns 0 on success.  For a MEM_RECORD_DEL record the result of the
 * exact-match lookup/delete is returned.  For insertions a pre-existing
 * key is reported as EIO and any lookup result other than ENOENT is
 * returned as-is.
 */
int
hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record)
{
	hammer_transaction_t trans = cursor->trans;
	int64_t file_offset;
	int bytes;
	void *bdata;
	int error;
	int doprop;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);
	KKASSERT(record->leaf.base.localization != 0);

	/*
	 * Any direct-write related to the record must complete before we
	 * can sync the record to the on-disk media.
	 */
	if (record->flags & (HAMMER_RECF_DIRECT_IO | HAMMER_RECF_DIRECT_INVAL))
		hammer_io_direct_wait(record);

	/*
	 * If this is a bulk-data record placemarker there may be an existing
	 * record on-disk, indicating a data overwrite.  If there is the
	 * on-disk record must be deleted before we can insert our new record.
	 *
	 * We've synthesized this record and do not know what the create_tid
	 * on-disk is, nor how much data it represents.
	 *
	 * Keep in mind that (key) for data records is (base_offset + len),
	 * not (base_offset).  Also, we only want to get rid of on-disk
	 * records since we are trying to sync our in-memory record, call
	 * hammer_ip_delete_range() with truncating set to 1 to make sure
	 * it skips in-memory records.
	 *
	 * It is ok for the lookup to return ENOENT.
	 *
	 * NOTE OPTIMIZATION: hammer_record_needs_overwrite_delete() compares
	 * the record against ip->save_trunc_off to determine if we have to
	 * call hammer_ip_delete_range() or not, and advances save_trunc_off
	 * as we write.
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA &&
	    hammer_record_needs_overwrite_delete(record)) {
		file_offset = record->leaf.base.key - record->leaf.data_len;
		/* round the length up to a multiple of the buffer size */
		bytes = (record->leaf.data_len + HAMMER_BUFMASK) &
			~HAMMER_BUFMASK;
		KKASSERT((file_offset & HAMMER_BUFMASK) == 0);
		error = hammer_ip_delete_range(
				cursor, record->ip,
				file_offset, file_offset + bytes - 1,
				1);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * If this is a general record there may be an on-disk version
	 * that must be deleted before we can insert the new record.
	 */
	if (record->type == HAMMER_MEM_RECORD_GENERAL) {
		error = hammer_delete_general(cursor, record->ip,
					      &record->leaf);
		if (error && error != ENOENT)
			goto done;
	}

	/*
	 * Setup the cursor.
	 */
	hammer_normalize_cursor(cursor);
	cursor->key_beg = record->leaf.base;
	cursor->flags &= ~HAMMER_CURSOR_INITMASK;
	cursor->flags |= HAMMER_CURSOR_BACKEND;
	cursor->flags &= ~HAMMER_CURSOR_INSERT;

	/*
	 * Records can wind up on-media before the inode itself is on-media.
	 * Flag the case.
	 */
	record->ip->flags |= HAMMER_INODE_DONDISK;

	/*
	 * If we are deleting a directory entry an exact match must be
	 * found on-disk.
	 *
	 * This path exits through 'done' (not 'done_unlock') because the
	 * shared sync lock has not been acquired yet.
	 */
	if (record->type == HAMMER_MEM_RECORD_DEL) {
		error = hammer_btree_lookup(cursor);
		if (error == 0) {
			KKASSERT(cursor->iprec == NULL);
			error = hammer_ip_delete_record(cursor, record->ip,
							trans->tid);
			if (error == 0) {
				record->flags |= HAMMER_RECF_DELETED_BE |
						 HAMMER_RECF_COMMITTED;
				++record->ip->rec_generation;
			}
		}
		goto done;
	}

	/*
	 * We are inserting.
	 *
	 * Issue a lookup to position the cursor and locate the insertion
	 * point.  The target key should not exist.  If we are creating a
	 * directory entry we may have to iterate the low 32 bits of the
	 * key to find an unused key.
	 *
	 * From here on every exit must go through 'done_unlock' to drop
	 * the shared sync lock.
	 */
	hammer_sync_lock_sh(trans);
	cursor->flags |= HAMMER_CURSOR_INSERT;
	error = hammer_btree_lookup(cursor);
	if (hammer_debug_inode)
		kprintf("DOINSERT LOOKUP %d\n", error);
	if (error == 0) {
		/* the key already exists on-media, report it as EIO */
		kprintf("hammer_ip_sync_record: duplicate rec "
			"at (%016llx)\n", (long long)record->leaf.base.key);
		if (hammer_debug_critical)
			Debugger("duplicate record1");
		error = EIO;
	}
#if 0
	if (record->type == HAMMER_MEM_RECORD_DATA)
		kprintf("sync_record  %016llx ---------------- %016llx %d\n",
			record->leaf.base.key - record->leaf.data_len,
			record->leaf.data_offset, error);
#endif

	/* ENOENT is the expected result: the insertion point was located */
	if (error != ENOENT)
		goto done_unlock;

	/*
	 * Allocate the record and data.  The result buffers will be
	 * marked as being modified and further calls to
	 * hammer_modify_buffer() will result in unneeded UNDO records.
	 *
	 * Support zero-fill records (data == NULL and data_len != 0)
	 */
	if (record->type == HAMMER_MEM_RECORD_DATA) {
		/*
		 * The data portion of a bulk-data record has already been
		 * committed to disk, we need only adjust the layer2
		 * statistics in the same transaction as our B-Tree insert.
		 */
		KKASSERT(record->leaf.data_offset != 0);
		error = hammer_blockmap_finalize(trans,
						 record->resv,
						 record->leaf.data_offset,
						 record->leaf.data_len);
	} else if (record->data && record->leaf.data_len) {
		/*
		 * Wholly cached record, with data.  Allocate the data.
		 */
		bdata = hammer_alloc_data(trans, record->leaf.data_len,
					  record->leaf.base.rec_type,
					  &record->leaf.data_offset,
					  &cursor->data_buffer,
					  0, &error);
		if (bdata == NULL)
			goto done_unlock;
		hammer_crc_set_leaf(record->data, &record->leaf);
		hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0);
		bcopy(record->data, bdata, record->leaf.data_len);
		hammer_modify_buffer_done(cursor->data_buffer);
	} else {
		/*
		 * Wholly cached record, without data.
		 */
		record->leaf.data_offset = 0;
		record->leaf.data_crc = 0;
	}

	/*
	 * NOTE(review): the result of hammer_blockmap_finalize() above is
	 * overwritten by the insert below without being checked; confirm
	 * that this is intended.
	 */
	error = hammer_btree_insert(cursor, &record->leaf, &doprop);
	if (hammer_debug_inode && error) {
		kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n",
			error,
			(long long)cursor->node->node_offset,
			cursor->index,
			(long long)record->leaf.base.key);
	}

	/*
	 * Our record is on-disk and we normally mark the in-memory version
	 * as having been committed (and not BE-deleted).
	 *
	 * If the record represented a directory deletion but we had to
	 * sync a valid directory entry to disk due to dependencies,
	 * we must convert the record to a covering delete so the
	 * frontend does not have visibility on the synced entry.
	 *
	 * WARNING: cursor's leaf pointer may have changed after do_propagation
	 *	    returns!
	 */
	if (error == 0) {
		if (doprop) {
			hammer_btree_do_propagation(cursor,
						    record->ip->pfsm,
						    &record->leaf);
		}
		if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
			/*
			 * Must convert deleted directory entry add
			 * to a directory entry delete.
			 */
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags &= ~HAMMER_RECF_DELETED_FE;
			record->type = HAMMER_MEM_RECORD_DEL;
			KKASSERT(record->ip->obj_id == record->leaf.base.obj_id);
			KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
			record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
			KKASSERT((record->flags & (HAMMER_RECF_COMMITTED |
						 HAMMER_RECF_DELETED_BE)) == 0);
			/* converted record is not yet committed */
			/* hammer_flush_record_done takes care of the rest */
		} else {
			/*
			 * Everything went fine and we are now done with
			 * this record.
			 */
			record->flags |= HAMMER_RECF_COMMITTED;
			++record->ip->rec_generation;
		}
	} else {
		/*
		 * The insert failed.  Give back the data space referenced
		 * by the leaf, if any.
		 */
		if (record->leaf.data_offset) {
			hammer_blockmap_free(trans, record->leaf.data_offset,
					     record->leaf.data_len);
		}
	}
done_unlock:
	hammer_sync_unlock(trans);
done:
	return(error);
}

/*
 * Add the record to the inode's rec_tree.  The low 32 bits of a directory
 * entry's key is used to deal with hash collisions in the upper 32 bits.
 * A unique 64 bit key is generated in-memory and may be regenerated a
 * second time when the directory record is flushed to the on-disk B-Tree.
 *
 * A referenced record is passed to this function.  This function
 * eats the reference.  If an error occurs the record will be deleted.
 *
 * Returns 0 on success or EEXIST if a record with the same key is already
 * present in the tree.
 *
 * NOTE(review): earlier wording stated that a copy of the caller's
 * record->data is made here.  The code below makes no copy, it only
 * asserts HAMMER_RECF_ALLOCDATA when data is attached; presumably the
 * data was already privately allocated by the caller -- confirm.
 */
int
hammer_mem_add(hammer_record_t record)
{
	hammer_mount_t hmp = record->ip->hmp;

	/*
	 * Any data attached to the record must be flagged
	 * HAMMER_RECF_ALLOCDATA.
	 */
	if (record->data)
		KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA);

	/*
	 * Insert into the RB tree.  A unique key should have already
	 * been selected if this is a directory entry.
	 *
	 * On a collision the record is marked deleted by the frontend
	 * before the caller's reference is dropped.
	 */
	if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		record->flags |= HAMMER_RECF_DELETED_FE;
		hammer_rel_mem_record(record);
		return (EEXIST);
	}

	/*
	 * Account for the new record on the mount and on the inode, flag
	 * the inode dirty and drop the caller's reference.
	 */
	++hmp->count_newrecords;
	++hmp->rsv_recs;
	++record->ip->rsv_recs;
	record->ip->hmp->rsv_databytes += record->leaf.data_len;
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}

/************************************************************************
 *		     HAMMER INODE MERGED-RECORD FUNCTIONS		*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * Returns 0 with cursor->leaf set on success, ENOENT if the record was
 * found neither in-memory nor (when the inode has on-disk components)
 * on-disk, or another error from the lookups.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 * NOT be called to iterate results.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	KKASSERT(cursor->ip);
	error = hammer_mem_lookup(cursor);
	if (error == 0) {
		cursor->leaf = &cursor->iprec->leaf;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 * Otherwise the ENOENT from the memory lookup is returned.
	 */
	if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
	return(error);
}

/*
 * Helper for hammer_ip_first()/hammer_ip_next()
 *
 * NOTE: Both ATEDISK and DISKEOF will be set the same.
This sets up 1411 * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek 1412 * state if hammer_ip_next() needs to re-seek. 1413 */ 1414 static __inline 1415 int 1416 _hammer_ip_seek_btree(hammer_cursor_t cursor) 1417 { 1418 hammer_inode_t ip = cursor->ip; 1419 int error; 1420 1421 if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { 1422 error = hammer_btree_lookup(cursor); 1423 if (error == ENOENT || error == EDEADLK) { 1424 if (hammer_debug_general & 0x2000) { 1425 kprintf("error %d node %p %016llx index %d\n", 1426 error, cursor->node, 1427 (long long)cursor->node->node_offset, 1428 cursor->index); 1429 } 1430 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1431 error = hammer_btree_iterate(cursor); 1432 } 1433 if (error == 0) { 1434 cursor->flags &= ~(HAMMER_CURSOR_DISKEOF | 1435 HAMMER_CURSOR_ATEDISK); 1436 } else { 1437 cursor->flags |= HAMMER_CURSOR_DISKEOF | 1438 HAMMER_CURSOR_ATEDISK; 1439 if (error == ENOENT) 1440 error = 0; 1441 } 1442 } else { 1443 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; 1444 error = 0; 1445 } 1446 return(error); 1447 } 1448 1449 /* 1450 * Helper for hammer_ip_next() 1451 * 1452 * The caller has determined that the media cursor is further along than the 1453 * memory cursor and must be reseeked after a generation number change. 1454 */ 1455 static 1456 int 1457 _hammer_ip_reseek(hammer_cursor_t cursor) 1458 { 1459 struct hammer_base_elm save; 1460 hammer_btree_elm_t elm; 1461 int error; 1462 int r; 1463 int again = 0; 1464 1465 /* 1466 * Do the re-seek. 
1467 */ 1468 kprintf("HAMMER: Debug: re-seeked during scan @ino=%016llx\n", 1469 (long long)cursor->ip->obj_id); 1470 save = cursor->key_beg; 1471 cursor->key_beg = cursor->iprec->leaf.base; 1472 error = _hammer_ip_seek_btree(cursor); 1473 KKASSERT(error == 0); 1474 cursor->key_beg = save; 1475 1476 /* 1477 * If the memory record was previous returned to 1478 * the caller and the media record matches 1479 * (-1/+1: only create_tid differs), then iterate 1480 * the media record to avoid a double result. 1481 */ 1482 if ((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0 && 1483 (cursor->flags & HAMMER_CURSOR_LASTWASMEM)) { 1484 elm = &cursor->node->ondisk->elms[cursor->index]; 1485 r = hammer_btree_cmp(&elm->base, 1486 &cursor->iprec->leaf.base); 1487 if (cursor->flags & HAMMER_CURSOR_ASOF) { 1488 if (r >= -1 && r <= 1) { 1489 kprintf("HAMMER: Debug: iterated after " 1490 "re-seek (asof r=%d)\n", r); 1491 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1492 again = 1; 1493 } 1494 } else { 1495 if (r == 0) { 1496 kprintf("HAMMER: Debug: iterated after " 1497 "re-seek\n"); 1498 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1499 again = 1; 1500 } 1501 } 1502 } 1503 return(again); 1504 } 1505 1506 /* 1507 * Locate the first record within the cursor's key_beg/key_end range, 1508 * restricted to a particular inode. 0 is returned on success, ENOENT 1509 * if no records matched the requested range, or some other error. 1510 * 1511 * When 0 is returned hammer_ip_next() may be used to iterate additional 1512 * records within the requested range. 1513 * 1514 * This function can return EDEADLK, requiring the caller to terminate 1515 * the cursor and try again. 
 */

int
hammer_ip_first(hammer_cursor_t cursor)
{
	hammer_inode_t ip = cursor->ip;
	int error;

	KKASSERT(ip != NULL);

	/*
	 * Clean up fields and setup for merged scan
	 */
	cursor->flags &= ~HAMMER_CURSOR_RETEST;

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 *
	 * This function will setup both ATEMEM and MEMEOF properly for
	 * the ip iteration.  ATEMEM will be set if MEMEOF is set.
	 */
	hammer_mem_first(cursor);

	/*
	 * Detect generation changes during blockages, including
	 * blockages which occur on the initial btree search.
	 */
	cursor->rec_generation = cursor->ip->rec_generation;

	/*
	 * Initial search and result.  The first merged record is fetched
	 * via hammer_ip_next().
	 */
	error = _hammer_ip_seek_btree(cursor);
	if (error == 0)
		error = hammer_ip_next(cursor);

	return (error);
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.  This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * There are numerous special cases in this code to deal with races between
 * in-memory records and on-media records.
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 *
 * On success cursor->leaf references the selected record's leaf, either
 * the in-memory record's (cursor->iprec) or the one extracted from the
 * B-Tree.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec;
	hammer_record_t tmprec;
	int error;
	int r;

again:
	/*
	 * Get the next on-disk record
	 *
	 * NOTE: If we deleted the last on-disk record we had scanned
	 *	 ATEDISK will be clear and RETEST will be set, forcing
	 *	 a call to iterate.  The fact that ATEDISK is clear causes
	 *	 iterate to re-test the 'current' element.  If ATEDISK is
	 *	 set, iterate will skip the 'current' element.
	 */
	error = 0;
	if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
		if (cursor->flags & (HAMMER_CURSOR_ATEDISK |
				     HAMMER_CURSOR_RETEST)) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_RETEST;
			if (error == 0) {
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
				hammer_cache_node(&cursor->ip->cache[1],
						  cursor->node);
			} else if (error == ENOENT) {
				/* ran out of media records, not an error */
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
				error = 0;
			}
		}
	}

	/*
	 * If the generation changed the backend has deleted or committed
	 * one or more memory records since our last check.
	 *
	 * When this case occurs if the disk cursor is > current memory record
	 * or the disk cursor is at EOF, we must re-seek the disk-cursor.
	 * Since the cursor is ahead it must have not yet been eaten (if
	 * not at eof anyway). (XXX data offset case?)
	 *
	 * NOTE: we are not doing a full check here.  That will be handled
	 * later on.
	 *
	 * If we have exhausted all memory records we do not have to do any
	 * further seeks.
	 */
	while (cursor->rec_generation != cursor->ip->rec_generation &&
	       error == 0
	) {
		kprintf("HAMMER: Debug: generation changed during scan @ino=%016llx\n", (long long)cursor->ip->obj_id);
		cursor->rec_generation = cursor->ip->rec_generation;
		if (cursor->flags & HAMMER_CURSOR_MEMEOF)
			break;
		if (cursor->flags & HAMMER_CURSOR_DISKEOF) {
			/* media at EOF is treated as 'ahead of memory' */
			r = 1;
		} else {
			KKASSERT((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0);
			elm = &cursor->node->ondisk->elms[cursor->index];
			r = hammer_btree_cmp(&elm->base,
					     &cursor->iprec->leaf.base);
		}

		/*
		 * Do we re-seek the media cursor?
		 */
		if (r > 0) {
			if (_hammer_ip_reseek(cursor))
				goto again;
		}
	}

	/*
	 * We can now safely get the next in-memory record.  We cannot
	 * block here.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	tmprec = NULL;
	if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		/*
		 * If the current memory record was eaten then get the next
		 * one.  Stale records are skipped.
		 *
		 * The previous record is parked in tmprec and only released
		 * further below.  hammer_rec_scan_callback() is expected to
		 * set cursor->iprec when it accepts a record (TODO confirm
		 * against its definition); the assertion after the loop
		 * relies on that.
		 */
		if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
			tmprec = cursor->iprec;
			cursor->iprec = NULL;
			rec = hammer_rec_rb_tree_RB_NEXT(tmprec);
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			if (cursor->iprec) {
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		}
	}

	/*
	 * MEMORY RECORD VALIDITY TEST
	 *
	 * (We still can't block, which is why tmprec is being held so
	 * long).
	 *
	 * If the memory record is no longer valid we skip it.  It may
	 * have been deleted by the frontend.  If it was deleted or
	 * committed by the backend the generation change re-seeked the
	 * disk cursor and the record will be present there.
	 */
	if (error == 0 && (cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
		KKASSERT(cursor->iprec);
		KKASSERT((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0);
		if (!hammer_ip_iterate_mem_good(cursor, cursor->iprec)) {
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
			if (tmprec)
				hammer_rel_mem_record(tmprec);
			goto again;
		}
	}
	if (tmprec)
		hammer_rel_mem_record(tmprec);

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 *
	 * NOTE(review): error is reset here, so an error other than ENOENT
	 * returned by hammer_btree_iterate() at the top of the function is
	 * discarded at this point; confirm that this is intended.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid.   Compare the entries and nominally
		 * return the first one in the sort order.  Numerous cases
		 * require special attention, however.
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base);

		/*
		 * If the two entries differ only by their key (-2/2) or
		 * create_tid (-1/1), and are DATA records, we may have a
		 * nominal match.  We have to calculate the base file
		 * offset of the data.
		 */
		if (r <= 2 && r >= -2 && r != 0 &&
		    cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE &&
		    cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
			int64_t base1 = elm->leaf.base.key - elm->leaf.data_len;
			int64_t base2 = cursor->iprec->leaf.base.key -
					cursor->iprec->leaf.data_len;
			if (base1 == base2)
				r = 0;
		}

		/* the media record sorts first, return it */
		if (r < 0) {
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_LEAF);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
			break;
		}

		/*
		 * If the entries match exactly the memory entry is either
		 * an on-disk directory entry deletion or a bulk data
		 * overwrite.  If it is a directory entry deletion we eat
		 * both entries.
		 *
		 * For the bulk-data overwrite case it is possible to have
		 * visibility into both, which simply means the syncer
		 * hasn't gotten around to doing the delete+insert sequence
		 * on the B-Tree.  Use the memory entry and throw away the
		 * on-disk entry.
		 *
		 * If the in-memory record is not either of these we
		 * probably caught the syncer while it was syncing it to
		 * the media.  Since we hold a shared lock on the cursor,
		 * the in-memory record had better be marked deleted at
		 * this point.
		 */
		if (r == 0) {
			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
					cursor->flags |= HAMMER_CURSOR_ATEMEM;
					goto again;
				}
			} else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
				}
				/* fall through to memory entry */
			} else {
				/*
				 * NOTE(review): panic() is presumably
				 * non-returning, which would make the two
				 * statements after it unreachable.
				 */
				panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags);
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				goto again;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.
		 */
		cursor->leaf = &cursor->iprec->leaf;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		cursor->flags |= HAMMER_CURSOR_LASTWASMEM;

		/*
		 * If the memory entry is an on-disk deletion we should have
		 * also had found a B-Tree record.  If the backend beat us
		 * to it it would have interlocked the cursor and we should
		 * have seen the in-memory record marked DELETED_FE.
		 */
		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL &&
		    (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
			panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags);
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM;
		cursor->leaf = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
 *
 * Returns 0 on success or the error reported by hammer_bread_ext() /
 * hammer_btree_extract().
 */
int
hammer_ip_resolve_data(hammer_cursor_t cursor)
{
	hammer_record_t record;
	int error;

	if (hammer_cursor_inmem(cursor)) {
		/*
		 * The data associated with an in-memory record is usually
		 * kmalloced, but reserve-ahead data records will have an
		 * on-disk reference.
		 *
		 * NOTE: Reserve-ahead data records must be handled in the
		 * context of the related high level buffer cache buffer
		 * to interlock against async writes.
		 */
		record = cursor->iprec;
		cursor->data = record->data;
		error = 0;
		if (cursor->data == NULL) {
			/* no in-memory copy, read it via the leaf's data_offset */
			KKASSERT(record->leaf.base.rec_type ==
				 HAMMER_RECTYPE_DATA);
			cursor->data = hammer_bread_ext(cursor->trans->hmp,
						    record->leaf.data_offset,
						    record->leaf.data_len,
						    &error,
						    &cursor->data_buffer);
		}
	} else {
		/* on-media record: point at the B-Tree leaf and extract */
		cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf;
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA);
	}
	return(error);
}

/*
 * Backend truncation / record replacement - delete records in range.
 *
 * Delete all records within the specified range for inode ip.  In-memory
 * records still associated with the frontend are ignored.
 *
 * If truncating is non-zero in-memory records associated with the back-end
 * are ignored.  If truncating is > 1 we can return EWOULDBLOCK.
 *
 * NOTES:
 *
 *	* An unaligned range will cause new records to be added to cover
 *	  the edge cases. (XXX not implemented yet).
 *
 *	* Replacement via reservations (see hammer_ip_sync_record_cursor())
 *	  also do not deal with unaligned ranges.
 *
 *	* ran_end is inclusive (e.g. 0,1023 instead of 0,1024).
 *
 *	* Record keys for regular file data have to be special-cased since
 *	  they indicate the end of the range (key = base + bytes).
 *
 *	* This function may be asked to delete ridiculously huge ranges, for
 *	  example if someone truncates or removes a 1TB regular file.  We
 *	  must be very careful on restarts and we may have to stop w/
 *	  EWOULDBLOCK to avoid blowing out the buffer cache.
1885 */ 1886 int 1887 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip, 1888 int64_t ran_beg, int64_t ran_end, int truncating) 1889 { 1890 hammer_transaction_t trans = cursor->trans; 1891 hammer_btree_leaf_elm_t leaf; 1892 int error; 1893 int64_t off; 1894 int64_t tmp64; 1895 1896 #if 0 1897 kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end); 1898 #endif 1899 1900 KKASSERT(trans->type == HAMMER_TRANS_FLS); 1901 retry: 1902 hammer_normalize_cursor(cursor); 1903 cursor->key_beg.localization = ip->obj_localization + 1904 HAMMER_LOCALIZE_MISC; 1905 cursor->key_beg.obj_id = ip->obj_id; 1906 cursor->key_beg.create_tid = 0; 1907 cursor->key_beg.delete_tid = 0; 1908 cursor->key_beg.obj_type = 0; 1909 1910 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1911 cursor->key_beg.key = ran_beg; 1912 cursor->key_beg.rec_type = HAMMER_RECTYPE_DB; 1913 } else { 1914 /* 1915 * The key in the B-Tree is (base+bytes), so the first possible 1916 * matching key is ran_beg + 1. 1917 */ 1918 cursor->key_beg.key = ran_beg + 1; 1919 cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA; 1920 } 1921 1922 cursor->key_end = cursor->key_beg; 1923 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1924 cursor->key_end.key = ran_end; 1925 } else { 1926 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */ 1927 if (tmp64 < ran_end) 1928 cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL; 1929 else 1930 cursor->key_end.key = ran_end + MAXPHYS + 1; 1931 } 1932 1933 cursor->asof = ip->obj_asof; 1934 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1935 cursor->flags |= HAMMER_CURSOR_ASOF; 1936 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 1937 cursor->flags |= HAMMER_CURSOR_BACKEND; 1938 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE; 1939 1940 error = hammer_ip_first(cursor); 1941 1942 /* 1943 * Iterate through matching records and mark them as deleted. 
1944 */ 1945 while (error == 0) { 1946 leaf = cursor->leaf; 1947 1948 KKASSERT(leaf->base.delete_tid == 0); 1949 KKASSERT(leaf->base.obj_id == ip->obj_id); 1950 1951 /* 1952 * There may be overlap cases for regular file data. Also 1953 * remember the key for a regular file record is (base + len), 1954 * NOT (base). 1955 * 1956 * Note that do to duplicates (mem & media) allowed by 1957 * DELETE_VISIBILITY, off can wind up less then ran_beg. 1958 */ 1959 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 1960 off = leaf->base.key - leaf->data_len; 1961 /* 1962 * Check the left edge case. We currently do not 1963 * split existing records. 1964 */ 1965 if (off < ran_beg && leaf->base.key > ran_beg) { 1966 panic("hammer left edge case %016llx %d\n", 1967 (long long)leaf->base.key, 1968 leaf->data_len); 1969 } 1970 1971 /* 1972 * Check the right edge case. Note that the 1973 * record can be completely out of bounds, which 1974 * terminates the search. 1975 * 1976 * base->key is exclusive of the right edge while 1977 * ran_end is inclusive of the right edge. The 1978 * (key - data_len) left boundary is inclusive. 1979 * 1980 * XXX theory-check this test at some point, are 1981 * we missing a + 1 somewhere? Note that ran_end 1982 * could overflow. 1983 */ 1984 if (leaf->base.key - 1 > ran_end) { 1985 if (leaf->base.key - leaf->data_len > ran_end) 1986 break; 1987 panic("hammer right edge case\n"); 1988 } 1989 } else { 1990 off = leaf->base.key; 1991 } 1992 1993 /* 1994 * Delete the record. When truncating we do not delete 1995 * in-memory (data) records because they represent data 1996 * written after the truncation. 1997 * 1998 * This will also physically destroy the B-Tree entry and 1999 * data if the retention policy dictates. The function 2000 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2001 * to retest the new 'current' element. 
2002 */ 2003 if (truncating == 0 || hammer_cursor_ondisk(cursor)) { 2004 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2005 /* 2006 * If we have built up too many meta-buffers we risk 2007 * deadlocking the kernel and must stop. This can 2008 * occur when deleting ridiculously huge files. 2009 * sync_trunc_off is updated so the next cycle does 2010 * not re-iterate records we have already deleted. 2011 * 2012 * This is only done with formal truncations. 2013 */ 2014 if (truncating > 1 && error == 0 && 2015 hammer_flusher_meta_limit(ip->hmp)) { 2016 ip->sync_trunc_off = off; 2017 error = EWOULDBLOCK; 2018 } 2019 } 2020 if (error) 2021 break; 2022 ran_beg = off; /* for restart */ 2023 error = hammer_ip_next(cursor); 2024 } 2025 if (cursor->node) 2026 hammer_cache_node(&ip->cache[1], cursor->node); 2027 2028 if (error == EDEADLK) { 2029 hammer_done_cursor(cursor); 2030 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2031 if (error == 0) 2032 goto retry; 2033 } 2034 if (error == ENOENT) 2035 error = 0; 2036 return(error); 2037 } 2038 2039 /* 2040 * This backend function deletes the specified record on-disk, similar to 2041 * delete_range but for a specific record. Unlike the exact deletions 2042 * used when deleting a directory entry this function uses an ASOF search 2043 * like delete_range. 2044 * 2045 * This function may be called with ip->obj_asof set for a slave snapshot, 2046 * so don't use it. We always delete non-historical records only. 
2047 */ 2048 static int 2049 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 2050 hammer_btree_leaf_elm_t leaf) 2051 { 2052 hammer_transaction_t trans = cursor->trans; 2053 int error; 2054 2055 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2056 retry: 2057 hammer_normalize_cursor(cursor); 2058 cursor->key_beg = leaf->base; 2059 cursor->asof = HAMMER_MAX_TID; 2060 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2061 cursor->flags |= HAMMER_CURSOR_ASOF; 2062 cursor->flags |= HAMMER_CURSOR_BACKEND; 2063 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2064 2065 error = hammer_btree_lookup(cursor); 2066 if (error == 0) { 2067 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2068 } 2069 if (error == EDEADLK) { 2070 hammer_done_cursor(cursor); 2071 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2072 if (error == 0) 2073 goto retry; 2074 } 2075 return(error); 2076 } 2077 2078 /* 2079 * This function deletes remaining auxillary records when an inode is 2080 * being deleted. This function explicitly does not delete the 2081 * inode record, directory entry, data, or db records. Those must be 2082 * properly disposed of prior to this call. 
2083 */ 2084 int 2085 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp) 2086 { 2087 hammer_transaction_t trans = cursor->trans; 2088 hammer_btree_leaf_elm_t leaf; 2089 int error; 2090 2091 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2092 retry: 2093 hammer_normalize_cursor(cursor); 2094 cursor->key_beg.localization = ip->obj_localization + 2095 HAMMER_LOCALIZE_MISC; 2096 cursor->key_beg.obj_id = ip->obj_id; 2097 cursor->key_beg.create_tid = 0; 2098 cursor->key_beg.delete_tid = 0; 2099 cursor->key_beg.obj_type = 0; 2100 cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START; 2101 cursor->key_beg.key = HAMMER_MIN_KEY; 2102 2103 cursor->key_end = cursor->key_beg; 2104 cursor->key_end.rec_type = HAMMER_RECTYPE_MAX; 2105 cursor->key_end.key = HAMMER_MAX_KEY; 2106 2107 cursor->asof = ip->obj_asof; 2108 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2109 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2110 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 2111 cursor->flags |= HAMMER_CURSOR_BACKEND; 2112 2113 error = hammer_ip_first(cursor); 2114 2115 /* 2116 * Iterate through matching records and mark them as deleted. 2117 */ 2118 while (error == 0) { 2119 leaf = cursor->leaf; 2120 2121 KKASSERT(leaf->base.delete_tid == 0); 2122 2123 /* 2124 * Mark the record and B-Tree entry as deleted. This will 2125 * also physically delete the B-Tree entry, record, and 2126 * data if the retention policy dictates. The function 2127 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2128 * to retest the new 'current' element. 2129 * 2130 * Directory entries (and delete-on-disk directory entries) 2131 * must be synced and cannot be deleted. 
2132 */ 2133 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2134 ++*countp; 2135 if (error) 2136 break; 2137 error = hammer_ip_next(cursor); 2138 } 2139 if (cursor->node) 2140 hammer_cache_node(&ip->cache[1], cursor->node); 2141 if (error == EDEADLK) { 2142 hammer_done_cursor(cursor); 2143 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2144 if (error == 0) 2145 goto retry; 2146 } 2147 if (error == ENOENT) 2148 error = 0; 2149 return(error); 2150 } 2151 2152 /* 2153 * Delete the record at the current cursor. On success the cursor will 2154 * be positioned appropriately for an iteration but may no longer be at 2155 * a leaf node. 2156 * 2157 * This routine is only called from the backend. 2158 * 2159 * NOTE: This can return EDEADLK, requiring the caller to terminate the 2160 * cursor and retry. 2161 */ 2162 int 2163 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip, 2164 hammer_tid_t tid) 2165 { 2166 hammer_record_t iprec; 2167 hammer_mount_t hmp; 2168 int error; 2169 2170 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 2171 KKASSERT(tid != 0); 2172 hmp = cursor->node->hmp; 2173 2174 /* 2175 * In-memory (unsynchronized) records can simply be freed. This 2176 * only occurs in range iterations since all other records are 2177 * individually synchronized. Thus there should be no confusion with 2178 * the interlock. 2179 * 2180 * An in-memory record may be deleted before being committed to disk, 2181 * but could have been accessed in the mean time. The reservation 2182 * code will deal with the case. 2183 */ 2184 if (hammer_cursor_inmem(cursor)) { 2185 iprec = cursor->iprec; 2186 KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); 2187 iprec->flags |= HAMMER_RECF_DELETED_FE; 2188 iprec->flags |= HAMMER_RECF_DELETED_BE; 2189 KKASSERT(iprec->ip == ip); 2190 ++ip->rec_generation; 2191 return(0); 2192 } 2193 2194 /* 2195 * On-disk records are marked as deleted by updating their delete_tid. 
2196 * This does not effect their position in the B-Tree (which is based 2197 * on their create_tid). 2198 * 2199 * Frontend B-Tree operations track inodes so we tell 2200 * hammer_delete_at_cursor() not to. 2201 */ 2202 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); 2203 2204 if (error == 0) { 2205 error = hammer_delete_at_cursor( 2206 cursor, 2207 HAMMER_DELETE_ADJUST | hammer_nohistory(ip), 2208 cursor->trans->tid, 2209 cursor->trans->time32, 2210 0, NULL); 2211 } 2212 return(error); 2213 } 2214 2215 /* 2216 * Used to write a generic record w/optional data to the media b-tree 2217 * when no inode context is available. Used by the mirroring and 2218 * snapshot code. 2219 * 2220 * Caller must set cursor->key_beg to leaf->base. The cursor must be 2221 * flagged for backend operation and not flagged ASOF (since we are 2222 * doing an insertion). 2223 * 2224 * This function will acquire the appropriate sync lock and will set 2225 * the cursor insertion flag for the operation, do the btree lookup, 2226 * and the insertion, and clear the insertion flag and sync lock before 2227 * returning. The cursor state will be such that the caller can continue 2228 * scanning (used by the mirroring code). 2229 * 2230 * mode: HAMMER_CREATE_MODE_UMIRROR copyin data, check crc 2231 * HAMMER_CREATE_MODE_SYS bcopy data, generate crc 2232 * 2233 * NOTE: EDEADLK can be returned. The caller must do deadlock handling and 2234 * retry. 2235 * 2236 * EALREADY can be returned if the record already exists (WARNING, 2237 * because ASOF cannot be used no check is made for illegal 2238 * duplicates). 2239 * 2240 * NOTE: Do not use the function for normal inode-related records as this 2241 * functions goes directly to the media and is not integrated with 2242 * in-memory records. 
2243 */ 2244 int 2245 hammer_create_at_cursor(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf, 2246 void *udata, int mode) 2247 { 2248 hammer_transaction_t trans; 2249 hammer_buffer_t data_buffer; 2250 hammer_off_t ndata_offset; 2251 hammer_tid_t high_tid; 2252 void *ndata; 2253 int error; 2254 int doprop; 2255 2256 trans = cursor->trans; 2257 data_buffer = NULL; 2258 ndata_offset = 0; 2259 doprop = 0; 2260 2261 KKASSERT((cursor->flags & 2262 (HAMMER_CURSOR_BACKEND | HAMMER_CURSOR_ASOF)) == 2263 (HAMMER_CURSOR_BACKEND)); 2264 2265 hammer_sync_lock_sh(trans); 2266 2267 if (leaf->data_len) { 2268 ndata = hammer_alloc_data(trans, leaf->data_len, 2269 leaf->base.rec_type, 2270 &ndata_offset, &data_buffer, 2271 0, &error); 2272 if (ndata == NULL) { 2273 hammer_sync_unlock(trans); 2274 return (error); 2275 } 2276 leaf->data_offset = ndata_offset; 2277 hammer_modify_buffer(trans, data_buffer, NULL, 0); 2278 2279 switch(mode) { 2280 case HAMMER_CREATE_MODE_UMIRROR: 2281 error = copyin(udata, ndata, leaf->data_len); 2282 if (error == 0) { 2283 if (hammer_crc_test_leaf(ndata, leaf) == 0) { 2284 kprintf("data crc mismatch on pipe\n"); 2285 error = EINVAL; 2286 } else { 2287 error = hammer_cursor_localize_data( 2288 ndata, leaf); 2289 } 2290 } 2291 break; 2292 case HAMMER_CREATE_MODE_SYS: 2293 bcopy(udata, ndata, leaf->data_len); 2294 error = 0; 2295 hammer_crc_set_leaf(ndata, leaf); 2296 break; 2297 default: 2298 panic("hammer: hammer_create_at_cursor: bad mode %d", 2299 mode); 2300 break; /* NOT REACHED */ 2301 } 2302 hammer_modify_buffer_done(data_buffer); 2303 } else { 2304 leaf->data_offset = 0; 2305 error = 0; 2306 ndata = NULL; 2307 } 2308 if (error) 2309 goto failed; 2310 2311 /* 2312 * Do the insertion. 
This can fail with a EDEADLK or EALREADY 2313 */ 2314 cursor->flags |= HAMMER_CURSOR_INSERT; 2315 error = hammer_btree_lookup(cursor); 2316 if (error != ENOENT) { 2317 if (error == 0) 2318 error = EALREADY; 2319 goto failed; 2320 } 2321 error = hammer_btree_insert(cursor, leaf, &doprop); 2322 2323 /* 2324 * Cursor is left on current element, we want to skip it now. 2325 * (in case the caller is scanning) 2326 */ 2327 cursor->flags |= HAMMER_CURSOR_ATEDISK; 2328 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2329 2330 /* 2331 * If the insertion happens to be creating (and not just replacing) 2332 * an inode we have to track it. 2333 */ 2334 if (error == 0 && 2335 leaf->base.rec_type == HAMMER_RECTYPE_INODE && 2336 leaf->base.delete_tid == 0) { 2337 hammer_modify_volume_field(trans, trans->rootvol, 2338 vol0_stat_inodes); 2339 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes; 2340 hammer_modify_volume_done(trans->rootvol); 2341 } 2342 2343 /* 2344 * vol0_next_tid must track the highest TID stored in the filesystem. 2345 * We do not need to generate undo for this update. 2346 */ 2347 high_tid = leaf->base.create_tid; 2348 if (high_tid < leaf->base.delete_tid) 2349 high_tid = leaf->base.delete_tid; 2350 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) { 2351 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2352 trans->rootvol->ondisk->vol0_next_tid = high_tid; 2353 hammer_modify_volume_done(trans->rootvol); 2354 } 2355 2356 /* 2357 * WARNING! cursor's leaf pointer may have changed after 2358 * do_propagation returns. 
2359 */ 2360 if (error == 0 && doprop) 2361 hammer_btree_do_propagation(cursor, NULL, leaf); 2362 2363 failed: 2364 /* 2365 * Cleanup 2366 */ 2367 if (error && leaf->data_offset) { 2368 hammer_blockmap_free(trans, leaf->data_offset, leaf->data_len); 2369 2370 } 2371 hammer_sync_unlock(trans); 2372 if (data_buffer) 2373 hammer_rel_buffer(data_buffer, 0); 2374 return (error); 2375 } 2376 2377 /* 2378 * Delete the B-Tree element at the current cursor and do any necessary 2379 * mirror propagation. 2380 * 2381 * The cursor must be properly positioned for an iteration on return but 2382 * may be pointing at an internal element. 2383 * 2384 * An element can be un-deleted by passing a delete_tid of 0 with 2385 * HAMMER_DELETE_ADJUST. 2386 */ 2387 int 2388 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, 2389 hammer_tid_t delete_tid, u_int32_t delete_ts, 2390 int track, int64_t *stat_bytes) 2391 { 2392 struct hammer_btree_leaf_elm save_leaf; 2393 hammer_transaction_t trans; 2394 hammer_btree_leaf_elm_t leaf; 2395 hammer_node_t node; 2396 hammer_btree_elm_t elm; 2397 hammer_off_t data_offset; 2398 int32_t data_len; 2399 u_int16_t rec_type; 2400 int error; 2401 int icount; 2402 int doprop; 2403 2404 error = hammer_cursor_upgrade(cursor); 2405 if (error) 2406 return(error); 2407 2408 trans = cursor->trans; 2409 node = cursor->node; 2410 elm = &node->ondisk->elms[cursor->index]; 2411 leaf = &elm->leaf; 2412 KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD); 2413 2414 hammer_sync_lock_sh(trans); 2415 doprop = 0; 2416 icount = 0; 2417 2418 /* 2419 * Adjust the delete_tid. Update the mirror_tid propagation field 2420 * as well. delete_tid can be 0 (undelete -- used by mirroring). 
2421 */ 2422 if (delete_flags & HAMMER_DELETE_ADJUST) { 2423 if (elm->base.rec_type == HAMMER_RECTYPE_INODE) { 2424 if (elm->leaf.base.delete_tid == 0 && delete_tid) 2425 icount = -1; 2426 if (elm->leaf.base.delete_tid && delete_tid == 0) 2427 icount = 1; 2428 } 2429 2430 hammer_modify_node(trans, node, elm, sizeof(*elm)); 2431 elm->leaf.base.delete_tid = delete_tid; 2432 elm->leaf.delete_ts = delete_ts; 2433 hammer_modify_node_done(node); 2434 2435 if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) { 2436 hammer_modify_node_field(trans, node, mirror_tid); 2437 node->ondisk->mirror_tid = elm->leaf.base.delete_tid; 2438 hammer_modify_node_done(node); 2439 doprop = 1; 2440 if (hammer_debug_general & 0x0002) { 2441 kprintf("delete_at_cursor: propagate %016llx" 2442 " @%016llx\n", 2443 (long long)elm->leaf.base.delete_tid, 2444 (long long)node->node_offset); 2445 } 2446 } 2447 2448 /* 2449 * Adjust for the iteration. We have deleted the current 2450 * element and want to clear ATEDISK so the iteration does 2451 * not skip the element after, which now becomes the current 2452 * element. This element must be re-tested if doing an 2453 * iteration, which is handled by the RETEST flag. 2454 */ 2455 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2456 cursor->flags |= HAMMER_CURSOR_RETEST; 2457 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2458 } 2459 2460 /* 2461 * An on-disk record cannot have the same delete_tid 2462 * as its create_tid. In a chain of record updates 2463 * this could result in a duplicate record. 2464 */ 2465 KKASSERT(elm->leaf.base.delete_tid != 2466 elm->leaf.base.create_tid); 2467 } 2468 2469 /* 2470 * Destroy the B-Tree element if asked (typically if a nohistory 2471 * file or mount, or when called by the pruning code). 2472 * 2473 * Adjust the ATEDISK flag to properly support iterations. 
2474 */ 2475 if (delete_flags & HAMMER_DELETE_DESTROY) { 2476 data_offset = elm->leaf.data_offset; 2477 data_len = elm->leaf.data_len; 2478 rec_type = elm->leaf.base.rec_type; 2479 if (doprop) { 2480 save_leaf = elm->leaf; 2481 leaf = &save_leaf; 2482 } 2483 if (elm->base.rec_type == HAMMER_RECTYPE_INODE && 2484 elm->leaf.base.delete_tid == 0) { 2485 icount = -1; 2486 } 2487 2488 error = hammer_btree_delete(cursor); 2489 if (error == 0) { 2490 /* 2491 * The deletion moves the next element (if any) to 2492 * the current element position. We must clear 2493 * ATEDISK so this element is not skipped and we 2494 * must set RETEST to force any iteration to re-test 2495 * the element. 2496 */ 2497 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2498 cursor->flags |= HAMMER_CURSOR_RETEST; 2499 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2500 } 2501 } 2502 if (error == 0) { 2503 switch(data_offset & HAMMER_OFF_ZONE_MASK) { 2504 case HAMMER_ZONE_LARGE_DATA: 2505 case HAMMER_ZONE_SMALL_DATA: 2506 case HAMMER_ZONE_META: 2507 hammer_blockmap_free(trans, 2508 data_offset, data_len); 2509 break; 2510 default: 2511 break; 2512 } 2513 } 2514 } 2515 2516 /* 2517 * Track inode count and next_tid. This is used by the mirroring 2518 * and PFS code. icount can be negative, zero, or positive. 2519 */ 2520 if (error == 0 && track) { 2521 if (icount) { 2522 hammer_modify_volume_field(trans, trans->rootvol, 2523 vol0_stat_inodes); 2524 trans->rootvol->ondisk->vol0_stat_inodes += icount; 2525 hammer_modify_volume_done(trans->rootvol); 2526 } 2527 if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) { 2528 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2529 trans->rootvol->ondisk->vol0_next_tid = delete_tid; 2530 hammer_modify_volume_done(trans->rootvol); 2531 } 2532 } 2533 2534 /* 2535 * mirror_tid propagation occurs if the node's mirror_tid had to be 2536 * updated while adjusting the delete_tid. 
2537 * 2538 * This occurs when deleting even in nohistory mode, but does not 2539 * occur when pruning an already-deleted node. 2540 * 2541 * cursor->ip is NULL when called from the pruning, mirroring, 2542 * and pfs code. If non-NULL propagation will be conditionalized 2543 * on whether the PFS is in no-history mode or not. 2544 * 2545 * WARNING: cursor's leaf pointer may have changed after do_propagation 2546 * returns! 2547 */ 2548 if (doprop) { 2549 if (cursor->ip) 2550 hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf); 2551 else 2552 hammer_btree_do_propagation(cursor, NULL, leaf); 2553 } 2554 hammer_sync_unlock(trans); 2555 return (error); 2556 } 2557 2558 /* 2559 * Determine whether we can remove a directory. This routine checks whether 2560 * a directory is empty or not and enforces flush connectivity. 2561 * 2562 * Flush connectivity requires that we block if the target directory is 2563 * currently flushing, otherwise it may not end up in the same flush group. 2564 * 2565 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure. 
2566 */ 2567 int 2568 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) 2569 { 2570 struct hammer_cursor cursor; 2571 int error; 2572 2573 /* 2574 * Check directory empty 2575 */ 2576 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 2577 2578 cursor.key_beg.localization = ip->obj_localization + 2579 hammer_dir_localization(ip); 2580 cursor.key_beg.obj_id = ip->obj_id; 2581 cursor.key_beg.create_tid = 0; 2582 cursor.key_beg.delete_tid = 0; 2583 cursor.key_beg.obj_type = 0; 2584 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; 2585 cursor.key_beg.key = HAMMER_MIN_KEY; 2586 2587 cursor.key_end = cursor.key_beg; 2588 cursor.key_end.rec_type = 0xFFFF; 2589 cursor.key_end.key = HAMMER_MAX_KEY; 2590 2591 cursor.asof = ip->obj_asof; 2592 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2593 2594 error = hammer_ip_first(&cursor); 2595 if (error == ENOENT) 2596 error = 0; 2597 else if (error == 0) 2598 error = ENOTEMPTY; 2599 hammer_done_cursor(&cursor); 2600 return(error); 2601 } 2602 2603 /* 2604 * Localize the data payload. Directory entries may need their 2605 * localization adjusted. 2606 */ 2607 static 2608 int 2609 hammer_cursor_localize_data(hammer_data_ondisk_t data, 2610 hammer_btree_leaf_elm_t leaf) 2611 { 2612 u_int32_t localization; 2613 2614 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) { 2615 localization = leaf->base.localization & 2616 HAMMER_LOCALIZE_PSEUDOFS_MASK; 2617 if (data->entry.localization != localization) { 2618 data->entry.localization = localization; 2619 hammer_crc_set_leaf(data, leaf); 2620 } 2621 } 2622 return(0); 2623 } 2624