1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.97 2008/09/23 22:28:56 dillon Exp $ 35 */ 36 37 #include "hammer.h" 38 39 static int hammer_mem_lookup(hammer_cursor_t cursor); 40 static void hammer_mem_first(hammer_cursor_t cursor); 41 static int hammer_frontend_trunc_callback(hammer_record_t record, 42 void *data __unused); 43 static int hammer_bulk_scan_callback(hammer_record_t record, void *data); 44 static int hammer_record_needs_overwrite_delete(hammer_record_t record); 45 static int hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 46 hammer_btree_leaf_elm_t leaf); 47 static int hammer_cursor_localize_data(hammer_data_ondisk_t data, 48 hammer_btree_leaf_elm_t leaf); 49 50 struct rec_trunc_info { 51 u_int16_t rec_type; 52 int64_t trunc_off; 53 }; 54 55 struct hammer_bulk_info { 56 hammer_record_t record; 57 hammer_record_t conflict; 58 }; 59 60 /* 61 * Red-black tree support. Comparison code for insertion. 62 */ 63 static int 64 hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2) 65 { 66 if (rec1->leaf.base.rec_type < rec2->leaf.base.rec_type) 67 return(-1); 68 if (rec1->leaf.base.rec_type > rec2->leaf.base.rec_type) 69 return(1); 70 71 if (rec1->leaf.base.key < rec2->leaf.base.key) 72 return(-1); 73 if (rec1->leaf.base.key > rec2->leaf.base.key) 74 return(1); 75 76 /* 77 * For search & insertion purposes records deleted by the 78 * frontend or deleted/committed by the backend are silently 79 * ignored. Otherwise pipelined insertions will get messed 80 * up. 81 * 82 * rec1 is greater then rec2 if rec1 is marked deleted. 83 * rec1 is less then rec2 if rec2 is marked deleted. 84 * 85 * Multiple deleted records may be present, do not return 0 86 * if both are marked deleted. 87 */ 88 if (rec1->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 89 HAMMER_RECF_COMMITTED)) { 90 return(1); 91 } 92 if (rec2->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 93 HAMMER_RECF_COMMITTED)) { 94 return(-1); 95 } 96 97 return(0); 98 } 99 100 /* 101 * Basic record comparison code similar to hammer_btree_cmp(). 102 * 103 * obj_id is not compared and may not yet be assigned in the record. 104 */ 105 static int 106 hammer_rec_cmp(hammer_base_elm_t elm, hammer_record_t rec) 107 { 108 if (elm->rec_type < rec->leaf.base.rec_type) 109 return(-3); 110 if (elm->rec_type > rec->leaf.base.rec_type) 111 return(3); 112 113 if (elm->key < rec->leaf.base.key) 114 return(-2); 115 if (elm->key > rec->leaf.base.key) 116 return(2); 117 118 /* 119 * Never match against an item deleted by the frontend 120 * or backend, or committed by the backend. 121 * 122 * elm is less then rec if rec is marked deleted. 123 */ 124 if (rec->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 125 HAMMER_RECF_COMMITTED)) { 126 return(-1); 127 } 128 return(0); 129 } 130 131 /* 132 * Ranged scan to locate overlapping record(s). This is used by 133 * hammer_ip_get_bulk() to locate an overlapping record. We have 134 * to use a ranged scan because the keys for data records with the 135 * same file base offset can be different due to differing data_len's. 136 * 137 * NOTE: The base file offset of a data record is (key - data_len), not (key). 138 */ 139 static int 140 hammer_rec_overlap_cmp(hammer_record_t rec, void *data) 141 { 142 struct hammer_bulk_info *info = data; 143 hammer_btree_leaf_elm_t leaf = &info->record->leaf; 144 145 if (rec->leaf.base.rec_type < leaf->base.rec_type) 146 return(-3); 147 if (rec->leaf.base.rec_type > leaf->base.rec_type) 148 return(3); 149 150 /* 151 * Overlap compare 152 */ 153 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 154 /* rec_beg >= leaf_end */ 155 if (rec->leaf.base.key - rec->leaf.data_len >= leaf->base.key) 156 return(2); 157 /* rec_end <= leaf_beg */ 158 if (rec->leaf.base.key <= leaf->base.key - leaf->data_len) 159 return(-2); 160 } else { 161 if (rec->leaf.base.key < leaf->base.key) 162 return(-2); 163 if (rec->leaf.base.key > leaf->base.key) 164 return(2); 165 } 166 167 /* 168 * We have to return 0 at this point, even if DELETED_FE is set, 169 * because returning anything else will cause the scan to ignore 170 * one of the branches when we really want it to check both. 171 */ 172 return(0); 173 } 174 175 /* 176 * RB_SCAN comparison code for hammer_mem_first(). The argument order 177 * is reversed so the comparison result has to be negated. key_beg and 178 * key_end are both range-inclusive. 179 * 180 * Localized deletions are not cached in-memory. 181 */ 182 static 183 int 184 hammer_rec_scan_cmp(hammer_record_t rec, void *data) 185 { 186 hammer_cursor_t cursor = data; 187 int r; 188 189 r = hammer_rec_cmp(&cursor->key_beg, rec); 190 if (r > 1) 191 return(-1); 192 r = hammer_rec_cmp(&cursor->key_end, rec); 193 if (r < -1) 194 return(1); 195 return(0); 196 } 197 198 /* 199 * This compare function is used when simply looking up key_beg. 200 */ 201 static 202 int 203 hammer_rec_find_cmp(hammer_record_t rec, void *data) 204 { 205 hammer_cursor_t cursor = data; 206 int r; 207 208 r = hammer_rec_cmp(&cursor->key_beg, rec); 209 if (r > 1) 210 return(-1); 211 if (r < -1) 212 return(1); 213 return(0); 214 } 215 216 /* 217 * Locate blocks within the truncation range. Partial blocks do not count. 218 */ 219 static 220 int 221 hammer_rec_trunc_cmp(hammer_record_t rec, void *data) 222 { 223 struct rec_trunc_info *info = data; 224 225 if (rec->leaf.base.rec_type < info->rec_type) 226 return(-1); 227 if (rec->leaf.base.rec_type > info->rec_type) 228 return(1); 229 230 switch(rec->leaf.base.rec_type) { 231 case HAMMER_RECTYPE_DB: 232 /* 233 * DB record key is not beyond the truncation point, retain. 234 */ 235 if (rec->leaf.base.key < info->trunc_off) 236 return(-1); 237 break; 238 case HAMMER_RECTYPE_DATA: 239 /* 240 * DATA record offset start is not beyond the truncation point, 241 * retain. 242 */ 243 if (rec->leaf.base.key - rec->leaf.data_len < info->trunc_off) 244 return(-1); 245 break; 246 default: 247 panic("hammer_rec_trunc_cmp: unexpected record type"); 248 } 249 250 /* 251 * The record start is >= the truncation point, return match, 252 * the record should be destroyed. 253 */ 254 return(0); 255 } 256 257 RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare); 258 259 /* 260 * Allocate a record for the caller to finish filling in. The record is 261 * returned referenced. 262 */ 263 hammer_record_t 264 hammer_alloc_mem_record(hammer_inode_t ip, int data_len) 265 { 266 hammer_record_t record; 267 hammer_mount_t hmp; 268 269 hmp = ip->hmp; 270 ++hammer_count_records; 271 record = kmalloc(sizeof(*record), hmp->m_misc, 272 M_WAITOK | M_ZERO | M_USE_RESERVE); 273 record->flush_state = HAMMER_FST_IDLE; 274 record->ip = ip; 275 record->leaf.base.btype = HAMMER_BTREE_TYPE_RECORD; 276 record->leaf.data_len = data_len; 277 hammer_ref(&record->lock); 278 279 if (data_len) { 280 record->data = kmalloc(data_len, hmp->m_misc, M_WAITOK | M_ZERO); 281 record->flags |= HAMMER_RECF_ALLOCDATA; 282 ++hammer_count_record_datas; 283 } 284 285 return (record); 286 } 287 288 void 289 hammer_wait_mem_record_ident(hammer_record_t record, const char *ident) 290 { 291 while (record->flush_state == HAMMER_FST_FLUSH) { 292 record->flags |= HAMMER_RECF_WANTED; 293 tsleep(record, 0, ident, 0); 294 } 295 } 296 297 /* 298 * Called from the backend, hammer_inode.c, after a record has been 299 * flushed to disk. The record has been exclusively locked by the 300 * caller and interlocked with BE. 301 * 302 * We clean up the state, unlock, and release the record (the record 303 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state). 304 */ 305 void 306 hammer_flush_record_done(hammer_record_t record, int error) 307 { 308 hammer_inode_t target_ip; 309 310 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 311 KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); 312 313 /* 314 * If an error occured, the backend was unable to sync the 315 * record to its media. Leave the record intact. 316 */ 317 if (error) { 318 hammer_critical_error(record->ip->hmp, record->ip, error, 319 "while flushing record"); 320 } 321 322 --record->flush_group->refs; 323 record->flush_group = NULL; 324 325 /* 326 * Adjust the flush state and dependancy based on success or 327 * failure. 328 */ 329 if (record->flags & (HAMMER_RECF_DELETED_BE | HAMMER_RECF_COMMITTED)) { 330 if ((target_ip = record->target_ip) != NULL) { 331 TAILQ_REMOVE(&target_ip->target_list, record, 332 target_entry); 333 record->target_ip = NULL; 334 hammer_test_inode(target_ip); 335 } 336 record->flush_state = HAMMER_FST_IDLE; 337 } else { 338 if (record->target_ip) { 339 record->flush_state = HAMMER_FST_SETUP; 340 hammer_test_inode(record->ip); 341 hammer_test_inode(record->target_ip); 342 } else { 343 record->flush_state = HAMMER_FST_IDLE; 344 } 345 } 346 record->flags &= ~HAMMER_RECF_INTERLOCK_BE; 347 348 /* 349 * Cleanup 350 */ 351 if (record->flags & HAMMER_RECF_WANTED) { 352 record->flags &= ~HAMMER_RECF_WANTED; 353 wakeup(record); 354 } 355 hammer_rel_mem_record(record); 356 } 357 358 /* 359 * Release a memory record. Records marked for deletion are immediately 360 * removed from the RB-Tree but otherwise left intact until the last ref 361 * goes away. 362 */ 363 void 364 hammer_rel_mem_record(struct hammer_record *record) 365 { 366 hammer_mount_t hmp; 367 hammer_reserve_t resv; 368 hammer_inode_t ip; 369 hammer_inode_t target_ip; 370 int diddrop; 371 372 hammer_rel(&record->lock); 373 374 if (hammer_norefs(&record->lock)) { 375 /* 376 * Upon release of the last reference wakeup any waiters. 377 * The record structure may get destroyed so callers will 378 * loop up and do a relookup. 379 * 380 * WARNING! Record must be removed from RB-TREE before we 381 * might possibly block. hammer_test_inode() can block! 382 */ 383 ip = record->ip; 384 hmp = ip->hmp; 385 386 /* 387 * Upon release of the last reference a record marked deleted 388 * by the front or backend, or committed by the backend, 389 * is destroyed. 390 */ 391 if (record->flags & (HAMMER_RECF_DELETED_FE | 392 HAMMER_RECF_DELETED_BE | 393 HAMMER_RECF_COMMITTED)) { 394 KKASSERT(hammer_isactive(&ip->lock) > 0); 395 KKASSERT(record->flush_state != HAMMER_FST_FLUSH); 396 397 /* 398 * target_ip may have zero refs, we have to ref it 399 * to prevent it from being ripped out from under 400 * us. 401 */ 402 if ((target_ip = record->target_ip) != NULL) { 403 TAILQ_REMOVE(&target_ip->target_list, 404 record, target_entry); 405 record->target_ip = NULL; 406 hammer_ref(&target_ip->lock); 407 } 408 409 /* 410 * Remove the record from the B-Tree 411 */ 412 if (record->flags & HAMMER_RECF_ONRBTREE) { 413 RB_REMOVE(hammer_rec_rb_tree, 414 &record->ip->rec_tree, 415 record); 416 record->flags &= ~HAMMER_RECF_ONRBTREE; 417 KKASSERT(ip->rsv_recs > 0); 418 if (RB_EMPTY(&record->ip->rec_tree)) { 419 record->ip->flags &= 420 ~HAMMER_INODE_XDIRTY; 421 record->ip->sync_flags &= 422 ~HAMMER_INODE_XDIRTY; 423 } 424 diddrop = 1; 425 } else { 426 diddrop = 0; 427 } 428 429 /* 430 * We must wait for any direct-IO to complete before 431 * we can destroy the record because the bio may 432 * have a reference to it. 433 */ 434 if (record->gflags & 435 (HAMMER_RECG_DIRECT_IO | HAMMER_RECG_DIRECT_INVAL)) { 436 hammer_io_direct_wait(record); 437 } 438 439 /* 440 * Account for the completion after the direct IO 441 * has completed. 442 */ 443 if (diddrop) { 444 --hmp->rsv_recs; 445 --ip->rsv_recs; 446 hmp->rsv_databytes -= record->leaf.data_len; 447 448 if (RB_EMPTY(&record->ip->rec_tree)) 449 hammer_test_inode(record->ip); 450 if (ip->rsv_recs == hammer_limit_inode_recs - 1) 451 wakeup(&ip->rsv_recs); 452 } 453 454 /* 455 * Do this test after removing record from the B-Tree. 456 */ 457 if (target_ip) { 458 hammer_test_inode(target_ip); 459 hammer_rel_inode(target_ip, 0); 460 } 461 462 if (record->flags & HAMMER_RECF_ALLOCDATA) { 463 --hammer_count_record_datas; 464 kfree(record->data, hmp->m_misc); 465 record->flags &= ~HAMMER_RECF_ALLOCDATA; 466 } 467 468 /* 469 * Release the reservation. 470 * 471 * If the record was not committed we can theoretically 472 * undo the reservation. However, doing so might 473 * create weird edge cases with the ordering of 474 * direct writes because the related buffer cache 475 * elements are per-vnode. So we don't try. 476 */ 477 if ((resv = record->resv) != NULL) { 478 /* XXX undo leaf.data_offset,leaf.data_len */ 479 hammer_blockmap_reserve_complete(hmp, resv); 480 record->resv = NULL; 481 } 482 record->data = NULL; 483 --hammer_count_records; 484 kfree(record, hmp->m_misc); 485 } 486 } 487 } 488 489 /* 490 * Record visibility depends on whether the record is being accessed by 491 * the backend or the frontend. Backend tests ignore the frontend delete 492 * flag. Frontend tests do NOT ignore the backend delete/commit flags and 493 * must also check for commit races. 494 * 495 * Return non-zero if the record is visible, zero if it isn't or if it is 496 * deleted. Returns 0 if the record has been comitted (unless the special 497 * delete-visibility flag is set). A committed record must be located 498 * via the media B-Tree. Returns non-zero if the record is good. 499 * 500 * If HAMMER_CURSOR_DELETE_VISIBILITY is set we allow deleted memory 501 * records to be returned. This is so pending deletions are detected 502 * when using an iterator to locate an unused hash key, or when we need 503 * to locate historical records on-disk to destroy. 504 */ 505 static __inline 506 int 507 hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record) 508 { 509 if (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) 510 return(1); 511 if (cursor->flags & HAMMER_CURSOR_BACKEND) { 512 if (record->flags & (HAMMER_RECF_DELETED_BE | 513 HAMMER_RECF_COMMITTED)) { 514 return(0); 515 } 516 } else { 517 if (record->flags & (HAMMER_RECF_DELETED_FE | 518 HAMMER_RECF_DELETED_BE | 519 HAMMER_RECF_COMMITTED)) { 520 return(0); 521 } 522 } 523 return(1); 524 } 525 526 /* 527 * This callback is used as part of the RB_SCAN function for in-memory 528 * records. We terminate it (return -1) as soon as we get a match. 529 * 530 * This routine is used by frontend code. 531 * 532 * The primary compare code does not account for ASOF lookups. This 533 * code handles that case as well as a few others. 534 */ 535 static 536 int 537 hammer_rec_scan_callback(hammer_record_t rec, void *data) 538 { 539 hammer_cursor_t cursor = data; 540 541 /* 542 * We terminate on success, so this should be NULL on entry. 543 */ 544 KKASSERT(cursor->iprec == NULL); 545 546 /* 547 * Skip if the record was marked deleted or committed. 548 */ 549 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) 550 return(0); 551 552 /* 553 * Skip if not visible due to our as-of TID 554 */ 555 if (cursor->flags & HAMMER_CURSOR_ASOF) { 556 if (cursor->asof < rec->leaf.base.create_tid) 557 return(0); 558 if (rec->leaf.base.delete_tid && 559 cursor->asof >= rec->leaf.base.delete_tid) { 560 return(0); 561 } 562 } 563 564 /* 565 * ref the record. The record is protected from backend B-Tree 566 * interactions by virtue of the cursor's IP lock. 567 */ 568 hammer_ref(&rec->lock); 569 570 /* 571 * The record may have been deleted or committed while we 572 * were blocked. XXX remove? 573 */ 574 if (hammer_ip_iterate_mem_good(cursor, rec) == 0) { 575 hammer_rel_mem_record(rec); 576 return(0); 577 } 578 579 /* 580 * Set the matching record and stop the scan. 581 */ 582 cursor->iprec = rec; 583 return(-1); 584 } 585 586 587 /* 588 * Lookup an in-memory record given the key specified in the cursor. Works 589 * just like hammer_btree_lookup() but operates on an inode's in-memory 590 * record list. 591 * 592 * The lookup must fail if the record is marked for deferred deletion. 593 * 594 * The API for mem/btree_lookup() does not mess with the ATE/EOF bits. 595 */ 596 static 597 int 598 hammer_mem_lookup(hammer_cursor_t cursor) 599 { 600 KKASSERT(cursor->ip); 601 if (cursor->iprec) { 602 hammer_rel_mem_record(cursor->iprec); 603 cursor->iprec = NULL; 604 } 605 hammer_rec_rb_tree_RB_SCAN(&cursor->ip->rec_tree, hammer_rec_find_cmp, 606 hammer_rec_scan_callback, cursor); 607 608 return (cursor->iprec ? 0 : ENOENT); 609 } 610 611 /* 612 * hammer_mem_first() - locate the first in-memory record matching the 613 * cursor within the bounds of the key range. 614 * 615 * WARNING! API is slightly different from btree_first(). hammer_mem_first() 616 * will set ATEMEM the same as MEMEOF, and does not return any error. 617 */ 618 static 619 void 620 hammer_mem_first(hammer_cursor_t cursor) 621 { 622 hammer_inode_t ip; 623 624 ip = cursor->ip; 625 KKASSERT(ip != NULL); 626 627 if (cursor->iprec) { 628 hammer_rel_mem_record(cursor->iprec); 629 cursor->iprec = NULL; 630 } 631 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp, 632 hammer_rec_scan_callback, cursor); 633 634 if (cursor->iprec) 635 cursor->flags &= ~(HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM); 636 else 637 cursor->flags |= HAMMER_CURSOR_MEMEOF | HAMMER_CURSOR_ATEMEM; 638 } 639 640 /************************************************************************ 641 * HAMMER IN-MEMORY RECORD FUNCTIONS * 642 ************************************************************************ 643 * 644 * These functions manipulate in-memory records. Such records typically 645 * exist prior to being committed to disk or indexed via the on-disk B-Tree. 646 */ 647 648 /* 649 * Add a directory entry (dip,ncp) which references inode (ip). 650 * 651 * Note that the low 32 bits of the namekey are set temporarily to create 652 * a unique in-memory record, and may be modified a second time when the 653 * record is synchronized to disk. In particular, the low 32 bits cannot be 654 * all 0's when synching to disk, which is not handled here. 655 * 656 * NOTE: bytes does not include any terminating \0 on name, and name might 657 * not be terminated. 658 */ 659 int 660 hammer_ip_add_directory(struct hammer_transaction *trans, 661 struct hammer_inode *dip, const char *name, int bytes, 662 struct hammer_inode *ip) 663 { 664 struct hammer_cursor cursor; 665 hammer_record_t record; 666 int error; 667 u_int32_t max_iterations; 668 669 record = hammer_alloc_mem_record(dip, HAMMER_ENTRY_SIZE(bytes)); 670 671 record->type = HAMMER_MEM_RECORD_ADD; 672 record->leaf.base.localization = dip->obj_localization + 673 hammer_dir_localization(dip); 674 record->leaf.base.obj_id = dip->obj_id; 675 record->leaf.base.key = hammer_directory_namekey(dip, name, bytes, 676 &max_iterations); 677 record->leaf.base.rec_type = HAMMER_RECTYPE_DIRENTRY; 678 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 679 record->data->entry.obj_id = ip->obj_id; 680 record->data->entry.localization = ip->obj_localization; 681 bcopy(name, record->data->entry.name, bytes); 682 683 ++ip->ino_data.nlinks; 684 ip->ino_data.ctime = trans->time; 685 hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); 686 687 /* 688 * Find an unused namekey. Both the in-memory record tree and 689 * the B-Tree are checked. We do not want historically deleted 690 * names to create a collision as our iteration space may be limited, 691 * and since create_tid wouldn't match anyway an ASOF search 692 * must be used to locate collisions. 693 * 694 * delete-visibility is set so pending deletions do not give us 695 * a false-negative on our ability to use an iterator. 696 * 697 * The iterator must not rollover the key. Directory keys only 698 * use the positive key space. 699 */ 700 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip); 701 cursor.key_beg = record->leaf.base; 702 cursor.flags |= HAMMER_CURSOR_ASOF; 703 cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 704 cursor.asof = ip->obj_asof; 705 706 while (hammer_ip_lookup(&cursor) == 0) { 707 ++record->leaf.base.key; 708 KKASSERT(record->leaf.base.key > 0); 709 cursor.key_beg.key = record->leaf.base.key; 710 if (--max_iterations == 0) { 711 hammer_rel_mem_record(record); 712 error = ENOSPC; 713 goto failed; 714 } 715 } 716 717 /* 718 * The target inode and the directory entry are bound together. 719 */ 720 record->target_ip = ip; 721 record->flush_state = HAMMER_FST_SETUP; 722 TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry); 723 724 /* 725 * The inode now has a dependancy and must be taken out of the idle 726 * state. An inode not in an idle state is given an extra reference. 727 * 728 * When transitioning to a SETUP state flag for an automatic reflush 729 * when the dependancies are disposed of if someone is waiting on 730 * the inode. 731 */ 732 if (ip->flush_state == HAMMER_FST_IDLE) { 733 hammer_ref(&ip->lock); 734 ip->flush_state = HAMMER_FST_SETUP; 735 if (ip->flags & HAMMER_INODE_FLUSHW) 736 ip->flags |= HAMMER_INODE_REFLUSH; 737 } 738 error = hammer_mem_add(record); 739 if (error == 0) { 740 dip->ino_data.mtime = trans->time; 741 hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); 742 } 743 failed: 744 hammer_done_cursor(&cursor); 745 return(error); 746 } 747 748 /* 749 * Delete the directory entry and update the inode link count. The 750 * cursor must be seeked to the directory entry record being deleted. 751 * 752 * The related inode should be share-locked by the caller. The caller is 753 * on the frontend. It could also be NULL indicating that the directory 754 * entry being removed has no related inode. 755 * 756 * This function can return EDEADLK requiring the caller to terminate 757 * the cursor, any locks, wait on the returned record, and retry. 758 */ 759 int 760 hammer_ip_del_directory(struct hammer_transaction *trans, 761 hammer_cursor_t cursor, struct hammer_inode *dip, 762 struct hammer_inode *ip) 763 { 764 hammer_record_t record; 765 int error; 766 767 if (hammer_cursor_inmem(cursor)) { 768 /* 769 * In-memory (unsynchronized) records can simply be freed. 770 * 771 * Even though the HAMMER_RECF_DELETED_FE flag is ignored 772 * by the backend, we must still avoid races against the 773 * backend potentially syncing the record to the media. 774 * 775 * We cannot call hammer_ip_delete_record(), that routine may 776 * only be called from the backend. 777 */ 778 record = cursor->iprec; 779 if (record->flags & (HAMMER_RECF_INTERLOCK_BE | 780 HAMMER_RECF_DELETED_BE | 781 HAMMER_RECF_COMMITTED)) { 782 KKASSERT(cursor->deadlk_rec == NULL); 783 hammer_ref(&record->lock); 784 cursor->deadlk_rec = record; 785 error = EDEADLK; 786 } else { 787 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); 788 record->flags |= HAMMER_RECF_DELETED_FE; 789 error = 0; 790 } 791 } else { 792 /* 793 * If the record is on-disk we have to queue the deletion by 794 * the record's key. This also causes lookups to skip the 795 * record (lookups for the purposes of finding an unused 796 * directory key do not skip the record). 797 */ 798 KKASSERT(dip->flags & 799 (HAMMER_INODE_ONDISK | HAMMER_INODE_DONDISK)); 800 record = hammer_alloc_mem_record(dip, 0); 801 record->type = HAMMER_MEM_RECORD_DEL; 802 record->leaf.base = cursor->leaf->base; 803 KKASSERT(dip->obj_id == record->leaf.base.obj_id); 804 805 /* 806 * ip may be NULL, indicating the deletion of a directory 807 * entry which has no related inode. 808 */ 809 record->target_ip = ip; 810 if (ip) { 811 record->flush_state = HAMMER_FST_SETUP; 812 TAILQ_INSERT_TAIL(&ip->target_list, record, 813 target_entry); 814 } else { 815 record->flush_state = HAMMER_FST_IDLE; 816 } 817 818 /* 819 * The inode now has a dependancy and must be taken out of 820 * the idle state. An inode not in an idle state is given 821 * an extra reference. 822 * 823 * When transitioning to a SETUP state flag for an automatic 824 * reflush when the dependancies are disposed of if someone 825 * is waiting on the inode. 826 */ 827 if (ip && ip->flush_state == HAMMER_FST_IDLE) { 828 hammer_ref(&ip->lock); 829 ip->flush_state = HAMMER_FST_SETUP; 830 if (ip->flags & HAMMER_INODE_FLUSHW) 831 ip->flags |= HAMMER_INODE_REFLUSH; 832 } 833 834 error = hammer_mem_add(record); 835 } 836 837 /* 838 * One less link. The file may still be open in the OS even after 839 * all links have gone away. 840 * 841 * We have to terminate the cursor before syncing the inode to 842 * avoid deadlocking against ourselves. XXX this may no longer 843 * be true. 844 * 845 * If nlinks drops to zero and the vnode is inactive (or there is 846 * no vnode), call hammer_inode_unloadable_check() to zonk the 847 * inode. If we don't do this here the inode will not be destroyed 848 * on-media until we unmount. 849 */ 850 if (error == 0) { 851 if (ip) { 852 --ip->ino_data.nlinks; /* do before we might block */ 853 ip->ino_data.ctime = trans->time; 854 } 855 dip->ino_data.mtime = trans->time; 856 hammer_modify_inode(trans, dip, HAMMER_INODE_MTIME); 857 if (ip) { 858 hammer_modify_inode(trans, ip, HAMMER_INODE_DDIRTY); 859 if (ip->ino_data.nlinks == 0 && 860 (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) { 861 hammer_done_cursor(cursor); 862 hammer_inode_unloadable_check(ip, 1); 863 hammer_flush_inode(ip, 0); 864 } 865 } 866 867 } 868 return(error); 869 } 870 871 /* 872 * Add a record to an inode. 873 * 874 * The caller must allocate the record with hammer_alloc_mem_record(ip) and 875 * initialize the following additional fields: 876 * 877 * The related inode should be share-locked by the caller. The caller is 878 * on the frontend. 879 * 880 * record->rec.entry.base.base.key 881 * record->rec.entry.base.base.rec_type 882 * record->rec.entry.base.base.data_len 883 * record->data (a copy will be kmalloc'd if it cannot be embedded) 884 */ 885 int 886 hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record) 887 { 888 hammer_inode_t ip = record->ip; 889 int error; 890 891 KKASSERT(record->leaf.base.localization != 0); 892 record->leaf.base.obj_id = ip->obj_id; 893 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 894 error = hammer_mem_add(record); 895 return(error); 896 } 897 898 /* 899 * Locate a pre-existing bulk record in memory. The caller wishes to 900 * replace the record with a new one. The existing record may have a 901 * different length (and thus a different key) so we have to use an 902 * overlap check function. 903 */ 904 static hammer_record_t 905 hammer_ip_get_bulk(hammer_record_t record) 906 { 907 struct hammer_bulk_info info; 908 hammer_inode_t ip = record->ip; 909 910 info.record = record; 911 info.conflict = NULL; 912 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_overlap_cmp, 913 hammer_bulk_scan_callback, &info); 914 915 return(info.conflict); /* may be NULL */ 916 } 917 918 /* 919 * Take records vetted by overlap_cmp. The first non-deleted record 920 * (if any) stops the scan. 921 */ 922 static int 923 hammer_bulk_scan_callback(hammer_record_t record, void *data) 924 { 925 struct hammer_bulk_info *info = data; 926 927 if (record->flags & (HAMMER_RECF_DELETED_FE | HAMMER_RECF_DELETED_BE | 928 HAMMER_RECF_COMMITTED)) { 929 return(0); 930 } 931 hammer_ref(&record->lock); 932 info->conflict = record; 933 return(-1); /* stop scan */ 934 } 935 936 /* 937 * Reserve blockmap space placemarked with an in-memory record. 938 * 939 * This routine is called by the frontend in order to be able to directly 940 * flush a buffer cache buffer. The frontend has locked the related buffer 941 * cache buffers and we should be able to manipulate any overlapping 942 * in-memory records. 943 * 944 * The caller is responsible for adding the returned record and deleting 945 * the returned conflicting record (if any), typically by calling 946 * hammer_ip_replace_bulk() (via hammer_io_direct_write()). 947 */ 948 hammer_record_t 949 hammer_ip_add_bulk(hammer_inode_t ip, off_t file_offset, void *data, int bytes, 950 int *errorp) 951 { 952 hammer_record_t record; 953 hammer_dedup_cache_t dcp; 954 hammer_crc_t crc; 955 int zone; 956 957 /* 958 * Create a record to cover the direct write. The record cannot 959 * be added to the in-memory RB tree here as it might conflict 960 * with an existing memory record. See hammer_io_direct_write(). 961 * 962 * The backend is responsible for finalizing the space reserved in 963 * this record. 964 * 965 * XXX bytes not aligned, depend on the reservation code to 966 * align the reservation. 967 */ 968 record = hammer_alloc_mem_record(ip, 0); 969 zone = (bytes >= HAMMER_BUFSIZE) ? HAMMER_ZONE_LARGE_DATA_INDEX : 970 HAMMER_ZONE_SMALL_DATA_INDEX; 971 if (bytes == 0) 972 crc = 0; 973 else 974 crc = crc32(data, bytes); 975 976 if (hammer_live_dedup == 0) 977 goto nodedup; 978 if ((dcp = hammer_dedup_cache_lookup(ip->hmp, crc)) != NULL) { 979 struct hammer_dedup_cache tmp = *dcp; 980 981 record->resv = hammer_blockmap_reserve_dedup(ip->hmp, zone, 982 bytes, tmp.data_offset, errorp); 983 if (record->resv == NULL) 984 goto nodedup; 985 986 if (!hammer_dedup_validate(&tmp, zone, bytes, data)) { 987 hammer_blockmap_reserve_complete(ip->hmp, record->resv); 988 goto nodedup; 989 } 990 991 record->leaf.data_offset = tmp.data_offset; 992 record->flags |= HAMMER_RECF_DEDUPED; 993 } else { 994 nodedup: 995 record->resv = hammer_blockmap_reserve(ip->hmp, zone, bytes, 996 &record->leaf.data_offset, errorp); 997 if (record->resv == NULL) { 998 kprintf("hammer_ip_add_bulk: reservation failed\n"); 999 hammer_rel_mem_record(record); 1000 return(NULL); 1001 } 1002 } 1003 1004 record->type = HAMMER_MEM_RECORD_DATA; 1005 record->leaf.base.rec_type = HAMMER_RECTYPE_DATA; 1006 record->leaf.base.obj_type = ip->ino_leaf.base.obj_type; 1007 record->leaf.base.obj_id = ip->obj_id; 1008 record->leaf.base.key = file_offset + bytes; 1009 record->leaf.base.localization = ip->obj_localization + 1010 HAMMER_LOCALIZE_MISC; 1011 record->leaf.data_len = bytes; 1012 record->leaf.data_crc = crc; 1013 KKASSERT(*errorp == 0); 1014 1015 return(record); 1016 } 1017 1018 /* 1019 * Called by hammer_io_direct_write() prior to any possible completion 1020 * of the BIO to emplace the memory record associated with the I/O and 1021 * to replace any prior memory record which might still be active. 1022 * 1023 * Setting the FE deleted flag on the old record (if any) avoids any RB 1024 * tree insertion conflict, amoung other things. 1025 * 1026 * This has to be done prior to the caller completing any related buffer 1027 * cache I/O or a reinstantiation of the buffer may load data from the 1028 * old media location instead of the new media location. The holding 1029 * of the locked buffer cache buffer serves to interlock the record 1030 * replacement operation. 1031 */ 1032 void 1033 hammer_ip_replace_bulk(hammer_mount_t hmp, hammer_record_t record) 1034 { 1035 hammer_record_t conflict; 1036 int error; 1037 1038 while ((conflict = hammer_ip_get_bulk(record)) != NULL) { 1039 if ((conflict->flags & HAMMER_RECF_INTERLOCK_BE) == 0) { 1040 conflict->flags |= HAMMER_RECF_DELETED_FE; 1041 break; 1042 } 1043 conflict->flags |= HAMMER_RECF_WANTED; 1044 tsleep(conflict, 0, "hmrrc3", 0); 1045 hammer_rel_mem_record(conflict); 1046 } 1047 error = hammer_mem_add(record); 1048 if (conflict) 1049 hammer_rel_mem_record(conflict); 1050 KKASSERT(error == 0); 1051 } 1052 1053 /* 1054 * Frontend truncation code. Scan in-memory records only. On-disk records 1055 * and records in a flushing state are handled by the backend. The vnops 1056 * setattr code will handle the block containing the truncation point. 1057 * 1058 * Partial blocks are not deleted. 1059 * 1060 * This code is only called on regular files. 1061 */ 1062 int 1063 hammer_ip_frontend_trunc(struct hammer_inode *ip, off_t file_size) 1064 { 1065 struct rec_trunc_info info; 1066 1067 switch(ip->ino_data.obj_type) { 1068 case HAMMER_OBJTYPE_REGFILE: 1069 info.rec_type = HAMMER_RECTYPE_DATA; 1070 break; 1071 case HAMMER_OBJTYPE_DBFILE: 1072 info.rec_type = HAMMER_RECTYPE_DB; 1073 break; 1074 default: 1075 return(EINVAL); 1076 } 1077 info.trunc_off = file_size; 1078 hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_trunc_cmp, 1079 hammer_frontend_trunc_callback, &info); 1080 return(0); 1081 } 1082 1083 /* 1084 * Scan callback for frontend records to destroy during a truncation. 1085 * We must ensure that DELETED_FE is set on the record or the frontend 1086 * will get confused in future read() calls. 1087 * 1088 * NOTE: DELETED_FE cannot be set while the record interlock (BE) is held. 1089 * In this rare case we must wait for the interlock to be cleared. 1090 * 1091 * NOTE: This function is only called on regular files. There are further 1092 * restrictions to the setting of DELETED_FE on directory records 1093 * undergoing a flush due to sensitive inode link count calculations. 1094 */ 1095 static int 1096 hammer_frontend_trunc_callback(hammer_record_t record, void *data __unused) 1097 { 1098 if (record->flags & HAMMER_RECF_DELETED_FE) 1099 return(0); 1100 #if 0 1101 if (record->flush_state == HAMMER_FST_FLUSH) 1102 return(0); 1103 #endif 1104 hammer_ref(&record->lock); 1105 while (record->flags & HAMMER_RECF_INTERLOCK_BE) 1106 hammer_wait_mem_record_ident(record, "hmmtrr"); 1107 record->flags |= HAMMER_RECF_DELETED_FE; 1108 hammer_rel_mem_record(record); 1109 return(0); 1110 } 1111 1112 /* 1113 * Return 1 if the caller must check for and delete existing records 1114 * before writing out a new data record. 1115 * 1116 * Return 0 if the caller can just insert the record into the B-Tree without 1117 * checking. 1118 */ 1119 static int 1120 hammer_record_needs_overwrite_delete(hammer_record_t record) 1121 { 1122 hammer_inode_t ip = record->ip; 1123 int64_t file_offset; 1124 int r; 1125 1126 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) 1127 file_offset = record->leaf.base.key; 1128 else 1129 file_offset = record->leaf.base.key - record->leaf.data_len; 1130 r = (file_offset < ip->save_trunc_off); 1131 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1132 if (ip->save_trunc_off <= record->leaf.base.key) 1133 ip->save_trunc_off = record->leaf.base.key + 1; 1134 } else { 1135 if (ip->save_trunc_off < record->leaf.base.key) 1136 ip->save_trunc_off = record->leaf.base.key; 1137 } 1138 return(r); 1139 } 1140 1141 /* 1142 * Backend code. Sync a record to the media. 1143 */ 1144 int 1145 hammer_ip_sync_record_cursor(hammer_cursor_t cursor, hammer_record_t record) 1146 { 1147 hammer_transaction_t trans = cursor->trans; 1148 int64_t file_offset; 1149 int bytes; 1150 void *bdata; 1151 int error; 1152 int doprop; 1153 1154 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 1155 KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); 1156 KKASSERT(record->leaf.base.localization != 0); 1157 1158 /* 1159 * Any direct-write related to the record must complete before we 1160 * can sync the record to the on-disk media. 1161 */ 1162 if (record->gflags & (HAMMER_RECG_DIRECT_IO | HAMMER_RECG_DIRECT_INVAL)) 1163 hammer_io_direct_wait(record); 1164 1165 /* 1166 * If this is a bulk-data record placemarker there may be an existing 1167 * record on-disk, indicating a data overwrite. If there is the 1168 * on-disk record must be deleted before we can insert our new record. 1169 * 1170 * We've synthesized this record and do not know what the create_tid 1171 * on-disk is, nor how much data it represents. 1172 * 1173 * Keep in mind that (key) for data records is (base_offset + len), 1174 * not (base_offset). Also, we only want to get rid of on-disk 1175 * records since we are trying to sync our in-memory record, call 1176 * hammer_ip_delete_range() with truncating set to 1 to make sure 1177 * it skips in-memory records. 1178 * 1179 * It is ok for the lookup to return ENOENT. 1180 * 1181 * NOTE OPTIMIZATION: sync_trunc_off is used to determine if we have 1182 * to call hammer_ip_delete_range() or not. This also means we must 1183 * update sync_trunc_off() as we write. 1184 */ 1185 if (record->type == HAMMER_MEM_RECORD_DATA && 1186 hammer_record_needs_overwrite_delete(record)) { 1187 file_offset = record->leaf.base.key - record->leaf.data_len; 1188 bytes = (record->leaf.data_len + HAMMER_BUFMASK) & 1189 ~HAMMER_BUFMASK; 1190 KKASSERT((file_offset & HAMMER_BUFMASK) == 0); 1191 error = hammer_ip_delete_range( 1192 cursor, record->ip, 1193 file_offset, file_offset + bytes - 1, 1194 1); 1195 if (error && error != ENOENT) 1196 goto done; 1197 } 1198 1199 /* 1200 * If this is a general record there may be an on-disk version 1201 * that must be deleted before we can insert the new record. 1202 */ 1203 if (record->type == HAMMER_MEM_RECORD_GENERAL) { 1204 error = hammer_delete_general(cursor, record->ip, 1205 &record->leaf); 1206 if (error && error != ENOENT) 1207 goto done; 1208 } 1209 1210 /* 1211 * Setup the cursor. 1212 */ 1213 hammer_normalize_cursor(cursor); 1214 cursor->key_beg = record->leaf.base; 1215 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1216 cursor->flags |= HAMMER_CURSOR_BACKEND; 1217 cursor->flags &= ~HAMMER_CURSOR_INSERT; 1218 1219 /* 1220 * Records can wind up on-media before the inode itself is on-media. 1221 * Flag the case. 1222 */ 1223 record->ip->flags |= HAMMER_INODE_DONDISK; 1224 1225 /* 1226 * If we are deleting a directory entry an exact match must be 1227 * found on-disk. 1228 */ 1229 if (record->type == HAMMER_MEM_RECORD_DEL) { 1230 error = hammer_btree_lookup(cursor); 1231 if (error == 0) { 1232 KKASSERT(cursor->iprec == NULL); 1233 error = hammer_ip_delete_record(cursor, record->ip, 1234 trans->tid); 1235 if (error == 0) { 1236 record->flags |= HAMMER_RECF_DELETED_BE | 1237 HAMMER_RECF_COMMITTED; 1238 ++record->ip->rec_generation; 1239 } 1240 } 1241 goto done; 1242 } 1243 1244 /* 1245 * We are inserting. 1246 * 1247 * Issue a lookup to position the cursor and locate the insertion 1248 * point. The target key should not exist. If we are creating a 1249 * directory entry we may have to iterate the low 32 bits of the 1250 * key to find an unused key. 1251 */ 1252 hammer_sync_lock_sh(trans); 1253 cursor->flags |= HAMMER_CURSOR_INSERT; 1254 error = hammer_btree_lookup(cursor); 1255 if (hammer_debug_inode) 1256 kprintf("DOINSERT LOOKUP %d\n", error); 1257 if (error == 0) { 1258 kprintf("hammer_ip_sync_record: duplicate rec " 1259 "at (%016llx)\n", (long long)record->leaf.base.key); 1260 if (hammer_debug_critical) 1261 Debugger("duplicate record1"); 1262 error = EIO; 1263 } 1264 #if 0 1265 if (record->type == HAMMER_MEM_RECORD_DATA) 1266 kprintf("sync_record %016llx ---------------- %016llx %d\n", 1267 record->leaf.base.key - record->leaf.data_len, 1268 record->leaf.data_offset, error); 1269 #endif 1270 1271 if (error != ENOENT) 1272 goto done_unlock; 1273 1274 /* 1275 * Allocate the record and data. The result buffers will be 1276 * marked as being modified and further calls to 1277 * hammer_modify_buffer() will result in unneeded UNDO records. 1278 * 1279 * Support zero-fill records (data == NULL and data_len != 0) 1280 */ 1281 if (record->type == HAMMER_MEM_RECORD_DATA) { 1282 /* 1283 * The data portion of a bulk-data record has already been 1284 * committed to disk, we need only adjust the layer2 1285 * statistics in the same transaction as our B-Tree insert. 1286 */ 1287 KKASSERT(record->leaf.data_offset != 0); 1288 error = hammer_blockmap_finalize(trans, 1289 record->resv, 1290 record->leaf.data_offset, 1291 record->leaf.data_len); 1292 1293 if (hammer_live_dedup == 2 && 1294 (record->flags & HAMMER_RECF_DEDUPED) == 0) { 1295 hammer_dedup_cache_add(record->ip, &record->leaf); 1296 } 1297 } else if (record->data && record->leaf.data_len) { 1298 /* 1299 * Wholely cached record, with data. Allocate the data. 1300 */ 1301 bdata = hammer_alloc_data(trans, record->leaf.data_len, 1302 record->leaf.base.rec_type, 1303 &record->leaf.data_offset, 1304 &cursor->data_buffer, 1305 0, &error); 1306 if (bdata == NULL) 1307 goto done_unlock; 1308 hammer_crc_set_leaf(record->data, &record->leaf); 1309 hammer_modify_buffer(trans, cursor->data_buffer, NULL, 0); 1310 bcopy(record->data, bdata, record->leaf.data_len); 1311 hammer_modify_buffer_done(cursor->data_buffer); 1312 } else { 1313 /* 1314 * Wholely cached record, without data. 1315 */ 1316 record->leaf.data_offset = 0; 1317 record->leaf.data_crc = 0; 1318 } 1319 1320 error = hammer_btree_insert(cursor, &record->leaf, &doprop); 1321 if (hammer_debug_inode && error) { 1322 kprintf("BTREE INSERT error %d @ %016llx:%d key %016llx\n", 1323 error, 1324 (long long)cursor->node->node_offset, 1325 cursor->index, 1326 (long long)record->leaf.base.key); 1327 } 1328 1329 /* 1330 * Our record is on-disk and we normally mark the in-memory version 1331 * as having been committed (and not BE-deleted). 1332 * 1333 * If the record represented a directory deletion but we had to 1334 * sync a valid directory entry to disk due to dependancies, 1335 * we must convert the record to a covering delete so the 1336 * frontend does not have visibility on the synced entry. 1337 * 1338 * WARNING: cursor's leaf pointer may have changed after do_propagation 1339 * returns! 1340 */ 1341 if (error == 0) { 1342 if (doprop) { 1343 hammer_btree_do_propagation(cursor, 1344 record->ip->pfsm, 1345 &record->leaf); 1346 } 1347 if (record->flags & HAMMER_RECF_CONVERT_DELETE) { 1348 /* 1349 * Must convert deleted directory entry add 1350 * to a directory entry delete. 1351 */ 1352 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); 1353 record->flags &= ~HAMMER_RECF_DELETED_FE; 1354 record->type = HAMMER_MEM_RECORD_DEL; 1355 KKASSERT(record->ip->obj_id == record->leaf.base.obj_id); 1356 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 1357 record->flags &= ~HAMMER_RECF_CONVERT_DELETE; 1358 KKASSERT((record->flags & (HAMMER_RECF_COMMITTED | 1359 HAMMER_RECF_DELETED_BE)) == 0); 1360 /* converted record is not yet committed */ 1361 /* hammer_flush_record_done takes care of the rest */ 1362 } else { 1363 /* 1364 * Everything went fine and we are now done with 1365 * this record. 1366 */ 1367 record->flags |= HAMMER_RECF_COMMITTED; 1368 ++record->ip->rec_generation; 1369 } 1370 } else { 1371 if (record->leaf.data_offset) { 1372 hammer_blockmap_free(trans, record->leaf.data_offset, 1373 record->leaf.data_len); 1374 } 1375 } 1376 done_unlock: 1377 hammer_sync_unlock(trans); 1378 done: 1379 return(error); 1380 } 1381 1382 /* 1383 * Add the record to the inode's rec_tree. The low 32 bits of a directory 1384 * entry's key is used to deal with hash collisions in the upper 32 bits. 1385 * A unique 64 bit key is generated in-memory and may be regenerated a 1386 * second time when the directory record is flushed to the on-disk B-Tree. 1387 * 1388 * A referenced record is passed to this function. This function 1389 * eats the reference. If an error occurs the record will be deleted. 1390 * 1391 * A copy of the temporary record->data pointer provided by the caller 1392 * will be made. 1393 */ 1394 int 1395 hammer_mem_add(hammer_record_t record) 1396 { 1397 hammer_mount_t hmp = record->ip->hmp; 1398 1399 /* 1400 * Make a private copy of record->data 1401 */ 1402 if (record->data) 1403 KKASSERT(record->flags & HAMMER_RECF_ALLOCDATA); 1404 1405 /* 1406 * Insert into the RB tree. A unique key should have already 1407 * been selected if this is a directory entry. 1408 */ 1409 if (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) { 1410 record->flags |= HAMMER_RECF_DELETED_FE; 1411 hammer_rel_mem_record(record); 1412 return (EEXIST); 1413 } 1414 ++hmp->count_newrecords; 1415 ++hmp->rsv_recs; 1416 ++record->ip->rsv_recs; 1417 record->ip->hmp->rsv_databytes += record->leaf.data_len; 1418 record->flags |= HAMMER_RECF_ONRBTREE; 1419 hammer_modify_inode(NULL, record->ip, HAMMER_INODE_XDIRTY); 1420 hammer_rel_mem_record(record); 1421 return(0); 1422 } 1423 1424 /************************************************************************ 1425 * HAMMER INODE MERGED-RECORD FUNCTIONS * 1426 ************************************************************************ 1427 * 1428 * These functions augment the B-Tree scanning functions in hammer_btree.c 1429 * by merging in-memory records with on-disk records. 1430 */ 1431 1432 /* 1433 * Locate a particular record either in-memory or on-disk. 1434 * 1435 * NOTE: This is basically a standalone routine, hammer_ip_next() may 1436 * NOT be called to iterate results. 1437 */ 1438 int 1439 hammer_ip_lookup(hammer_cursor_t cursor) 1440 { 1441 int error; 1442 1443 /* 1444 * If the element is in-memory return it without searching the 1445 * on-disk B-Tree 1446 */ 1447 KKASSERT(cursor->ip); 1448 error = hammer_mem_lookup(cursor); 1449 if (error == 0) { 1450 cursor->leaf = &cursor->iprec->leaf; 1451 return(error); 1452 } 1453 if (error != ENOENT) 1454 return(error); 1455 1456 /* 1457 * If the inode has on-disk components search the on-disk B-Tree. 1458 */ 1459 if ((cursor->ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0) 1460 return(error); 1461 error = hammer_btree_lookup(cursor); 1462 if (error == 0) 1463 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); 1464 return(error); 1465 } 1466 1467 /* 1468 * Helper for hammer_ip_first()/hammer_ip_next() 1469 * 1470 * NOTE: Both ATEDISK and DISKEOF will be set the same. This sets up 1471 * hammer_ip_first() for calling hammer_ip_next(), and sets up the re-seek 1472 * state if hammer_ip_next() needs to re-seek. 1473 */ 1474 static __inline 1475 int 1476 _hammer_ip_seek_btree(hammer_cursor_t cursor) 1477 { 1478 hammer_inode_t ip = cursor->ip; 1479 int error; 1480 1481 if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) { 1482 error = hammer_btree_lookup(cursor); 1483 if (error == ENOENT || error == EDEADLK) { 1484 if (hammer_debug_general & 0x2000) { 1485 kprintf("error %d node %p %016llx index %d\n", 1486 error, cursor->node, 1487 (long long)cursor->node->node_offset, 1488 cursor->index); 1489 } 1490 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1491 error = hammer_btree_iterate(cursor); 1492 } 1493 if (error == 0) { 1494 cursor->flags &= ~(HAMMER_CURSOR_DISKEOF | 1495 HAMMER_CURSOR_ATEDISK); 1496 } else { 1497 cursor->flags |= HAMMER_CURSOR_DISKEOF | 1498 HAMMER_CURSOR_ATEDISK; 1499 if (error == ENOENT) 1500 error = 0; 1501 } 1502 } else { 1503 cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_ATEDISK; 1504 error = 0; 1505 } 1506 return(error); 1507 } 1508 1509 /* 1510 * Helper for hammer_ip_next() 1511 * 1512 * The caller has determined that the media cursor is further along than the 1513 * memory cursor and must be reseeked after a generation number change. 1514 */ 1515 static 1516 int 1517 _hammer_ip_reseek(hammer_cursor_t cursor) 1518 { 1519 struct hammer_base_elm save; 1520 hammer_btree_elm_t elm; 1521 int error; 1522 int r; 1523 int again = 0; 1524 1525 /* 1526 * Do the re-seek. 1527 */ 1528 kprintf("HAMMER: Debug: re-seeked during scan @ino=%016llx\n", 1529 (long long)cursor->ip->obj_id); 1530 save = cursor->key_beg; 1531 cursor->key_beg = cursor->iprec->leaf.base; 1532 error = _hammer_ip_seek_btree(cursor); 1533 KKASSERT(error == 0); 1534 cursor->key_beg = save; 1535 1536 /* 1537 * If the memory record was previous returned to 1538 * the caller and the media record matches 1539 * (-1/+1: only create_tid differs), then iterate 1540 * the media record to avoid a double result. 1541 */ 1542 if ((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0 && 1543 (cursor->flags & HAMMER_CURSOR_LASTWASMEM)) { 1544 elm = &cursor->node->ondisk->elms[cursor->index]; 1545 r = hammer_btree_cmp(&elm->base, 1546 &cursor->iprec->leaf.base); 1547 if (cursor->flags & HAMMER_CURSOR_ASOF) { 1548 if (r >= -1 && r <= 1) { 1549 kprintf("HAMMER: Debug: iterated after " 1550 "re-seek (asof r=%d)\n", r); 1551 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1552 again = 1; 1553 } 1554 } else { 1555 if (r == 0) { 1556 kprintf("HAMMER: Debug: iterated after " 1557 "re-seek\n"); 1558 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1559 again = 1; 1560 } 1561 } 1562 } 1563 return(again); 1564 } 1565 1566 /* 1567 * Locate the first record within the cursor's key_beg/key_end range, 1568 * restricted to a particular inode. 0 is returned on success, ENOENT 1569 * if no records matched the requested range, or some other error. 1570 * 1571 * When 0 is returned hammer_ip_next() may be used to iterate additional 1572 * records within the requested range. 1573 * 1574 * This function can return EDEADLK, requiring the caller to terminate 1575 * the cursor and try again. 1576 */ 1577 1578 int 1579 hammer_ip_first(hammer_cursor_t cursor) 1580 { 1581 hammer_inode_t ip __debugvar = cursor->ip; 1582 int error; 1583 1584 KKASSERT(ip != NULL); 1585 1586 /* 1587 * Clean up fields and setup for merged scan 1588 */ 1589 cursor->flags &= ~HAMMER_CURSOR_RETEST; 1590 1591 /* 1592 * Search the in-memory record list (Red-Black tree). Unlike the 1593 * B-Tree search, mem_first checks for records in the range. 1594 * 1595 * This function will setup both ATEMEM and MEMEOF properly for 1596 * the ip iteration. ATEMEM will be set if MEMEOF is set. 1597 */ 1598 hammer_mem_first(cursor); 1599 1600 /* 1601 * Detect generation changes during blockages, including 1602 * blockages which occur on the initial btree search. 1603 */ 1604 cursor->rec_generation = cursor->ip->rec_generation; 1605 1606 /* 1607 * Initial search and result 1608 */ 1609 error = _hammer_ip_seek_btree(cursor); 1610 if (error == 0) 1611 error = hammer_ip_next(cursor); 1612 1613 return (error); 1614 } 1615 1616 /* 1617 * Retrieve the next record in a merged iteration within the bounds of the 1618 * cursor. This call may be made multiple times after the cursor has been 1619 * initially searched with hammer_ip_first(). 1620 * 1621 * There are numerous special cases in this code to deal with races between 1622 * in-memory records and on-media records. 1623 * 1624 * 0 is returned on success, ENOENT if no further records match the 1625 * requested range, or some other error code is returned. 1626 */ 1627 int 1628 hammer_ip_next(hammer_cursor_t cursor) 1629 { 1630 hammer_btree_elm_t elm; 1631 hammer_record_t rec; 1632 hammer_record_t tmprec; 1633 int error; 1634 int r; 1635 1636 again: 1637 /* 1638 * Get the next on-disk record 1639 * 1640 * NOTE: If we deleted the last on-disk record we had scanned 1641 * ATEDISK will be clear and RETEST will be set, forcing 1642 * a call to iterate. The fact that ATEDISK is clear causes 1643 * iterate to re-test the 'current' element. If ATEDISK is 1644 * set, iterate will skip the 'current' element. 1645 */ 1646 error = 0; 1647 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 1648 if (cursor->flags & (HAMMER_CURSOR_ATEDISK | 1649 HAMMER_CURSOR_RETEST)) { 1650 error = hammer_btree_iterate(cursor); 1651 cursor->flags &= ~HAMMER_CURSOR_RETEST; 1652 if (error == 0) { 1653 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 1654 hammer_cache_node(&cursor->ip->cache[1], 1655 cursor->node); 1656 } else if (error == ENOENT) { 1657 cursor->flags |= HAMMER_CURSOR_DISKEOF | 1658 HAMMER_CURSOR_ATEDISK; 1659 error = 0; 1660 } 1661 } 1662 } 1663 1664 /* 1665 * If the generation changed the backend has deleted or committed 1666 * one or more memory records since our last check. 1667 * 1668 * When this case occurs if the disk cursor is > current memory record 1669 * or the disk cursor is at EOF, we must re-seek the disk-cursor. 1670 * Since the cursor is ahead it must have not yet been eaten (if 1671 * not at eof anyway). (XXX data offset case?) 1672 * 1673 * NOTE: we are not doing a full check here. That will be handled 1674 * later on. 1675 * 1676 * If we have exhausted all memory records we do not have to do any 1677 * further seeks. 1678 */ 1679 while (cursor->rec_generation != cursor->ip->rec_generation && 1680 error == 0 1681 ) { 1682 kprintf("HAMMER: Debug: generation changed during scan @ino=%016llx\n", (long long)cursor->ip->obj_id); 1683 cursor->rec_generation = cursor->ip->rec_generation; 1684 if (cursor->flags & HAMMER_CURSOR_MEMEOF) 1685 break; 1686 if (cursor->flags & HAMMER_CURSOR_DISKEOF) { 1687 r = 1; 1688 } else { 1689 KKASSERT((cursor->flags & HAMMER_CURSOR_ATEDISK) == 0); 1690 elm = &cursor->node->ondisk->elms[cursor->index]; 1691 r = hammer_btree_cmp(&elm->base, 1692 &cursor->iprec->leaf.base); 1693 } 1694 1695 /* 1696 * Do we re-seek the media cursor? 1697 */ 1698 if (r > 0) { 1699 if (_hammer_ip_reseek(cursor)) 1700 goto again; 1701 } 1702 } 1703 1704 /* 1705 * We can now safely get the next in-memory record. We cannot 1706 * block here. 1707 * 1708 * hammer_rec_scan_cmp: Is the record still in our general range, 1709 * (non-inclusive of snapshot exclusions)? 1710 * hammer_rec_scan_callback: Is the record in our snapshot? 1711 */ 1712 tmprec = NULL; 1713 if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { 1714 /* 1715 * If the current memory record was eaten then get the next 1716 * one. Stale records are skipped. 1717 */ 1718 if (cursor->flags & HAMMER_CURSOR_ATEMEM) { 1719 tmprec = cursor->iprec; 1720 cursor->iprec = NULL; 1721 rec = hammer_rec_rb_tree_RB_NEXT(tmprec); 1722 while (rec) { 1723 if (hammer_rec_scan_cmp(rec, cursor) != 0) 1724 break; 1725 if (hammer_rec_scan_callback(rec, cursor) != 0) 1726 break; 1727 rec = hammer_rec_rb_tree_RB_NEXT(rec); 1728 } 1729 if (cursor->iprec) { 1730 KKASSERT(cursor->iprec == rec); 1731 cursor->flags &= ~HAMMER_CURSOR_ATEMEM; 1732 } else { 1733 cursor->flags |= HAMMER_CURSOR_MEMEOF; 1734 } 1735 cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; 1736 } 1737 } 1738 1739 /* 1740 * MEMORY RECORD VALIDITY TEST 1741 * 1742 * (We still can't block, which is why tmprec is being held so 1743 * long). 1744 * 1745 * If the memory record is no longer valid we skip it. It may 1746 * have been deleted by the frontend. If it was deleted or 1747 * committed by the backend the generation change re-seeked the 1748 * disk cursor and the record will be present there. 1749 */ 1750 if (error == 0 && (cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) { 1751 KKASSERT(cursor->iprec); 1752 KKASSERT((cursor->flags & HAMMER_CURSOR_ATEMEM) == 0); 1753 if (!hammer_ip_iterate_mem_good(cursor, cursor->iprec)) { 1754 cursor->flags |= HAMMER_CURSOR_ATEMEM; 1755 if (tmprec) 1756 hammer_rel_mem_record(tmprec); 1757 goto again; 1758 } 1759 } 1760 if (tmprec) 1761 hammer_rel_mem_record(tmprec); 1762 1763 /* 1764 * Extract either the disk or memory record depending on their 1765 * relative position. 1766 */ 1767 error = 0; 1768 switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) { 1769 case 0: 1770 /* 1771 * Both entries valid. Compare the entries and nominally 1772 * return the first one in the sort order. Numerous cases 1773 * require special attention, however. 1774 */ 1775 elm = &cursor->node->ondisk->elms[cursor->index]; 1776 r = hammer_btree_cmp(&elm->base, &cursor->iprec->leaf.base); 1777 1778 /* 1779 * If the two entries differ only by their key (-2/2) or 1780 * create_tid (-1/1), and are DATA records, we may have a 1781 * nominal match. We have to calculate the base file 1782 * offset of the data. 1783 */ 1784 if (r <= 2 && r >= -2 && r != 0 && 1785 cursor->ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE && 1786 cursor->iprec->type == HAMMER_MEM_RECORD_DATA) { 1787 int64_t base1 = elm->leaf.base.key - elm->leaf.data_len; 1788 int64_t base2 = cursor->iprec->leaf.base.key - 1789 cursor->iprec->leaf.data_len; 1790 if (base1 == base2) 1791 r = 0; 1792 } 1793 1794 if (r < 0) { 1795 error = hammer_btree_extract(cursor, 1796 HAMMER_CURSOR_GET_LEAF); 1797 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1798 cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; 1799 break; 1800 } 1801 1802 /* 1803 * If the entries match exactly the memory entry is either 1804 * an on-disk directory entry deletion or a bulk data 1805 * overwrite. If it is a directory entry deletion we eat 1806 * both entries. 1807 * 1808 * For the bulk-data overwrite case it is possible to have 1809 * visibility into both, which simply means the syncer 1810 * hasn't gotten around to doing the delete+insert sequence 1811 * on the B-Tree. Use the memory entry and throw away the 1812 * on-disk entry. 1813 * 1814 * If the in-memory record is not either of these we 1815 * probably caught the syncer while it was syncing it to 1816 * the media. Since we hold a shared lock on the cursor, 1817 * the in-memory record had better be marked deleted at 1818 * this point. 1819 */ 1820 if (r == 0) { 1821 if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) { 1822 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { 1823 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1824 cursor->flags |= HAMMER_CURSOR_ATEMEM; 1825 goto again; 1826 } 1827 } else if (cursor->iprec->type == HAMMER_MEM_RECORD_DATA) { 1828 if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { 1829 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1830 } 1831 /* fall through to memory entry */ 1832 } else { 1833 panic("hammer_ip_next: duplicate mem/b-tree entry %p %d %08x", cursor->iprec, cursor->iprec->type, cursor->iprec->flags); 1834 cursor->flags |= HAMMER_CURSOR_ATEMEM; 1835 goto again; 1836 } 1837 } 1838 /* fall through to the memory entry */ 1839 case HAMMER_CURSOR_ATEDISK: 1840 /* 1841 * Only the memory entry is valid. 1842 */ 1843 cursor->leaf = &cursor->iprec->leaf; 1844 cursor->flags |= HAMMER_CURSOR_ATEMEM; 1845 cursor->flags |= HAMMER_CURSOR_LASTWASMEM; 1846 1847 /* 1848 * If the memory entry is an on-disk deletion we should have 1849 * also had found a B-Tree record. If the backend beat us 1850 * to it it would have interlocked the cursor and we should 1851 * have seen the in-memory record marked DELETED_FE. 1852 */ 1853 if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL && 1854 (cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) { 1855 panic("hammer_ip_next: del-on-disk with no b-tree entry iprec %p flags %08x", cursor->iprec, cursor->iprec->flags); 1856 } 1857 break; 1858 case HAMMER_CURSOR_ATEMEM: 1859 /* 1860 * Only the disk entry is valid 1861 */ 1862 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); 1863 cursor->flags |= HAMMER_CURSOR_ATEDISK; 1864 cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; 1865 break; 1866 default: 1867 /* 1868 * Neither entry is valid 1869 * 1870 * XXX error not set properly 1871 */ 1872 cursor->flags &= ~HAMMER_CURSOR_LASTWASMEM; 1873 cursor->leaf = NULL; 1874 error = ENOENT; 1875 break; 1876 } 1877 return(error); 1878 } 1879 1880 /* 1881 * Resolve the cursor->data pointer for the current cursor position in 1882 * a merged iteration. 1883 */ 1884 int 1885 hammer_ip_resolve_data(hammer_cursor_t cursor) 1886 { 1887 hammer_record_t record; 1888 int error; 1889 1890 if (hammer_cursor_inmem(cursor)) { 1891 /* 1892 * The data associated with an in-memory record is usually 1893 * kmalloced, but reserve-ahead data records will have an 1894 * on-disk reference. 1895 * 1896 * NOTE: Reserve-ahead data records must be handled in the 1897 * context of the related high level buffer cache buffer 1898 * to interlock against async writes. 1899 */ 1900 record = cursor->iprec; 1901 cursor->data = record->data; 1902 error = 0; 1903 if (cursor->data == NULL) { 1904 KKASSERT(record->leaf.base.rec_type == 1905 HAMMER_RECTYPE_DATA); 1906 cursor->data = hammer_bread_ext(cursor->trans->hmp, 1907 record->leaf.data_offset, 1908 record->leaf.data_len, 1909 &error, 1910 &cursor->data_buffer); 1911 } 1912 } else { 1913 cursor->leaf = &cursor->node->ondisk->elms[cursor->index].leaf; 1914 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA); 1915 } 1916 return(error); 1917 } 1918 1919 /* 1920 * Backend truncation / record replacement - delete records in range. 1921 * 1922 * Delete all records within the specified range for inode ip. In-memory 1923 * records still associated with the frontend are ignored. 1924 * 1925 * If truncating is non-zero in-memory records associated with the back-end 1926 * are ignored. If truncating is > 1 we can return EWOULDBLOCK. 1927 * 1928 * NOTES: 1929 * 1930 * * An unaligned range will cause new records to be added to cover 1931 * the edge cases. (XXX not implemented yet). 1932 * 1933 * * Replacement via reservations (see hammer_ip_sync_record_cursor()) 1934 * also do not deal with unaligned ranges. 1935 * 1936 * * ran_end is inclusive (e.g. 0,1023 instead of 0,1024). 1937 * 1938 * * Record keys for regular file data have to be special-cased since 1939 * they indicate the end of the range (key = base + bytes). 1940 * 1941 * * This function may be asked to delete ridiculously huge ranges, for 1942 * example if someone truncates or removes a 1TB regular file. We 1943 * must be very careful on restarts and we may have to stop w/ 1944 * EWOULDBLOCK to avoid blowing out the buffer cache. 1945 */ 1946 int 1947 hammer_ip_delete_range(hammer_cursor_t cursor, hammer_inode_t ip, 1948 int64_t ran_beg, int64_t ran_end, int truncating) 1949 { 1950 hammer_transaction_t trans = cursor->trans; 1951 hammer_btree_leaf_elm_t leaf; 1952 int error; 1953 int64_t off; 1954 int64_t tmp64; 1955 1956 #if 0 1957 kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end); 1958 #endif 1959 1960 KKASSERT(trans->type == HAMMER_TRANS_FLS); 1961 retry: 1962 hammer_normalize_cursor(cursor); 1963 cursor->key_beg.localization = ip->obj_localization + 1964 HAMMER_LOCALIZE_MISC; 1965 cursor->key_beg.obj_id = ip->obj_id; 1966 cursor->key_beg.create_tid = 0; 1967 cursor->key_beg.delete_tid = 0; 1968 cursor->key_beg.obj_type = 0; 1969 1970 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1971 cursor->key_beg.key = ran_beg; 1972 cursor->key_beg.rec_type = HAMMER_RECTYPE_DB; 1973 } else { 1974 /* 1975 * The key in the B-Tree is (base+bytes), so the first possible 1976 * matching key is ran_beg + 1. 1977 */ 1978 cursor->key_beg.key = ran_beg + 1; 1979 cursor->key_beg.rec_type = HAMMER_RECTYPE_DATA; 1980 } 1981 1982 cursor->key_end = cursor->key_beg; 1983 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) { 1984 cursor->key_end.key = ran_end; 1985 } else { 1986 tmp64 = ran_end + MAXPHYS + 1; /* work around GCC-4 bug */ 1987 if (tmp64 < ran_end) 1988 cursor->key_end.key = 0x7FFFFFFFFFFFFFFFLL; 1989 else 1990 cursor->key_end.key = ran_end + MAXPHYS + 1; 1991 } 1992 1993 cursor->asof = ip->obj_asof; 1994 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 1995 cursor->flags |= HAMMER_CURSOR_ASOF; 1996 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 1997 cursor->flags |= HAMMER_CURSOR_BACKEND; 1998 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE; 1999 2000 error = hammer_ip_first(cursor); 2001 2002 /* 2003 * Iterate through matching records and mark them as deleted. 2004 */ 2005 while (error == 0) { 2006 leaf = cursor->leaf; 2007 2008 KKASSERT(leaf->base.delete_tid == 0); 2009 KKASSERT(leaf->base.obj_id == ip->obj_id); 2010 2011 /* 2012 * There may be overlap cases for regular file data. Also 2013 * remember the key for a regular file record is (base + len), 2014 * NOT (base). 2015 * 2016 * Note that due to duplicates (mem & media) allowed by 2017 * DELETE_VISIBILITY, off can wind up less then ran_beg. 2018 */ 2019 if (leaf->base.rec_type == HAMMER_RECTYPE_DATA) { 2020 off = leaf->base.key - leaf->data_len; 2021 /* 2022 * Check the left edge case. We currently do not 2023 * split existing records. 2024 */ 2025 if (off < ran_beg && leaf->base.key > ran_beg) { 2026 panic("hammer left edge case %016llx %d\n", 2027 (long long)leaf->base.key, 2028 leaf->data_len); 2029 } 2030 2031 /* 2032 * Check the right edge case. Note that the 2033 * record can be completely out of bounds, which 2034 * terminates the search. 2035 * 2036 * base->key is exclusive of the right edge while 2037 * ran_end is inclusive of the right edge. The 2038 * (key - data_len) left boundary is inclusive. 2039 * 2040 * XXX theory-check this test at some point, are 2041 * we missing a + 1 somewhere? Note that ran_end 2042 * could overflow. 2043 */ 2044 if (leaf->base.key - 1 > ran_end) { 2045 if (leaf->base.key - leaf->data_len > ran_end) 2046 break; 2047 panic("hammer right edge case\n"); 2048 } 2049 } else { 2050 off = leaf->base.key; 2051 } 2052 2053 /* 2054 * Delete the record. When truncating we do not delete 2055 * in-memory (data) records because they represent data 2056 * written after the truncation. 2057 * 2058 * This will also physically destroy the B-Tree entry and 2059 * data if the retention policy dictates. The function 2060 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2061 * to retest the new 'current' element. 2062 */ 2063 if (truncating == 0 || hammer_cursor_ondisk(cursor)) { 2064 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2065 /* 2066 * If we have built up too many meta-buffers we risk 2067 * deadlocking the kernel and must stop. This can 2068 * occur when deleting ridiculously huge files. 2069 * sync_trunc_off is updated so the next cycle does 2070 * not re-iterate records we have already deleted. 2071 * 2072 * This is only done with formal truncations. 2073 */ 2074 if (truncating > 1 && error == 0 && 2075 hammer_flusher_meta_limit(ip->hmp)) { 2076 ip->sync_trunc_off = off; 2077 error = EWOULDBLOCK; 2078 } 2079 } 2080 if (error) 2081 break; 2082 ran_beg = off; /* for restart */ 2083 error = hammer_ip_next(cursor); 2084 } 2085 if (cursor->node) 2086 hammer_cache_node(&ip->cache[1], cursor->node); 2087 2088 if (error == EDEADLK) { 2089 hammer_done_cursor(cursor); 2090 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2091 if (error == 0) 2092 goto retry; 2093 } 2094 if (error == ENOENT) 2095 error = 0; 2096 return(error); 2097 } 2098 2099 /* 2100 * This backend function deletes the specified record on-disk, similar to 2101 * delete_range but for a specific record. Unlike the exact deletions 2102 * used when deleting a directory entry this function uses an ASOF search 2103 * like delete_range. 2104 * 2105 * This function may be called with ip->obj_asof set for a slave snapshot, 2106 * so don't use it. We always delete non-historical records only. 2107 */ 2108 static int 2109 hammer_delete_general(hammer_cursor_t cursor, hammer_inode_t ip, 2110 hammer_btree_leaf_elm_t leaf) 2111 { 2112 hammer_transaction_t trans = cursor->trans; 2113 int error; 2114 2115 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2116 retry: 2117 hammer_normalize_cursor(cursor); 2118 cursor->key_beg = leaf->base; 2119 cursor->asof = HAMMER_MAX_TID; 2120 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2121 cursor->flags |= HAMMER_CURSOR_ASOF; 2122 cursor->flags |= HAMMER_CURSOR_BACKEND; 2123 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2124 2125 error = hammer_btree_lookup(cursor); 2126 if (error == 0) { 2127 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2128 } 2129 if (error == EDEADLK) { 2130 hammer_done_cursor(cursor); 2131 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2132 if (error == 0) 2133 goto retry; 2134 } 2135 return(error); 2136 } 2137 2138 /* 2139 * This function deletes remaining auxillary records when an inode is 2140 * being deleted. This function explicitly does not delete the 2141 * inode record, directory entry, data, or db records. Those must be 2142 * properly disposed of prior to this call. 2143 */ 2144 int 2145 hammer_ip_delete_clean(hammer_cursor_t cursor, hammer_inode_t ip, int *countp) 2146 { 2147 hammer_transaction_t trans = cursor->trans; 2148 hammer_btree_leaf_elm_t leaf; 2149 int error; 2150 2151 KKASSERT(trans->type == HAMMER_TRANS_FLS); 2152 retry: 2153 hammer_normalize_cursor(cursor); 2154 cursor->key_beg.localization = ip->obj_localization + 2155 HAMMER_LOCALIZE_MISC; 2156 cursor->key_beg.obj_id = ip->obj_id; 2157 cursor->key_beg.create_tid = 0; 2158 cursor->key_beg.delete_tid = 0; 2159 cursor->key_beg.obj_type = 0; 2160 cursor->key_beg.rec_type = HAMMER_RECTYPE_CLEAN_START; 2161 cursor->key_beg.key = HAMMER_MIN_KEY; 2162 2163 cursor->key_end = cursor->key_beg; 2164 cursor->key_end.rec_type = HAMMER_RECTYPE_MAX; 2165 cursor->key_end.key = HAMMER_MAX_KEY; 2166 2167 cursor->asof = ip->obj_asof; 2168 cursor->flags &= ~HAMMER_CURSOR_INITMASK; 2169 cursor->flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2170 cursor->flags |= HAMMER_CURSOR_DELETE_VISIBILITY; 2171 cursor->flags |= HAMMER_CURSOR_BACKEND; 2172 2173 error = hammer_ip_first(cursor); 2174 2175 /* 2176 * Iterate through matching records and mark them as deleted. 2177 */ 2178 while (error == 0) { 2179 leaf = cursor->leaf; 2180 2181 KKASSERT(leaf->base.delete_tid == 0); 2182 2183 /* 2184 * Mark the record and B-Tree entry as deleted. This will 2185 * also physically delete the B-Tree entry, record, and 2186 * data if the retention policy dictates. The function 2187 * will set HAMMER_CURSOR_RETEST to cause hammer_ip_next() 2188 * to retest the new 'current' element. 2189 * 2190 * Directory entries (and delete-on-disk directory entries) 2191 * must be synced and cannot be deleted. 2192 */ 2193 error = hammer_ip_delete_record(cursor, ip, trans->tid); 2194 ++*countp; 2195 if (error) 2196 break; 2197 error = hammer_ip_next(cursor); 2198 } 2199 if (cursor->node) 2200 hammer_cache_node(&ip->cache[1], cursor->node); 2201 if (error == EDEADLK) { 2202 hammer_done_cursor(cursor); 2203 error = hammer_init_cursor(trans, cursor, &ip->cache[1], ip); 2204 if (error == 0) 2205 goto retry; 2206 } 2207 if (error == ENOENT) 2208 error = 0; 2209 return(error); 2210 } 2211 2212 /* 2213 * Delete the record at the current cursor. On success the cursor will 2214 * be positioned appropriately for an iteration but may no longer be at 2215 * a leaf node. 2216 * 2217 * This routine is only called from the backend. 2218 * 2219 * NOTE: This can return EDEADLK, requiring the caller to terminate the 2220 * cursor and retry. 2221 */ 2222 int 2223 hammer_ip_delete_record(hammer_cursor_t cursor, hammer_inode_t ip, 2224 hammer_tid_t tid) 2225 { 2226 hammer_record_t iprec; 2227 hammer_mount_t hmp; 2228 int error; 2229 2230 KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND); 2231 KKASSERT(tid != 0); 2232 hmp = cursor->node->hmp; 2233 2234 /* 2235 * In-memory (unsynchronized) records can simply be freed. This 2236 * only occurs in range iterations since all other records are 2237 * individually synchronized. Thus there should be no confusion with 2238 * the interlock. 2239 * 2240 * An in-memory record may be deleted before being committed to disk, 2241 * but could have been accessed in the mean time. The reservation 2242 * code will deal with the case. 2243 */ 2244 if (hammer_cursor_inmem(cursor)) { 2245 iprec = cursor->iprec; 2246 KKASSERT((iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0); 2247 iprec->flags |= HAMMER_RECF_DELETED_FE; 2248 iprec->flags |= HAMMER_RECF_DELETED_BE; 2249 KKASSERT(iprec->ip == ip); 2250 ++ip->rec_generation; 2251 return(0); 2252 } 2253 2254 /* 2255 * On-disk records are marked as deleted by updating their delete_tid. 2256 * This does not effect their position in the B-Tree (which is based 2257 * on their create_tid). 2258 * 2259 * Frontend B-Tree operations track inodes so we tell 2260 * hammer_delete_at_cursor() not to. 2261 */ 2262 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_LEAF); 2263 2264 if (error == 0) { 2265 error = hammer_delete_at_cursor( 2266 cursor, 2267 HAMMER_DELETE_ADJUST | hammer_nohistory(ip), 2268 cursor->trans->tid, 2269 cursor->trans->time32, 2270 0, NULL); 2271 } 2272 return(error); 2273 } 2274 2275 /* 2276 * Used to write a generic record w/optional data to the media b-tree 2277 * when no inode context is available. Used by the mirroring and 2278 * snapshot code. 2279 * 2280 * Caller must set cursor->key_beg to leaf->base. The cursor must be 2281 * flagged for backend operation and not flagged ASOF (since we are 2282 * doing an insertion). 2283 * 2284 * This function will acquire the appropriate sync lock and will set 2285 * the cursor insertion flag for the operation, do the btree lookup, 2286 * and the insertion, and clear the insertion flag and sync lock before 2287 * returning. The cursor state will be such that the caller can continue 2288 * scanning (used by the mirroring code). 2289 * 2290 * mode: HAMMER_CREATE_MODE_UMIRROR copyin data, check crc 2291 * HAMMER_CREATE_MODE_SYS bcopy data, generate crc 2292 * 2293 * NOTE: EDEADLK can be returned. The caller must do deadlock handling and 2294 * retry. 2295 * 2296 * EALREADY can be returned if the record already exists (WARNING, 2297 * because ASOF cannot be used no check is made for illegal 2298 * duplicates). 2299 * 2300 * NOTE: Do not use the function for normal inode-related records as this 2301 * functions goes directly to the media and is not integrated with 2302 * in-memory records. 2303 */ 2304 int 2305 hammer_create_at_cursor(hammer_cursor_t cursor, hammer_btree_leaf_elm_t leaf, 2306 void *udata, int mode) 2307 { 2308 hammer_transaction_t trans; 2309 hammer_buffer_t data_buffer; 2310 hammer_off_t ndata_offset; 2311 hammer_tid_t high_tid; 2312 void *ndata; 2313 int error; 2314 int doprop; 2315 2316 trans = cursor->trans; 2317 data_buffer = NULL; 2318 ndata_offset = 0; 2319 doprop = 0; 2320 2321 KKASSERT((cursor->flags & 2322 (HAMMER_CURSOR_BACKEND | HAMMER_CURSOR_ASOF)) == 2323 (HAMMER_CURSOR_BACKEND)); 2324 2325 hammer_sync_lock_sh(trans); 2326 2327 if (leaf->data_len) { 2328 ndata = hammer_alloc_data(trans, leaf->data_len, 2329 leaf->base.rec_type, 2330 &ndata_offset, &data_buffer, 2331 0, &error); 2332 if (ndata == NULL) { 2333 hammer_sync_unlock(trans); 2334 return (error); 2335 } 2336 leaf->data_offset = ndata_offset; 2337 hammer_modify_buffer(trans, data_buffer, NULL, 0); 2338 2339 switch(mode) { 2340 case HAMMER_CREATE_MODE_UMIRROR: 2341 error = copyin(udata, ndata, leaf->data_len); 2342 if (error == 0) { 2343 if (hammer_crc_test_leaf(ndata, leaf) == 0) { 2344 kprintf("data crc mismatch on pipe\n"); 2345 error = EINVAL; 2346 } else { 2347 error = hammer_cursor_localize_data( 2348 ndata, leaf); 2349 } 2350 } 2351 break; 2352 case HAMMER_CREATE_MODE_SYS: 2353 bcopy(udata, ndata, leaf->data_len); 2354 error = 0; 2355 hammer_crc_set_leaf(ndata, leaf); 2356 break; 2357 default: 2358 panic("hammer: hammer_create_at_cursor: bad mode %d", 2359 mode); 2360 break; /* NOT REACHED */ 2361 } 2362 hammer_modify_buffer_done(data_buffer); 2363 } else { 2364 leaf->data_offset = 0; 2365 error = 0; 2366 ndata = NULL; 2367 } 2368 if (error) 2369 goto failed; 2370 2371 /* 2372 * Do the insertion. This can fail with a EDEADLK or EALREADY 2373 */ 2374 cursor->flags |= HAMMER_CURSOR_INSERT; 2375 error = hammer_btree_lookup(cursor); 2376 if (error != ENOENT) { 2377 if (error == 0) 2378 error = EALREADY; 2379 goto failed; 2380 } 2381 error = hammer_btree_insert(cursor, leaf, &doprop); 2382 2383 /* 2384 * Cursor is left on current element, we want to skip it now. 2385 * (in case the caller is scanning) 2386 */ 2387 cursor->flags |= HAMMER_CURSOR_ATEDISK; 2388 cursor->flags &= ~HAMMER_CURSOR_INSERT; 2389 2390 /* 2391 * If the insertion happens to be creating (and not just replacing) 2392 * an inode we have to track it. 2393 */ 2394 if (error == 0 && 2395 leaf->base.rec_type == HAMMER_RECTYPE_INODE && 2396 leaf->base.delete_tid == 0) { 2397 hammer_modify_volume_field(trans, trans->rootvol, 2398 vol0_stat_inodes); 2399 ++trans->hmp->rootvol->ondisk->vol0_stat_inodes; 2400 hammer_modify_volume_done(trans->rootvol); 2401 } 2402 2403 /* 2404 * vol0_next_tid must track the highest TID stored in the filesystem. 2405 * We do not need to generate undo for this update. 2406 */ 2407 high_tid = leaf->base.create_tid; 2408 if (high_tid < leaf->base.delete_tid) 2409 high_tid = leaf->base.delete_tid; 2410 if (trans->rootvol->ondisk->vol0_next_tid < high_tid) { 2411 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2412 trans->rootvol->ondisk->vol0_next_tid = high_tid; 2413 hammer_modify_volume_done(trans->rootvol); 2414 } 2415 2416 /* 2417 * WARNING! cursor's leaf pointer may have changed after 2418 * do_propagation returns. 2419 */ 2420 if (error == 0 && doprop) 2421 hammer_btree_do_propagation(cursor, NULL, leaf); 2422 2423 failed: 2424 /* 2425 * Cleanup 2426 */ 2427 if (error && leaf->data_offset) { 2428 hammer_blockmap_free(trans, leaf->data_offset, leaf->data_len); 2429 2430 } 2431 hammer_sync_unlock(trans); 2432 if (data_buffer) 2433 hammer_rel_buffer(data_buffer, 0); 2434 return (error); 2435 } 2436 2437 /* 2438 * Delete the B-Tree element at the current cursor and do any necessary 2439 * mirror propagation. 2440 * 2441 * The cursor must be properly positioned for an iteration on return but 2442 * may be pointing at an internal element. 2443 * 2444 * An element can be un-deleted by passing a delete_tid of 0 with 2445 * HAMMER_DELETE_ADJUST. 2446 */ 2447 int 2448 hammer_delete_at_cursor(hammer_cursor_t cursor, int delete_flags, 2449 hammer_tid_t delete_tid, u_int32_t delete_ts, 2450 int track, int64_t *stat_bytes) 2451 { 2452 struct hammer_btree_leaf_elm save_leaf; 2453 hammer_transaction_t trans; 2454 hammer_btree_leaf_elm_t leaf; 2455 hammer_node_t node; 2456 hammer_btree_elm_t elm; 2457 hammer_off_t data_offset; 2458 int32_t data_len; 2459 u_int16_t rec_type; 2460 int error; 2461 int icount; 2462 int doprop; 2463 2464 error = hammer_cursor_upgrade(cursor); 2465 if (error) 2466 return(error); 2467 2468 trans = cursor->trans; 2469 node = cursor->node; 2470 elm = &node->ondisk->elms[cursor->index]; 2471 leaf = &elm->leaf; 2472 KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD); 2473 2474 hammer_sync_lock_sh(trans); 2475 doprop = 0; 2476 icount = 0; 2477 2478 /* 2479 * Adjust the delete_tid. Update the mirror_tid propagation field 2480 * as well. delete_tid can be 0 (undelete -- used by mirroring). 2481 */ 2482 if (delete_flags & HAMMER_DELETE_ADJUST) { 2483 if (elm->base.rec_type == HAMMER_RECTYPE_INODE) { 2484 if (elm->leaf.base.delete_tid == 0 && delete_tid) 2485 icount = -1; 2486 if (elm->leaf.base.delete_tid && delete_tid == 0) 2487 icount = 1; 2488 } 2489 2490 hammer_modify_node(trans, node, elm, sizeof(*elm)); 2491 elm->leaf.base.delete_tid = delete_tid; 2492 elm->leaf.delete_ts = delete_ts; 2493 hammer_modify_node_done(node); 2494 2495 if (elm->leaf.base.delete_tid > node->ondisk->mirror_tid) { 2496 hammer_modify_node_field(trans, node, mirror_tid); 2497 node->ondisk->mirror_tid = elm->leaf.base.delete_tid; 2498 hammer_modify_node_done(node); 2499 doprop = 1; 2500 if (hammer_debug_general & 0x0002) { 2501 kprintf("delete_at_cursor: propagate %016llx" 2502 " @%016llx\n", 2503 (long long)elm->leaf.base.delete_tid, 2504 (long long)node->node_offset); 2505 } 2506 } 2507 2508 /* 2509 * Adjust for the iteration. We have deleted the current 2510 * element and want to clear ATEDISK so the iteration does 2511 * not skip the element after, which now becomes the current 2512 * element. This element must be re-tested if doing an 2513 * iteration, which is handled by the RETEST flag. 2514 */ 2515 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2516 cursor->flags |= HAMMER_CURSOR_RETEST; 2517 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2518 } 2519 2520 /* 2521 * An on-disk record cannot have the same delete_tid 2522 * as its create_tid. In a chain of record updates 2523 * this could result in a duplicate record. 2524 */ 2525 KKASSERT(elm->leaf.base.delete_tid != 2526 elm->leaf.base.create_tid); 2527 } 2528 2529 /* 2530 * Destroy the B-Tree element if asked (typically if a nohistory 2531 * file or mount, or when called by the pruning code). 2532 * 2533 * Adjust the ATEDISK flag to properly support iterations. 2534 */ 2535 if (delete_flags & HAMMER_DELETE_DESTROY) { 2536 data_offset = elm->leaf.data_offset; 2537 data_len = elm->leaf.data_len; 2538 rec_type = elm->leaf.base.rec_type; 2539 if (doprop) { 2540 save_leaf = elm->leaf; 2541 leaf = &save_leaf; 2542 } 2543 if (elm->base.rec_type == HAMMER_RECTYPE_INODE && 2544 elm->leaf.base.delete_tid == 0) { 2545 icount = -1; 2546 } 2547 2548 error = hammer_btree_delete(cursor); 2549 if (error == 0) { 2550 /* 2551 * The deletion moves the next element (if any) to 2552 * the current element position. We must clear 2553 * ATEDISK so this element is not skipped and we 2554 * must set RETEST to force any iteration to re-test 2555 * the element. 2556 */ 2557 if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) { 2558 cursor->flags |= HAMMER_CURSOR_RETEST; 2559 cursor->flags &= ~HAMMER_CURSOR_ATEDISK; 2560 } 2561 } 2562 if (error == 0) { 2563 switch(data_offset & HAMMER_OFF_ZONE_MASK) { 2564 case HAMMER_ZONE_LARGE_DATA: 2565 case HAMMER_ZONE_SMALL_DATA: 2566 case HAMMER_ZONE_META: 2567 hammer_blockmap_free(trans, 2568 data_offset, data_len); 2569 break; 2570 default: 2571 break; 2572 } 2573 } 2574 } 2575 2576 /* 2577 * Track inode count and next_tid. This is used by the mirroring 2578 * and PFS code. icount can be negative, zero, or positive. 2579 */ 2580 if (error == 0 && track) { 2581 if (icount) { 2582 hammer_modify_volume_field(trans, trans->rootvol, 2583 vol0_stat_inodes); 2584 trans->rootvol->ondisk->vol0_stat_inodes += icount; 2585 hammer_modify_volume_done(trans->rootvol); 2586 } 2587 if (trans->rootvol->ondisk->vol0_next_tid < delete_tid) { 2588 hammer_modify_volume(trans, trans->rootvol, NULL, 0); 2589 trans->rootvol->ondisk->vol0_next_tid = delete_tid; 2590 hammer_modify_volume_done(trans->rootvol); 2591 } 2592 } 2593 2594 /* 2595 * mirror_tid propagation occurs if the node's mirror_tid had to be 2596 * updated while adjusting the delete_tid. 2597 * 2598 * This occurs when deleting even in nohistory mode, but does not 2599 * occur when pruning an already-deleted node. 2600 * 2601 * cursor->ip is NULL when called from the pruning, mirroring, 2602 * and pfs code. If non-NULL propagation will be conditionalized 2603 * on whether the PFS is in no-history mode or not. 2604 * 2605 * WARNING: cursor's leaf pointer may have changed after do_propagation 2606 * returns! 2607 */ 2608 if (doprop) { 2609 if (cursor->ip) 2610 hammer_btree_do_propagation(cursor, cursor->ip->pfsm, leaf); 2611 else 2612 hammer_btree_do_propagation(cursor, NULL, leaf); 2613 } 2614 hammer_sync_unlock(trans); 2615 return (error); 2616 } 2617 2618 /* 2619 * Determine whether we can remove a directory. This routine checks whether 2620 * a directory is empty or not and enforces flush connectivity. 2621 * 2622 * Flush connectivity requires that we block if the target directory is 2623 * currently flushing, otherwise it may not end up in the same flush group. 2624 * 2625 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure. 2626 */ 2627 int 2628 hammer_ip_check_directory_empty(hammer_transaction_t trans, hammer_inode_t ip) 2629 { 2630 struct hammer_cursor cursor; 2631 int error; 2632 2633 /* 2634 * Check directory empty 2635 */ 2636 hammer_init_cursor(trans, &cursor, &ip->cache[1], ip); 2637 2638 cursor.key_beg.localization = ip->obj_localization + 2639 hammer_dir_localization(ip); 2640 cursor.key_beg.obj_id = ip->obj_id; 2641 cursor.key_beg.create_tid = 0; 2642 cursor.key_beg.delete_tid = 0; 2643 cursor.key_beg.obj_type = 0; 2644 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; 2645 cursor.key_beg.key = HAMMER_MIN_KEY; 2646 2647 cursor.key_end = cursor.key_beg; 2648 cursor.key_end.rec_type = 0xFFFF; 2649 cursor.key_end.key = HAMMER_MAX_KEY; 2650 2651 cursor.asof = ip->obj_asof; 2652 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 2653 2654 error = hammer_ip_first(&cursor); 2655 if (error == ENOENT) 2656 error = 0; 2657 else if (error == 0) 2658 error = ENOTEMPTY; 2659 hammer_done_cursor(&cursor); 2660 return(error); 2661 } 2662 2663 /* 2664 * Localize the data payload. Directory entries may need their 2665 * localization adjusted. 2666 */ 2667 static 2668 int 2669 hammer_cursor_localize_data(hammer_data_ondisk_t data, 2670 hammer_btree_leaf_elm_t leaf) 2671 { 2672 u_int32_t localization; 2673 2674 if (leaf->base.rec_type == HAMMER_RECTYPE_DIRENTRY) { 2675 localization = leaf->base.localization & 2676 HAMMER_LOCALIZE_PSEUDOFS_MASK; 2677 if (data->entry.localization != localization) { 2678 data->entry.localization = localization; 2679 hammer_crc_set_leaf(data, leaf); 2680 } 2681 } 2682 return(0); 2683 } 2684