1 /* 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_object.c,v 1.48 2008/05/02 01:00:42 dillon Exp $
 */

#include "hammer.h"

static int hammer_mem_add(hammer_transaction_t trans, hammer_record_t record);
static int hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip);
static int hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip);

/*
 * Red-black tree support.
 *
 * Sort order for in-memory records in an inode's rec_tree: by record
 * type, then key, then create_tid.  A create_tid of 0 acts as an
 * "infinite" timestamp and sorts after any explicit create_tid.
 */
static int
hammer_rec_rb_compare(hammer_record_t rec1, hammer_record_t rec2)
{
	if (rec1->rec.base.base.rec_type < rec2->rec.base.base.rec_type)
		return(-1);
	if (rec1->rec.base.base.rec_type > rec2->rec.base.base.rec_type)
		return(1);

	if (rec1->rec.base.base.key < rec2->rec.base.base.key)
		return(-1);
	if (rec1->rec.base.base.key > rec2->rec.base.base.key)
		return(1);

	/* create_tid of 0 sorts after all explicit timestamps */
	if (rec1->rec.base.base.create_tid == 0) {
		if (rec2->rec.base.base.create_tid == 0)
			return(0);
		return(1);
	}
	if (rec2->rec.base.base.create_tid == 0)
		return(-1);

	if (rec1->rec.base.base.create_tid < rec2->rec.base.base.create_tid)
		return(-1);
	if (rec1->rec.base.base.create_tid > rec2->rec.base.base.create_tid)
		return(1);

	/*
	 * Never match against an item deleted by the front-end.
	 */
	if (rec1->flags & HAMMER_RECF_DELETED_FE)
		return(1);
	if (rec2->flags & HAMMER_RECF_DELETED_FE)
		return(-1);

	return(0);
}

/*
 * Compare a lookup key (info) against an in-memory record.
 *
 * A rec_type mismatch returns -3/+3 and a key mismatch returns -2/+2 so
 * range-scan code can distinguish hard mismatches from create_tid
 * (as-of) mismatches, which return only -1/+1.  See hammer_rec_scan_cmp().
 */
static int
hammer_rec_compare(hammer_base_elm_t info, hammer_record_t rec)
{
	if (info->rec_type < rec->rec.base.base.rec_type)
		return(-3);
	if (info->rec_type > rec->rec.base.base.rec_type)
		return(3);

	if (info->key < rec->rec.base.base.key)
		return(-2);
	if (info->key > rec->rec.base.base.key)
		return(2);

	/* create_tid of 0 sorts after all explicit timestamps */
	if (info->create_tid == 0) {
		if (rec->rec.base.base.create_tid == 0)
			return(0);
		return(1);
	}
	if (rec->rec.base.base.create_tid == 0)
		return(-1);
	if (info->create_tid < rec->rec.base.base.create_tid)
		return(-1);
	if (info->create_tid > rec->rec.base.base.create_tid)
		return(1);
	return(0);
}

/*
 * RB_SCAN comparison code for hammer_mem_first().  The argument order
 * is reversed so the comparison result has to be negated.  key_beg and
 * key_end are both range-inclusive.
 *
 * The creation timestamp can cause hammer_rec_compare() to return -1 or +1.
 * These do not stop the scan.
 *
 * Localized deletions are not cached in-memory.
 */
static
int
hammer_rec_scan_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	/* record lies entirely below key_beg: keep scanning right */
	r = hammer_rec_compare(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	/* record lies entirely above key_end: keep scanning left */
	r = hammer_rec_compare(&cursor->key_end, rec);
	if (r < -1)
		return(1);
	return(0);
}

/*
 * This compare function is used when simply looking up key_beg.
 */
static
int
hammer_rec_find_cmp(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;
	int r;

	/* only hard (type/key) mismatches terminate; -1/+1 are as-of noise */
	r = hammer_rec_compare(&cursor->key_beg, rec);
	if (r > 1)
		return(-1);
	if (r < -1)
		return(1);
	return(0);
}

RB_GENERATE(hammer_rec_rb_tree, hammer_record, rb_node, hammer_rec_rb_compare);
RB_GENERATE_XLOOKUP(hammer_rec_rb_tree, INFO, hammer_record, rb_node,
		    hammer_rec_compare, hammer_base_elm_t);

/*
 * Allocate a record for the caller to finish filling in.  The record is
 * returned referenced.
 */
hammer_record_t
hammer_alloc_mem_record(hammer_inode_t ip)
{
	hammer_record_t record;

	++hammer_count_records;
	record = kmalloc(sizeof(*record), M_HAMMER, M_WAITOK|M_ZERO);
	record->flush_state = HAMMER_FST_IDLE;
	record->ip = ip;
	record->rec.base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	hammer_ref(&record->lock);
	return (record);
}

/*
 * Block until the record is no longer being flushed to the media by the
 * backend.  HAMMER_RECF_WANTED requests a wakeup() from
 * hammer_flush_record_done().
 */
void
hammer_wait_mem_record(hammer_record_t record)
{
	while (record->flush_state == HAMMER_FST_FLUSH) {
		record->flags |= HAMMER_RECF_WANTED;
		tsleep(record, 0, "hmrrc2", 0);
	}
}

/*
 * Called from the backend, hammer_inode.c, after a record has been
 * flushed to disk.  The record has been exclusively locked by the
 * caller and interlocked with BE.
 *
 * We clean up the state, unlock, and release the record (the record
 * was referenced by the fact that it was in the HAMMER_FST_FLUSH state).
 */
void
hammer_flush_record_done(hammer_record_t record, int error)
{
	hammer_inode_t target_ip;
	int cleanup = 0;

	KKASSERT(record->flush_state == HAMMER_FST_FLUSH);
	KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE);

	if (error) {
		/*
		 * An error occurred, the backend was unable to sync the
		 * record to its media.  Leave the record intact.
		 */
		Debugger("flush_record_done error");
	} else if (record->flags & HAMMER_RECF_CONVERT_DELETE) {
		/*
		 * deleted-record to delete-on-disk conversion, occurs when
		 * we sync a record to disk which is marked deleted by the
		 * frontend, but not deleted from the point of view of the
		 * backend.
		 */
		if (record->flags & HAMMER_RECF_DELETED_BE) {
			record->flags |= HAMMER_RECF_DELETED_FE;
			cleanup = 1;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_DEL);
		}
	} else {
		/*
		 * Normal completion, record has been disposed of (by
		 * having been synchronized to the media).
		 */
		record->flags |= HAMMER_RECF_DELETED_FE;
		record->flags |= HAMMER_RECF_DELETED_BE;
		cleanup = 1;
	}
	if (cleanup) {
		/* fully disposed: sever the target-inode dependency */
		if ((target_ip = record->target_ip) != NULL) {
			TAILQ_REMOVE(&target_ip->target_list, record,
				     target_entry);
			record->target_ip = NULL;
			hammer_test_inode(target_ip);
		}
		record->flush_state = HAMMER_FST_IDLE;
	} else {
		/* record survives; re-enter SETUP if a dependency remains */
		if (record->target_ip)
			record->flush_state = HAMMER_FST_SETUP;
		else
			record->flush_state = HAMMER_FST_IDLE;
	}

	record->flags &= ~HAMMER_RECF_INTERLOCK_BE;
	record->flags &= ~HAMMER_RECF_CONVERT_DELETE;
	/* wake anyone blocked in hammer_wait_mem_record() */
	if (record->flags & HAMMER_RECF_WANTED) {
		record->flags &= ~HAMMER_RECF_WANTED;
		wakeup(record);
	}
	hammer_rel_mem_record(record);
}

/*
 * Release a memory record.  Records marked for deletion are immediately
 * removed from the RB-Tree but otherwise left intact until the last ref
 * goes away.
 */
void
hammer_rel_mem_record(struct hammer_record *record)
{
	hammer_inode_t ip, target_ip;

	hammer_unref(&record->lock);

	/* final destruction only once deleted-by-frontend and refs drop to 0 */
	if (record->flags & HAMMER_RECF_DELETED_FE) {
		if (record->lock.refs == 0) {
			KKASSERT(record->flush_state != HAMMER_FST_FLUSH);

			ip = record->ip;
			if ((target_ip = record->target_ip) != NULL) {
				TAILQ_REMOVE(&target_ip->target_list,
					     record, target_entry);
				record->target_ip = NULL;
				hammer_test_inode(target_ip);
			}

			if (record->flags & HAMMER_RECF_ONRBTREE) {
				RB_REMOVE(hammer_rec_rb_tree,
					  &record->ip->rec_tree,
					  record);
				record->flags &= ~HAMMER_RECF_ONRBTREE;
			}
			/* free out-of-band data copy, if we allocated one */
			if (record->flags & HAMMER_RECF_ALLOCDATA) {
				--hammer_count_record_datas;
				kfree(record->data, M_HAMMER);
				record->flags &= ~HAMMER_RECF_ALLOCDATA;
			}
			record->data = NULL;
			--hammer_count_records;
			kfree(record, M_HAMMER);
			return;
		}
	}
}

/*
 * Record visibility depends on whether the record is being accessed by
 * the backend or the frontend.
 *
 * Return non-zero if the record is visible, zero if it isn't or if it is
 * deleted.
 */
static __inline
int
hammer_ip_iterate_mem_good(hammer_cursor_t cursor, hammer_record_t record)
{
	if (cursor->flags & HAMMER_CURSOR_BACKEND) {
		/* backend only sees records it has interlocked for flush */
		if (record->flags & HAMMER_RECF_DELETED_BE)
			return(0);
		if ((record->flags & HAMMER_RECF_INTERLOCK_BE) == 0)
			return(0);
	} else {
		if (record->flags & HAMMER_RECF_DELETED_FE)
			return(0);
	}
	return(1);
}

/*
 * This callback is used as part of the RB_SCAN function for in-memory
 * records.  We terminate it (return -1) as soon as we get a match.
 *
 * This routine is used by frontend code.
 *
 * The primary compare code does not account for ASOF lookups.  This
 * code handles that case as well as a few others.
 */
static
int
hammer_rec_scan_callback(hammer_record_t rec, void *data)
{
	hammer_cursor_t cursor = data;

	/*
	 * We terminate on success, so this should be NULL on entry.
	 */
	KKASSERT(cursor->iprec == NULL);

	/*
	 * Skip if the record was marked deleted.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0)
		return(0);

	/*
	 * Skip if not visible due to our as-of TID
	 */
	if (cursor->flags & HAMMER_CURSOR_ASOF) {
		if (cursor->asof < rec->rec.base.base.create_tid)
			return(0);
		if (rec->rec.base.base.delete_tid &&
		    cursor->asof >= rec->rec.base.base.delete_tid) {
			return(0);
		}
	}

	/*
	 * If the record is queued to the flusher we have to block until
	 * it isn't.  Otherwise we may see duplication between our memory
	 * cache and the media.
	 */
	hammer_ref(&rec->lock);

#warning "This deadlocks"
#if 0
	if (rec->flush_state == HAMMER_FST_FLUSH)
		hammer_wait_mem_record(rec);
#endif

	/*
	 * The record may have been deleted while we were blocked.
	 */
	if (hammer_ip_iterate_mem_good(cursor, rec) == 0) {
		hammer_rel_mem_record(rec);
		return(0);
	}

	/*
	 * Set the matching record and stop the scan.
	 */
	cursor->iprec = rec;
	return(-1);
}


/*
 * Lookup an in-memory record given the key specified in the cursor.  Works
 * just like hammer_btree_lookup() but operates on an inode's in-memory
 * record list.
 *
 * The lookup must fail if the record is marked for deferred deletion.
 */
static
int
hammer_mem_lookup(hammer_cursor_t cursor, hammer_inode_t ip)
{
	int error;

	/* release any record held from a prior lookup/iteration */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/* move the cursor's inode reference from the old ip to the new one */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	/* exact lookup: uses find_cmp, callback stores match in iprec */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_find_cmp,
				   hammer_rec_scan_callback, cursor);

	if (cursor->iprec == NULL)
		error = ENOENT;
	else
		error = 0;
	return(error);
}

/*
 * hammer_mem_first() - locate the first in-memory record matching the
 * cursor within the bounds of the key range.
 */
static
int
hammer_mem_first(hammer_cursor_t cursor, hammer_inode_t ip)
{
	/* release any record held from a prior lookup/iteration */
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
	/* move the cursor's inode reference from the old ip to the new one */
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
	}
	cursor->ip = ip;
#if 0
	hammer_rec_rb_tree_scan_info_link(&cursor->scan, &ip->rec_tree);
#endif
	++ip->cursor_ip_refs;

#if 0
	cursor->scan.node = NULL;
#endif
	/* range scan: uses scan_cmp (key_beg..key_end inclusive) */
	hammer_rec_rb_tree_RB_SCAN(&ip->rec_tree, hammer_rec_scan_cmp,
				   hammer_rec_scan_callback, cursor);

	/*
	 * Adjust scan.node and keep it linked into the RB-tree so we can
	 * hold the cursor through third party modifications of the RB-tree.
	 */
	if (cursor->iprec) {
#if 0
		cursor->scan.node = hammer_rec_rb_tree_RB_NEXT(cursor->iprec);
#endif
		return(0);
	}
	return(ENOENT);
}

/*
 * Release the cursor's in-memory state: the inode reference taken by
 * hammer_mem_lookup()/hammer_mem_first() and any held record.
 */
void
hammer_mem_done(hammer_cursor_t cursor)
{
	if (cursor->ip) {
		KKASSERT(cursor->ip->cursor_ip_refs > 0);
		--cursor->ip->cursor_ip_refs;
#if 0
		hammer_rec_rb_tree_scan_info_done(&cursor->scan,
						  &cursor->ip->rec_tree);
#endif
		cursor->ip = NULL;
	}
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}
}

/************************************************************************
 *		     HAMMER IN-MEMORY RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions manipulate in-memory records.  Such records typically
 * exist prior to being committed to disk or indexed via the on-disk B-Tree.
 */

/*
 * Add a directory entry (dip,ncp) which references inode (ip).
 *
 * Note that the low 32 bits of the namekey are set temporarily to create
 * a unique in-memory record, and may be modified a second time when the
 * record is synchronized to disk.  In particular, the low 32 bits cannot be
 * all 0's when synching to disk, which is not handled here.
 */
int
hammer_ip_add_directory(struct hammer_transaction *trans,
		     struct hammer_inode *dip, struct namecache *ncp,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;
	int bytes;

	record = hammer_alloc_mem_record(dip);

	bytes = ncp->nc_nlen;	/* NOTE: terminating \0 is NOT included */
	/* advance the namekey iterator, skipping 0 (reserved/invalid) */
	if (++trans->hmp->namekey_iterator == 0)
		++trans->hmp->namekey_iterator;

	record->type = HAMMER_MEM_RECORD_ADD;
	record->rec.entry.base.base.obj_id = dip->obj_id;
	/* key = name hash in upper bits, iterator in low 32 for uniqueness */
	record->rec.entry.base.base.key =
		hammer_directory_namekey(ncp->nc_name, bytes);
	record->rec.entry.base.base.key += trans->hmp->namekey_iterator;
	record->rec.entry.base.base.rec_type = HAMMER_RECTYPE_DIRENTRY;
	record->rec.entry.base.base.obj_type = ip->ino_rec.base.base.obj_type;
	record->rec.entry.obj_id = ip->obj_id;
	record->data = (void *)ncp->nc_name;
	record->rec.entry.base.data_len = bytes;
	++ip->ino_rec.ino_nlinks;
	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

	/*
	 * The target inode and the directory entry are bound together.
	 */
	record->target_ip = ip;
	record->flush_state = HAMMER_FST_SETUP;
	TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

	/*
	 * The inode now has a dependency and must be taken out of the idle
	 * state.  An inode not in an idle state is given an extra reference.
	 */
	if (ip->flush_state == HAMMER_FST_IDLE) {
		hammer_ref(&ip->lock);
		ip->flush_state = HAMMER_FST_SETUP;
	}

	/* NOTE: copies record->data */
	error = hammer_mem_add(trans, record);
	return(error);
}

/*
 * Delete the directory entry and update the inode link count.  The
 * cursor must be seeked to the directory entry record being deleted.
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * This function can return EDEADLK requiring the caller to terminate
 * the cursor, any locks, wait on the returned record, and retry.
 */
int
hammer_ip_del_directory(struct hammer_transaction *trans,
		     hammer_cursor_t cursor, struct hammer_inode *dip,
		     struct hammer_inode *ip)
{
	hammer_record_t record;
	int error;

	if (cursor->record == &cursor->iprec->rec) {
		/*
		 * In-memory (unsynchronized) records can simply be freed.
		 * Even though the HAMMER_RECF_DELETED_FE flag is ignored
		 * by the backend, we must still avoid races against the
		 * backend potentially syncing the record to the media.
		 *
		 * We cannot call hammer_ip_delete_record(), that routine may
		 * only be called from the backend.
		 */
		record = cursor->iprec;
		if (record->flags & HAMMER_RECF_INTERLOCK_BE) {
			/* backend owns the record; caller must retry */
			KKASSERT(cursor->deadlk_rec == NULL);
			hammer_ref(&record->lock);
			cursor->deadlk_rec = record;
			error = EDEADLK;
		} else {
			KKASSERT(record->type == HAMMER_MEM_RECORD_ADD);
			record->flags |= HAMMER_RECF_DELETED_FE;
			error = 0;
		}
	} else {
		/*
		 * If the record is on-disk we have to queue the deletion by
		 * the record's key.  This also causes lookups to skip the
		 * record.
		 */
		record = hammer_alloc_mem_record(dip);
		record->type = HAMMER_MEM_RECORD_DEL;
		record->rec.entry.base.base = cursor->record->base.base;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

		record->target_ip = ip;
		record->flush_state = HAMMER_FST_SETUP;
		TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);

		/*
		 * The inode now has a dependency and must be taken out of
		 * the idle state.  An inode not in an idle state is given
		 * an extra reference.
		 */
		if (ip->flush_state == HAMMER_FST_IDLE) {
			hammer_ref(&ip->lock);
			ip->flush_state = HAMMER_FST_SETUP;
		}

		error = hammer_mem_add(trans, record);
	}

	/*
	 * One less link.  The file may still be open in the OS even after
	 * all links have gone away so we only try to sync if the OS has
	 * no references and nlinks falls to 0.
	 *
	 * We have to terminate the cursor before syncing the inode to
	 * avoid deadlocking against ourselves.
	 *
	 * XXX we can't sync the inode here because the encompassing
	 * transaction might be a rename and might update the inode
	 * again with a new link.  That would force the delete_tid to be
	 * the same as the create_tid and cause a panic.
	 */
	if (error == 0) {
		--ip->ino_rec.ino_nlinks;
		hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);
		if (ip->ino_rec.ino_nlinks == 0 &&
		    (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
			hammer_done_cursor(cursor);
		}

	}
	return(error);
}

/*
 * Add a record to an inode.
 *
 * The caller must allocate the record with hammer_alloc_mem_record(ip) and
 * initialize the following additional fields:
 *
 * The related inode should be share-locked by the caller.  The caller is
 * on the frontend.
 *
 * record->rec.entry.base.base.key
 * record->rec.entry.base.base.rec_type
 * record->rec.entry.base.base.data_len
 * record->data (a copy will be kmalloc'd if it cannot be embedded)
 */
int
hammer_ip_add_record(struct hammer_transaction *trans, hammer_record_t record)
{
	hammer_inode_t ip = record->ip;
	int error;

	record->rec.base.base.obj_id = ip->obj_id;
	record->rec.base.base.obj_type = ip->ino_rec.base.base.obj_type;

	hammer_modify_inode(trans, ip, HAMMER_INODE_RDIRTY);

	/* NOTE: copies record->data */
	error = hammer_mem_add(trans, record);
	return(error);
}

/*
 * Sync data from a buffer cache buffer (typically) to the filesystem.  This
 * is called via the strategy called from a cached data source.  This code
 * is responsible for actually writing a data record out to the disk.
 *
 * This can only occur non-historically (i.e. 'current' data only).
 *
 * The file offset must be HAMMER_BUFSIZE aligned but the data length
 * can be truncated.  The record (currently) always represents a BUFSIZE
 * swath of space whether the data is truncated or not.
 */
int
hammer_ip_sync_data(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t offset, void *data, int bytes)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	union hammer_btree_elm elm;
	hammer_off_t rec_offset;
	void *bdata;
	int error;

	KKASSERT((offset & HAMMER_BUFMASK) == 0);
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	error = hammer_init_cursor(trans, &cursor, &ip->cache[0]);
	if (error)
		return(error);
	/* data records are keyed by the offset of the END of the range */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.key = offset + bytes;
	cursor.key_beg.create_tid = trans->tid;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.asof = trans->tid;
	cursor.flags |= HAMMER_CURSOR_INSERT;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	/*
	 * Issue a lookup to position the cursor.
	 */
	error = hammer_btree_lookup(&cursor);
	if (error == 0) {
		/* an existing match here indicates corruption */
		kprintf("hammer_ip_sync_data: duplicate data at "
			"(%lld,%d) tid %016llx\n",
			offset, bytes, trans->tid);
		hammer_print_btree_elm(&cursor.node->ondisk->elms[cursor.index],
				       HAMMER_BTREE_TYPE_LEAF, cursor.index);
		panic("Duplicate data");
		error = EIO;
	}
	if (error != ENOENT)
		goto done;

	/*
	 * Allocate record and data space.  HAMMER_RECTYPE_DATA records
	 * can cross buffer boundaries so we may have to split our bcopy.
	 */
	rec = hammer_alloc_record(trans, &rec_offset, HAMMER_RECTYPE_DATA,
				  &cursor.record_buffer,
				  bytes, &bdata,
				  &cursor.data_buffer, &error);
	if (rec == NULL)
		goto done;
	if (hammer_debug_general & 0x1000)
		kprintf("OOB RECOR2 DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, rec->base.data_len);

	/*
	 * Fill everything in and insert our B-Tree node.
	 *
	 * NOTE: hammer_alloc_record() has already marked the related
	 * buffers as modified.  If we do it again we will generate
	 * unnecessary undo elements.
	 */
	hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0);
	rec->base.base.btype = HAMMER_BTREE_TYPE_RECORD;
	rec->base.base.obj_id = ip->obj_id;
	rec->base.base.key = offset + bytes;
	rec->base.base.create_tid = trans->tid;
	rec->base.base.delete_tid = 0;
	rec->base.base.rec_type = HAMMER_RECTYPE_DATA;
	rec->base.data_crc = crc32(data, bytes);
	hammer_modify_buffer_done(cursor.record_buffer);
	KKASSERT(rec->base.data_len == bytes);

	hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0);
	bcopy(data, bdata, bytes);
	hammer_modify_buffer_done(cursor.data_buffer);

	elm.leaf.base = rec->base.base;
	elm.leaf.rec_offset = rec_offset;
	elm.leaf.data_offset = rec->base.data_off;
	elm.leaf.data_len = bytes;
	elm.leaf.data_crc = rec->base.data_crc;

	/*
	 * Data records can wind up on-disk before the inode itself is
	 * on-disk.  One must assume data records may be on-disk if either
	 * HAMMER_INODE_DONDISK or HAMMER_INODE_ONDISK is set
	 */
	ip->flags |= HAMMER_INODE_DONDISK;

	error = hammer_btree_insert(&cursor, &elm);
	if (error == 0)
		goto done;

	/* insert failed: release the allocated record+data space */
	hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE);
done:
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	return(error);
}

/*
 * Sync an in-memory record to the disk.
This is called by the backend. 796 * This code is responsible for actually writing a record out to the disk. 797 * 798 * This routine can only be called by the backend and the record 799 * must have been interlocked with BE. It will remain interlocked on 800 * return. The caller is responsible for the record's disposition. 801 */ 802 int 803 hammer_ip_sync_record(hammer_transaction_t trans, hammer_record_t record) 804 { 805 struct hammer_cursor cursor; 806 hammer_record_ondisk_t rec; 807 union hammer_btree_elm elm; 808 hammer_off_t rec_offset; 809 void *bdata; 810 int error; 811 812 KKASSERT(record->flush_state == HAMMER_FST_FLUSH); 813 KKASSERT(record->flags & HAMMER_RECF_INTERLOCK_BE); 814 815 retry: 816 /* 817 * Get a cursor, we will either be inserting or deleting. 818 */ 819 error = hammer_init_cursor(trans, &cursor, &record->ip->cache[0]); 820 if (error) 821 return(error); 822 cursor.key_beg = record->rec.base.base; 823 cursor.flags |= HAMMER_CURSOR_BACKEND; 824 825 /* 826 * If we are deleting an exact match must be found on-disk. 827 */ 828 if (record->type == HAMMER_MEM_RECORD_DEL) { 829 error = hammer_btree_lookup(&cursor); 830 if (error == 0) 831 error = hammer_ip_delete_record(&cursor, trans->tid); 832 goto done; 833 } 834 835 /* 836 * We are inserting. 837 * 838 * Issue a lookup to position the cursor and locate the cluster. The 839 * target key should not exist. If we are creating a directory entry 840 * we may have to iterate the low 32 bits of the key to find an unused 841 * key. 
842 */ 843 cursor.flags |= HAMMER_CURSOR_INSERT; 844 845 for (;;) { 846 error = hammer_btree_lookup(&cursor); 847 if (error) 848 break; 849 if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY) { 850 kprintf("hammer_ip_sync_record: duplicate rec " 851 "at (%016llx)\n", record->rec.base.base.key); 852 Debugger("duplicate record1"); 853 error = EIO; 854 break; 855 } 856 if (++trans->hmp->namekey_iterator == 0) 857 ++trans->hmp->namekey_iterator; 858 record->rec.base.base.key &= ~(0xFFFFFFFFLL); 859 record->rec.base.base.key |= trans->hmp->namekey_iterator; 860 cursor.key_beg.key = record->rec.base.base.key; 861 } 862 if (error != ENOENT) 863 goto done; 864 865 /* 866 * Allocate the record and data. The result buffers will be 867 * marked as being modified and further calls to 868 * hammer_modify_buffer() will result in unneeded UNDO records. 869 * 870 * Support zero-fill records (data == NULL and data_len != 0) 871 */ 872 if (record->data == NULL) { 873 rec = hammer_alloc_record(trans, &rec_offset, 874 record->rec.base.base.rec_type, 875 &cursor.record_buffer, 876 0, &bdata, 877 NULL, &error); 878 if (hammer_debug_general & 0x1000) 879 kprintf("NULL RECORD DATA\n"); 880 } else if (record->flags & HAMMER_RECF_INBAND) { 881 rec = hammer_alloc_record(trans, &rec_offset, 882 record->rec.base.base.rec_type, 883 &cursor.record_buffer, 884 record->rec.base.data_len, &bdata, 885 NULL, &error); 886 if (hammer_debug_general & 0x1000) 887 kprintf("INBAND RECORD DATA %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len); 888 } else { 889 rec = hammer_alloc_record(trans, &rec_offset, 890 record->rec.base.base.rec_type, 891 &cursor.record_buffer, 892 record->rec.base.data_len, &bdata, 893 &cursor.data_buffer, &error); 894 if (hammer_debug_general & 0x1000) 895 kprintf("OOB RECORD DATA REC %016llx DATA %016llx LEN=%d\n", rec_offset, rec->base.data_off, record->rec.base.data_len); 896 } 897 898 if (rec == NULL) 899 goto done; 900 901 /* 
902 * Fill in the remaining fields and insert our B-Tree node. 903 */ 904 hammer_modify_buffer(trans, cursor.record_buffer, NULL, 0); 905 rec->base.base = record->rec.base.base; 906 bcopy(&record->rec.base + 1, &rec->base + 1, 907 HAMMER_RECORD_SIZE - sizeof(record->rec.base)); 908 909 /* 910 * Copy the data and deal with zero-fill support. 911 */ 912 if (record->data && (record->flags & HAMMER_RECF_INBAND)) { 913 rec->base.data_crc = crc32(record->data, rec->base.data_len); 914 bcopy(record->data, bdata, rec->base.data_len); 915 } else if (record->data) { 916 rec->base.data_crc = crc32(record->data, rec->base.data_len); 917 hammer_modify_buffer(trans, cursor.data_buffer, NULL, 0); 918 bcopy(record->data, bdata, rec->base.data_len); 919 hammer_modify_buffer_done(cursor.data_buffer); 920 } else { 921 rec->base.data_len = record->rec.base.data_len; 922 } 923 hammer_modify_buffer_done(cursor.record_buffer); 924 925 elm.leaf.base = record->rec.base.base; 926 elm.leaf.rec_offset = rec_offset; 927 elm.leaf.data_offset = rec->base.data_off; 928 elm.leaf.data_len = rec->base.data_len; 929 elm.leaf.data_crc = rec->base.data_crc; 930 931 error = hammer_btree_insert(&cursor, &elm); 932 933 /* 934 * This occurs when the frontend creates a record and queues it to 935 * the backend, then tries to delete the record. The backend must 936 * still sync the record to the media as if it were not deleted, 937 * but must interlock with the frontend to ensure that the 938 * synchronized record is not visible to the frontend, which means 939 * converting it from an ADD record to a DEL record. 940 * 941 * The DEL record then masks the record synced to disk until another 942 * round can delete it for real. 
943 */ 944 if (error == 0 && (record->flags & HAMMER_RECF_CONVERT_DELETE)) { 945 KKASSERT(record->type == HAMMER_MEM_RECORD_ADD); 946 record->flags &= ~HAMMER_RECF_DELETED_FE; 947 record->type = HAMMER_MEM_RECORD_DEL; 948 if (record->flush_state == HAMMER_FST_SETUP) { 949 hammer_test_inode(record->ip); 950 hammer_test_inode(record->target_ip); 951 } 952 } 953 954 /* 955 * If the error occured unwind the operation. 956 */ 957 if (error) 958 hammer_blockmap_free(trans, rec_offset, HAMMER_RECORD_SIZE); 959 960 done: 961 hammer_done_cursor(&cursor); 962 if (error == EDEADLK) 963 goto retry; 964 return(error); 965 } 966 967 /* 968 * Add the record to the inode's rec_tree. The low 32 bits of a directory 969 * entry's key is used to deal with hash collisions in the upper 32 bits. 970 * A unique 64 bit key is generated in-memory and may be regenerated a 971 * second time when the directory record is flushed to the on-disk B-Tree. 972 * 973 * A referenced record is passed to this function. This function 974 * eats the reference. If an error occurs the record will be deleted. 975 * 976 * A copy of the temporary record->data pointer provided by the caller 977 * will be made. 978 */ 979 static 980 int 981 hammer_mem_add(struct hammer_transaction *trans, hammer_record_t record) 982 { 983 void *data; 984 int bytes; 985 int reclen; 986 987 /* 988 * Make a private copy of record->data 989 */ 990 if (record->data) { 991 /* 992 * Try to embed the data in extra space in the record 993 * union, otherwise allocate a copy. 
		 */
		bytes = record->rec.base.data_len;
		/* compute the fixed header length for this record type */
		switch(record->rec.base.base.rec_type) {
		case HAMMER_RECTYPE_DIRENTRY:
			reclen = offsetof(struct hammer_entry_record, name[0]);
			break;
		case HAMMER_RECTYPE_DATA:
			reclen = offsetof(struct hammer_data_record, data[0]);
			break;
		default:
			reclen = sizeof(record->rec);
			break;
		}
		if (reclen + bytes <= HAMMER_RECORD_SIZE) {
			/* data fits in-band, embed it after the header */
			bcopy(record->data, (char *)&record->rec + reclen,
			      bytes);
			record->data = (void *)((char *)&record->rec + reclen);
			record->flags |= HAMMER_RECF_INBAND;
		} else {
			/* too large: allocate a private out-of-band copy */
			++hammer_count_record_datas;
			data = kmalloc(bytes, M_HAMMER, M_WAITOK);
			record->flags |= HAMMER_RECF_ALLOCDATA;
			bcopy(record->data, data, bytes);
			record->data = data;
		}
	}

	/*
	 * Insert into the RB tree, find an unused iterator if this is
	 * a directory entry.
	 */
	while (RB_INSERT(hammer_rec_rb_tree, &record->ip->rec_tree, record)) {
		if (record->rec.base.base.rec_type != HAMMER_RECTYPE_DIRENTRY){
			/* non-directory collision is fatal; drop the record */
			record->flags |= HAMMER_RECF_DELETED_FE;
			hammer_rel_mem_record(record);
			return (EEXIST);
		}
		/* regenerate the low 32 key bits and retry the insert */
		if (++trans->hmp->namekey_iterator == 0)
			++trans->hmp->namekey_iterator;
		record->rec.base.base.key &= ~(0xFFFFFFFFLL);
		record->rec.base.base.key |= trans->hmp->namekey_iterator;
	}
	record->flags |= HAMMER_RECF_ONRBTREE;
	hammer_modify_inode(trans, record->ip, HAMMER_INODE_XDIRTY);
	hammer_rel_mem_record(record);
	return(0);
}

/************************************************************************
 *		   HAMMER INODE MERGED-RECORD FUNCTIONS			*
 ************************************************************************
 *
 * These functions augment the B-Tree scanning functions in hammer_btree.c
 * by merging in-memory records with on-disk records.
 */

/*
 * Locate a particular record either in-memory or on-disk.
 *
 * NOTE: This is basically a standalone routine, hammer_ip_next() may
 *	 NOT be called to iterate results.
 *
 * Returns 0 with cursor->record set on success, ENOENT if nothing was
 * found in memory or on disk, or another error from the B-Tree code.
 */
int
hammer_ip_lookup(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * If the element is in-memory return it without searching the
	 * on-disk B-Tree
	 */
	error = hammer_mem_lookup(cursor, ip);
	if (error == 0) {
		cursor->record = &cursor->iprec->rec;
		return(error);
	}
	if (error != ENOENT)
		return(error);

	/*
	 * If the inode has on-disk components search the on-disk B-Tree.
	 */
	if ((ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) == 0)
		return(error);
	error = hammer_btree_lookup(cursor);
	if (error == 0)
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	return(error);
}

/*
 * Locate the first record within the cursor's key_beg/key_end range,
 * restricted to a particular inode.  0 is returned on success, ENOENT
 * if no records matched the requested range, or some other error.
 *
 * When 0 is returned hammer_ip_next() may be used to iterate additional
 * records within the requested range.
 *
 * This function can return EDEADLK, requiring the caller to terminate
 * the cursor and try again.
 */
int
hammer_ip_first(hammer_cursor_t cursor, struct hammer_inode *ip)
{
	int error;

	/*
	 * Clean up fields and setup for merged scan.  Start pessimistic:
	 * both sources are marked eaten/EOF and the flags are cleared as
	 * each source actually produces a record below.
	 */
	cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
	cursor->flags |= HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM;
	cursor->flags |= HAMMER_CURSOR_DISKEOF | HAMMER_CURSOR_MEMEOF;
	if (cursor->iprec) {
		hammer_rel_mem_record(cursor->iprec);
		cursor->iprec = NULL;
	}

	/*
	 * Search the on-disk B-Tree.  hammer_btree_lookup() only does an
	 * exact lookup so if we get ENOENT we have to call the iterate
	 * function to validate the first record after the begin key.
	 *
	 * The ATEDISK flag is used by hammer_btree_iterate to determine
	 * whether it must index forwards or not.  It is also used here
	 * to select the next record from in-memory or on-disk.
	 *
	 * EDEADLK can only occur if the lookup hit an empty internal
	 * element and couldn't delete it.  Since this could only occur
	 * in-range, we can just iterate from the failure point.
	 */
	if (ip->flags & (HAMMER_INODE_ONDISK|HAMMER_INODE_DONDISK)) {
		error = hammer_btree_lookup(cursor);
		if (error == ENOENT || error == EDEADLK) {
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			if (hammer_debug_general & 0x2000)
				kprintf("error %d node %p %016llx index %d\n",
					error, cursor->node,
					cursor->node->node_offset,
					cursor->index);
			error = hammer_btree_iterate(cursor);
		}
		if (error && error != ENOENT)
			return(error);
		if (error == 0) {
			cursor->flags &= ~HAMMER_CURSOR_DISKEOF;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		} else {
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
		}
	}

	/*
	 * Search the in-memory record list (Red-Black tree).  Unlike the
	 * B-Tree search, mem_first checks for records in the range.
	 */
	error = hammer_mem_first(cursor, ip);
	if (error && error != ENOENT)
		return(error);
	if (error == 0) {
		cursor->flags &= ~HAMMER_CURSOR_MEMEOF;
		cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
		if (hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0)
			cursor->flags |= HAMMER_CURSOR_ATEMEM;
	}

	/*
	 * This will return the first matching record.
	 */
	return(hammer_ip_next(cursor));
}

/*
 * Retrieve the next record in a merged iteration within the bounds of the
 * cursor.
 * This call may be made multiple times after the cursor has been
 * initially searched with hammer_ip_first().
 *
 * 0 is returned on success, ENOENT if no further records match the
 * requested range, or some other error code is returned.
 */
int
hammer_ip_next(hammer_cursor_t cursor)
{
	hammer_btree_elm_t elm;
	hammer_record_t rec, save;
	int error;
	int r;

next_btree:
	/*
	 * Load the current on-disk and in-memory record.  If we ate any
	 * records we have to get the next one.
	 *
	 * If we deleted the last on-disk record we had scanned ATEDISK will
	 * be clear and DELBTREE will be set, forcing a call to iterate.  The
	 * fact that ATEDISK is clear causes iterate to re-test the 'current'
	 * element.  If ATEDISK is set, iterate will skip the 'current'
	 * element.
	 *
	 * Get the next on-disk record
	 */
	if (cursor->flags & (HAMMER_CURSOR_ATEDISK|HAMMER_CURSOR_DELBTREE)) {
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			error = hammer_btree_iterate(cursor);
			cursor->flags &= ~HAMMER_CURSOR_DELBTREE;
			if (error == 0)
				cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
			else
				cursor->flags |= HAMMER_CURSOR_DISKEOF |
						 HAMMER_CURSOR_ATEDISK;
		}
	}

next_memory:
	/*
	 * Get the next in-memory record.  The record can be ripped out
	 * of the RB tree so we maintain a scan_info structure to track
	 * the next node.
	 *
	 * hammer_rec_scan_cmp:  Is the record still in our general range,
	 *			 (non-inclusive of snapshot exclusions)?
	 * hammer_rec_scan_callback: Is the record in our snapshot?
	 */
	if (cursor->flags & HAMMER_CURSOR_ATEMEM) {
		if ((cursor->flags & HAMMER_CURSOR_MEMEOF) == 0) {
			save = cursor->iprec;
			cursor->iprec = NULL;
			rec = save ? hammer_rec_rb_tree_RB_NEXT(save) : NULL;
			while (rec) {
				if (hammer_rec_scan_cmp(rec, cursor) != 0)
					break;
				if (hammer_rec_scan_callback(rec, cursor) != 0)
					break;
				rec = hammer_rec_rb_tree_RB_NEXT(rec);
			}
			/* release our hold on the previous record */
			if (save)
				hammer_rel_mem_record(save);
			if (cursor->iprec) {
				/*
				 * NOTE(review): the callback appears to set
				 * cursor->iprec to the accepted record; the
				 * assertion checks they agree.
				 */
				KKASSERT(cursor->iprec == rec);
				cursor->flags &= ~HAMMER_CURSOR_ATEMEM;
#if 0
				cursor->scan.node =
					hammer_rec_rb_tree_RB_NEXT(rec);
#endif
			} else {
				cursor->flags |= HAMMER_CURSOR_MEMEOF;
			}
		}
	}

	/*
	 * Extract either the disk or memory record depending on their
	 * relative position.
	 */
	error = 0;
	switch(cursor->flags & (HAMMER_CURSOR_ATEDISK | HAMMER_CURSOR_ATEMEM)) {
	case 0:
		/*
		 * Both entries valid
		 */
		elm = &cursor->node->ondisk->elms[cursor->index];
		r = hammer_btree_cmp(&elm->base, &cursor->iprec->rec.base.base);
		if (r < 0) {
			/* disk record sorts first, return it */
			error = hammer_btree_extract(cursor,
						     HAMMER_CURSOR_GET_RECORD);
			cursor->flags |= HAMMER_CURSOR_ATEDISK;
			break;
		}

		/*
		 * If the entries match exactly the memory entry typically
		 * specifies an on-disk deletion and we eat both entries.
		 *
		 * If the in-memory record is not an on-disk deletion we
		 * probably caught the syncer while it was syncing it to
		 * the media.  Since we hold a shared lock on the cursor,
		 * the in-memory record had better be marked deleted at
		 * this point.
		 */
		if (r == 0) {
			if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
				if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0) {
					/* eat both, restart the merge */
					cursor->flags |= HAMMER_CURSOR_ATEDISK;
					cursor->flags |= HAMMER_CURSOR_ATEMEM;
					goto next_btree;
				}
			} else {
				KKASSERT(hammer_ip_iterate_mem_good(cursor, cursor->iprec) == 0);
				cursor->flags |= HAMMER_CURSOR_ATEMEM;
				goto next_memory;
			}
		}
		/* fall through to the memory entry */
	case HAMMER_CURSOR_ATEDISK:
		/*
		 * Only the memory entry is valid.  If the record is
		 * placemarking an on-disk deletion, we skip it unless
		 * the caller wants special record visibility.
		 */
		cursor->record = &cursor->iprec->rec;
		cursor->flags |= HAMMER_CURSOR_ATEMEM;
		if (cursor->iprec->type == HAMMER_MEM_RECORD_DEL) {
			if ((cursor->flags & HAMMER_CURSOR_DELETE_VISIBILITY) == 0)
				goto next_memory;
		}
		break;
	case HAMMER_CURSOR_ATEMEM:
		/*
		 * Only the disk entry is valid
		 */
		error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
		cursor->flags |= HAMMER_CURSOR_ATEDISK;
		break;
	default:
		/*
		 * Neither entry is valid
		 *
		 * XXX error not set properly
		 */
		cursor->record = NULL;
		error = ENOENT;
		break;
	}
	return(error);
}

/*
 * Resolve the cursor->data pointer for the current cursor position in
 * a merged iteration.
1319 */ 1320 int 1321 hammer_ip_resolve_data(hammer_cursor_t cursor) 1322 { 1323 int error; 1324 1325 if (cursor->iprec && cursor->record == &cursor->iprec->rec) { 1326 cursor->data = cursor->iprec->data; 1327 error = 0; 1328 } else { 1329 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA); 1330 } 1331 return(error); 1332 } 1333 1334 int 1335 hammer_ip_resolve_record_and_data(hammer_cursor_t cursor) 1336 { 1337 int error; 1338 1339 if (cursor->iprec && cursor->record == &cursor->iprec->rec) { 1340 cursor->data = cursor->iprec->data; 1341 error = 0; 1342 } else { 1343 error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_DATA | 1344 HAMMER_CURSOR_GET_RECORD); 1345 } 1346 return(error); 1347 } 1348 1349 /* 1350 * Delete all records within the specified range for inode ip. 1351 * 1352 * NOTE: An unaligned range will cause new records to be added to cover 1353 * the edge cases. (XXX not implemented yet). 1354 * 1355 * NOTE: ran_end is inclusive (e.g. 0,1023 instead of 0,1024). 1356 * 1357 * NOTE: Record keys for regular file data have to be special-cased since 1358 * they indicate the end of the range (key = base + bytes). 
 */
int
hammer_ip_delete_range(hammer_transaction_t trans, hammer_inode_t ip,
		       int64_t ran_beg, int64_t ran_end)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int64_t off;

#if 0
	kprintf("delete_range %p %016llx-%016llx\n", ip, ran_beg, ran_end);
#endif

	/* backend-only operation: must run from a flush transaction */
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		/* DB records are keyed directly on the range */
		cursor.key_beg.key = ran_beg;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = ran_end;
	} else {
		/*
		 * The key in the B-Tree is (base+bytes), so the first possible
		 * matching key is ran_beg + 1.
		 */
		int64_t tmp64;

		cursor.key_beg.key = ran_beg + 1;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;

		/* clamp to the max key if ran_end + MAXPHYS + 1 overflows */
		tmp64 = ran_end + MAXPHYS + 1;	/* work around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * There may be overlap cases for regular file data.  Also
		 * remember the key for a regular file record is the offset
		 * of the last byte of the record (base + len - 1), NOT the
		 * base offset.
		 */
#if 0
		kprintf("delete_range rec_type %02x\n", base->rec_type);
#endif
		if (base->rec_type == HAMMER_RECTYPE_DATA) {
#if 0
			kprintf("delete_range loop key %016llx,%d\n",
				base->key - rec->base.data_len,
				rec->base.data_len);
#endif
			off = base->key - rec->base.data_len;
			/*
			 * Check the left edge case.  We currently do not
			 * split existing records.
			 */
			if (off < ran_beg) {
				panic("hammer left edge case %016llx %d\n",
				      base->key, rec->base.data_len);
			}

			/*
			 * Check the right edge case.  Note that the
			 * record can be completely out of bounds, which
			 * terminates the search.
			 *
			 * base->key is exclusive of the right edge while
			 * ran_end is inclusive of the right edge.  The
			 * (key - data_len) left boundary is inclusive.
			 *
			 * XXX theory-check this test at some point, are
			 * we missing a + 1 somewhere?  Note that ran_end
			 * could overflow.
			 */
			if (base->key - 1 > ran_end) {
				if (base->key - rec->base.data_len > ran_end)
					break;
				panic("hammer right edge case\n");
			}
		}

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 */
		error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Delete all user records associated with an inode except the inode record
 * itself.
 * Directory entries are not deleted (they must be properly disposed
 * of or nlinks would get upset).
 */
int
hammer_ip_delete_range_all(hammer_transaction_t trans, hammer_inode_t ip)
{
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;

	/* backend-only operation: must run from a flush transaction */
	KKASSERT(trans->type == HAMMER_TRANS_FLS);
retry:
	hammer_init_cursor(trans, &cursor, &ip->cache[0]);

	/* scan every record type after the inode record itself */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1;
	cursor.key_beg.key = HAMMER_MIN_KEY;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.rec_type = 0xFFFF;
	cursor.key_end.key = HAMMER_MAX_KEY;

	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
	cursor.flags |= HAMMER_CURSOR_DELETE_VISIBILITY;
	cursor.flags |= HAMMER_CURSOR_BACKEND;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate through matching records and mark them as deleted.
	 */
	while (error == 0) {
		rec = cursor.record;
		base = &rec->base.base;

		KKASSERT(base->delete_tid == 0);

		/*
		 * Mark the record and B-Tree entry as deleted.  This will
		 * also physically delete the B-Tree entry, record, and
		 * data if the retention policy dictates.  The function
		 * will set HAMMER_CURSOR_DELBTREE which hammer_ip_next()
		 * uses to perform a fixup.
		 *
		 * Directory entries (and delete-on-disk directory entries)
		 * must be synced and cannot be deleted.
		 */
		if (rec->base.base.rec_type != HAMMER_RECTYPE_DIRENTRY)
			error = hammer_ip_delete_record(&cursor, trans->tid);
		if (error)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);
	if (error == EDEADLK)
		goto retry;
	if (error == ENOENT)
		error = 0;
	return(error);
}

/*
 * Delete the record at the current cursor.  On success the cursor will
 * be positioned appropriately for an iteration but may no longer be at
 * a leaf node.
 *
 * This routine is only called from the backend.
 *
 * NOTE: This can return EDEADLK, requiring the caller to terminate the
 * cursor and retry.
 */
int
hammer_ip_delete_record(hammer_cursor_t cursor, hammer_tid_t tid)
{
	hammer_btree_elm_t elm;
	hammer_mount_t hmp;
	int error;
	int dodelete;

	KKASSERT(cursor->flags & HAMMER_CURSOR_BACKEND);

	/*
	 * In-memory (unsynchronized) records can simply be freed.  This
	 * only occurs in range iterations since all other records are
	 * individually synchronized.  Thus there should be no confusion with
	 * the interlock.
	 */
	if (cursor->record == &cursor->iprec->rec) {
		KKASSERT((cursor->iprec->flags & HAMMER_RECF_INTERLOCK_BE) ==0);
		cursor->iprec->flags |= HAMMER_RECF_DELETED_FE;
		cursor->iprec->flags |= HAMMER_RECF_DELETED_BE;
		return(0);
	}

	/*
	 * On-disk records are marked as deleted by updating their delete_tid.
	 * This does not effect their position in the B-Tree (which is based
	 * on their create_tid).
	 */
	error = hammer_btree_extract(cursor, HAMMER_CURSOR_GET_RECORD);
	elm = NULL;
	hmp = cursor->node->hmp;

	dodelete = 0;
	if (error == 0) {
		error = hammer_cursor_upgrade(cursor);
		if (error == 0) {
			/* stamp the B-Tree element with the delete tid */
			elm = &cursor->node->ondisk->elms[cursor->index];
			hammer_modify_node(cursor->trans, cursor->node,
					   &elm->leaf.base.delete_tid,
					   sizeof(elm->leaf.base.delete_tid));
			elm->leaf.base.delete_tid = tid;
			hammer_modify_node_done(cursor->node);

			/*
			 * An on-disk record cannot have the same delete_tid
			 * as its create_tid.  In a chain of record updates
			 * this could result in a duplicate record.
			 */
			KKASSERT(elm->leaf.base.delete_tid != elm->leaf.base.create_tid);
			/* mirror the delete_tid into the record itself */
			hammer_modify_buffer(cursor->trans, cursor->record_buffer, &cursor->record->base.base.delete_tid, sizeof(hammer_tid_t));
			cursor->record->base.base.delete_tid = tid;
			hammer_modify_buffer_done(cursor->record_buffer);
		}
	}

	/*
	 * If we were mounted with the nohistory option, we physically
	 * delete the record.
	 */
	if (hmp->hflags & HMNT_NOHISTORY)
		dodelete = 1;

	if (error == 0 && dodelete) {
		error = hammer_delete_at_cursor(cursor, NULL);
		if (error) {
			panic("hammer_ip_delete_record: unable to physically delete the record!\n");
			error = 0;
		}
	}
	return(error);
}

/*
 * Physically delete the B-Tree element at the cursor and free the
 * blockmap storage backing the record and, for small/large data zones,
 * its data.  On success hammer_ip_next() is fixed up via DELBTREE.
 *
 * NOTE(review): stat_bytes is accepted but never referenced in this
 * implementation — presumably reserved for statistics reporting.
 */
int
hammer_delete_at_cursor(hammer_cursor_t cursor, int64_t *stat_bytes)
{
	hammer_btree_elm_t elm;
	hammer_off_t rec_offset;
	hammer_off_t data_offset;
	int32_t data_len;
	u_int16_t rec_type;
	int error;

	elm = &cursor->node->ondisk->elms[cursor->index];
	KKASSERT(elm->base.btype == HAMMER_BTREE_TYPE_RECORD);

	/* capture locations before the element is destroyed */
	rec_offset = elm->leaf.rec_offset;
	data_offset = elm->leaf.data_offset;
	data_len = elm->leaf.data_len;
	rec_type = elm->leaf.base.rec_type;

	error = hammer_btree_delete(cursor);
	if (error == 0) {
		/*
		 * This forces a fixup for the iteration because
		 * the cursor is now either sitting at the 'next'
		 * element or sitting at the end of a leaf.
		 */
		if ((cursor->flags & HAMMER_CURSOR_DISKEOF) == 0) {
			cursor->flags |= HAMMER_CURSOR_DELBTREE;
			cursor->flags &= ~HAMMER_CURSOR_ATEDISK;
		}
	}
	if (error == 0) {
		hammer_blockmap_free(cursor->trans, rec_offset,
				     sizeof(union hammer_record_ondisk));
	}
	if (error == 0) {
		/* only data in the data zones is blockmap-backed */
		switch(data_offset & HAMMER_OFF_ZONE_MASK) {
		case HAMMER_ZONE_LARGE_DATA:
		case HAMMER_ZONE_SMALL_DATA:
			hammer_blockmap_free(cursor->trans,
					     data_offset, data_len);
			break;
		default:
			break;
		}
	}
#if 0
	kprintf("hammer_delete_at_cursor: %d:%d:%08x %08x/%d "
		"(%d remain in cluster)\n",
		cluster->volume->vol_no, cluster->clu_no,
		rec_offset, data_offset, data_len,
		cluster->ondisk->stat_records);
#endif
	return (error);
}

/*
 * Determine whether we can remove a directory.
This routine checks whether 1694 * a directory is empty or not and enforces flush connectivity. 1695 * 1696 * Flush connectivity requires that we block if the target directory is 1697 * currently flushing, otherwise it may not end up in the same flush group. 1698 * 1699 * Returns 0 on success, ENOTEMPTY or EDEADLK (or other errors) on failure. 1700 */ 1701 int 1702 hammer_ip_check_directory_empty(hammer_transaction_t trans, 1703 hammer_cursor_t parent_cursor, hammer_inode_t ip) 1704 { 1705 struct hammer_cursor cursor; 1706 int error; 1707 1708 #if 0 1709 /* 1710 * Check flush connectivity 1711 */ 1712 if (ip->flush_state != HAMMER_FST_IDLE) { 1713 kprintf("FWAIT\n"); 1714 hammer_done_cursor(parent_cursor); 1715 hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL); 1716 hammer_wait_inode(ip); 1717 return (EDEADLK); 1718 } 1719 #endif 1720 1721 /* 1722 * Check directory empty 1723 */ 1724 hammer_init_cursor(trans, &cursor, &ip->cache[0]); 1725 1726 cursor.key_beg.obj_id = ip->obj_id; 1727 cursor.key_beg.create_tid = 0; 1728 cursor.key_beg.delete_tid = 0; 1729 cursor.key_beg.obj_type = 0; 1730 cursor.key_beg.rec_type = HAMMER_RECTYPE_INODE + 1; 1731 cursor.key_beg.key = HAMMER_MIN_KEY; 1732 1733 cursor.key_end = cursor.key_beg; 1734 cursor.key_end.rec_type = 0xFFFF; 1735 cursor.key_end.key = HAMMER_MAX_KEY; 1736 1737 cursor.asof = ip->obj_asof; 1738 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF; 1739 1740 error = hammer_ip_first(&cursor, ip); 1741 if (error == ENOENT) 1742 error = 0; 1743 else if (error == 0) 1744 error = ENOTEMPTY; 1745 hammer_done_cursor(&cursor); 1746 return(error); 1747 } 1748 1749