1 /* 2 * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 /* 36 * This subsystem handles direct and indirect block searches, recursions, 37 * creation, and deletion. 
 * Chains of blockrefs are tracked and modifications
 * are flagged for propagation... eventually all the way back to the volume
 * header.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/lock.h>
#include <sys/uuid.h>

#include "hammer2.h"

static int hammer2_indirect_optimize;	/* XXX SYSCTL */

static hammer2_chain_t *hammer2_chain_create_indirect(
		hammer2_mount_t *hmp, hammer2_chain_t *parent,
		hammer2_key_t key, int keybits);

/*
 * We use a red-black tree to guarantee safe lookups under shared locks.
 */
RB_GENERATE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp);

/*
 * RB-tree comparison function ordering chains by their index within the
 * parent's block table.  Note the reversed operand order: the result is
 * positive when chain1->index is less than chain2->index.
 */
int
hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
{
	return(chain2->index - chain1->index);
}

/*
 * Recursively mark the parent chain elements so flushes can find
 * modified elements.  Stop when we hit a chain already flagged
 * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
 * in chain itself.
 *
 * SUBMODIFIED is not set on the chain passed in.
 *
 * XXX rename of parent can create a SMP race
 */
static void
hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * Walk upward; an already-flagged parent implies all of its
	 * ancestors are flagged too, so we can stop there.
	 */
	parent = chain->parent;
	while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
		atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
		parent = parent->parent;
	}
}

/*
 * Allocate a new disconnected chain element representing the specified
 * bref.  The chain element is locked exclusively and refs is set to 1.
 *
 * This essentially allocates a system memory structure representing one
 * of the media structure types, including inodes.
 */
hammer2_chain_t *
hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
{
	hammer2_chain_t *chain;
	hammer2_inode_t *ip;
	hammer2_indblock_t *np;
	hammer2_data_t *dp;
	/* physical size is encoded as a radix in the low bits of data_off */
	u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);

	/*
	 * Construct the appropriate system structure.  The chain element
	 * is embedded in the type-specific structure.
	 */
	switch(bref->type) {
	case HAMMER2_BREF_TYPE_INODE:
		ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
		chain = &ip->chain;
		chain->u.ip = ip;
		ip->hmp = hmp;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
		chain = &np->chain;
		chain->u.np = np;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
		chain = &dp->chain;
		chain->u.dp = dp;
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		chain = NULL;
		panic("hammer2_chain_alloc volume type illegal for op");
		/* NOT REACHED */
	default:
		chain = NULL;
		panic("hammer2_chain_alloc: unrecognized blockref type: %d",
		      bref->type);
		/* NOT REACHED */
	}

	/*
	 * Only set bref_flush if the bref has a real media offset, otherwise
	 * the caller has to wait for the chain to be modified/block-allocated
	 * before a blockref can be synchronized with its (future) parent.
	 */
	chain->bref = *bref;
	if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
		chain->bref_flush = *bref;
	chain->index = -1;		/* not yet assigned */
	chain->refs = 1;
	chain->bytes = bytes;
	ccms_cst_init(&chain->cst, chain);
	ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	return (chain);
}

/*
 * Deallocate a chain (the step before freeing it).  Remove the chain from
 * its parent's tree.
 *
 * Caller must hold the parent and the chain exclusively locked, and
 * chain->refs must be 0.
 *
 * This function unlocks, removes, and destroys chain, and will recursively
 * destroy any sub-chains under chain (whose refs must also be 0 at this
 * point).
 *
 * parent can be NULL.
 */
static void
hammer2_chain_dealloc(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *child;

	KKASSERT(chain->refs == 0);
	KKASSERT((chain->flags &
		  (HAMMER2_CHAIN_MOVED | HAMMER2_CHAIN_MODIFIED)) == 0);

	/*
	 * Disconnect from the parent before recursing so the tree is
	 * torn down top-down.
	 */
	parent = chain->parent;
	chain->parent = NULL;
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
		ip = chain->u.ip;
	else
		ip = NULL;

	/*
	 * If the sub-tree is not empty all the elements on it must have
	 * 0 refs and be deallocatable.
	 */
	while ((child = RB_ROOT(&chain->rbhead)) != NULL) {
		ccms_thread_lock(&child->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, child);
	}

	/*
	 * If the DELETED flag is not set the chain must be removed from
	 * its parent's tree.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DELETED) == 0) {
		RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
		if (ip)
			ip->pip = NULL;
	}

	/*
	 * When cleaning out a hammer2_inode we must
	 * also clean out the related ccms_inode.
	 */
	if (ip)
		ccms_cst_uninit(&ip->topo_cst);
	hammer2_chain_free(hmp, chain);
}

/*
 * Free a disconnected chain element.  The chain must already be unlinked
 * (see hammer2_chain_dealloc()); this releases the backing memory.
 */
void
hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	void *mem;

	/*
	 * Embedded data (volume header, inode) is never separately
	 * allocated; just forget the pointer.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
	    chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
		chain->data = NULL;
	}

	KKASSERT(chain->bp == NULL);
	KKASSERT(chain->data == NULL);
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
		 chain->u.ip->vp == NULL);
	ccms_thread_unlock(&chain->cst);
	KKASSERT(chain->cst.count == 0);
	KKASSERT(chain->cst.upgrade == 0);

	/*
	 * The chain is embedded in the type-specific structure; free the
	 * containing structure from the appropriate malloc zone.
	 */
	if ((mem = chain->u.mem) != NULL) {
		chain->u.mem = NULL;
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
			kfree(mem, hmp->minode);
		else
			kfree(mem, hmp->mchain);
	}
}

/*
 * Add a reference to a chain element, preventing its destruction.
 *
 * The parent chain must be locked shared or exclusive or otherwise be
 * stable and already have a reference.
 */
void
hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		KKASSERT(chain->refs >= 0);
		cpu_ccfence();
		if (refs == 0) {
			/*
			 * 0 -> 1 transition must bump the refs on the parent
			 * too.  The caller has stabilized the parent.
			 */
			if (atomic_cmpset_int(&chain->refs, 0, 1)) {
				chain = chain->parent;
				KKASSERT(chain == NULL || chain->refs > 0);
			}
			/* retry or continue along the parent chain */
		} else {
			/*
			 * N -> N+1
			 */
			if (atomic_cmpset_int(&chain->refs, refs, refs + 1))
				break;
			/* retry */
		}
	}
}

/*
 * Drop the callers reference to the chain element.  If the ref count
 * reaches zero we attempt to recursively drop the parent.
 *
 * MOVED and MODIFIED elements hold additional references so it should not
 * be possible for the count on a modified element to drop to 0.
 *
 * The chain element must NOT be locked by the caller on the 1->0 transition.
 *
 * The parent might or might not be locked by the caller.  If we are unable
 * to lock the parent on the 1->0 transition the destruction of the chain
 * will be deferred but we still recurse upward and drop the ref on the
 * parent (see the lastdrop() function)
 */
static hammer2_chain_t *hammer2_chain_lastdrop(hammer2_mount_t *hmp,
					       hammer2_chain_t *chain);

void
hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	u_int refs;

	while (chain) {
		refs = chain->refs;
		cpu_ccfence();
		KKASSERT(refs > 0);
		if (refs == 1) {
			/*
			 * (1) lastdrop successfully drops the chain and
			 *     returns the parent, we recursively drop the
			 *     parent.
			 *
			 * (2) lastdrop fails to transition refs from 1 to 0
			 *     and returns the same chain, we retry.
			 *
			 * (3) lastdrop fails to drop the chain and returns
			 *     NULL, leaving the ref intact for a deferred
			 *     drop later on.
			 */
			chain = hammer2_chain_lastdrop(hmp, chain);
		} else {
			if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
				/*
				 * Succeeded, count did not reach zero so
				 * cut out of the loop.
				 */
				break;
			}
			/* retry the same chain */
		}
	}
}

/*
 * On the last drop we have to stabilize chain->parent, which we can do
 * by acquiring the chain->cst.spin lock.  If we get a full-blown lock
 * it messes up the chain_unlock() code's ccms_thread_unlock_zero() call.
 *
 * Once the spinlock has been obtained we can drop the refs and become the
 * owner of the implied ref on the parent, allowing us to return the parent.
 */
static
hammer2_chain_t *
hammer2_chain_lastdrop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	hammer2_chain_t *parent;

	/*
	 * gain lock, drop refs, return chain to retry if we were unable
	 * to drop the refs from 1 to 0.
	 */
	spin_lock(&chain->cst.spin);
	if (atomic_cmpset_int(&chain->refs, 1, 0) == 0) {
		spin_unlock(&chain->cst.spin);
		return (chain);
	}

	/*
	 * Refs is 0 and we own the implied ref on the parent.  The
	 * chain can still be accessed at this point but any cycling
	 * of its refs will simply build-up more implied refs on the
	 * parent.
	 *
	 * Thus the parent pointer is valid.
	 */
	parent = chain->parent;
	spin_unlock(&chain->cst.spin);

	/*
	 * Attempt to acquire an exclusive lock on the parent.  If this
	 * fails we just leave chain alone but still return the parent
	 * for the drop recursion.
	 */
	if (parent &&
	    ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE)) {
		return (parent);
	}

	/*
	 * With an exclusive lock on the parent in-hand if chain->refs is
	 * still 0 then it's impossible for anyone new to access it (or any
	 * of its children), and it can be deallocated.
	 */
	if (chain->refs == 0) {
		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);
		hammer2_chain_dealloc(hmp, chain);
	}

	/*
	 * drop recursion, return parent so the caller can eat the implied
	 * ref we own on it.  We have to use hammer2_chain_unlock() (which
	 * also does a drop so we also need a ref on parent).
	 */
	if (parent) {
		hammer2_chain_ref(hmp, parent);
		hammer2_chain_unlock(hmp, parent);
	}
	return (parent);
}

/*
 * Ref and lock a chain element, acquiring its data with I/O if necessary,
 * and specify how you would like the data to be resolved.
 *
 * Returns 0 on success or an error code if the data could not be acquired.
 * The chain element is locked either way.
 *
 * The lock is allowed to recurse, multiple locking ops will aggregate
 * the requested resolve types.  Once data is assigned it will not be
 * removed until the last unlock.
 *
 * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
 *			   the INITIAL-create state (indirect blocks only).
 *
 *			   Do not resolve data elements for DATA chains.
 *			   (typically used to avoid device/logical buffer
 *			    aliasing for data)
 *
 * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
 *
 * HAMMER2_RESOLVE_SHARED- (flag) The chain is locked shared, otherwise
 *			   it will be locked exclusive.
 *
 * NOTE: Embedded elements (volume header, inodes) are always resolved
 *	 regardless.
 *
 * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
 *	 element will instantiate and zero its buffer, and flush it on
 *	 release.
 *
 * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
 *	 so as not to instantiate a device buffer, which could alias against
 *	 a logical file buffer.  However, if ALWAYS is specified the
 *	 device buffer will be instantiated anyway.
 */
int
hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
{
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	hammer2_off_t peof;
	ccms_state_t ostate;
	size_t boff;
	size_t bbytes;
	int error;
	char *bdata;

	/*
	 * Ref and lock the element.  Recursive locks are allowed.
	 */
	hammer2_chain_ref(hmp, chain);
	if (how & HAMMER2_RESOLVE_SHARED)
		ccms_thread_lock(&chain->cst, CCMS_STATE_SHARED);
	else
		ccms_thread_lock(&chain->cst, CCMS_STATE_EXCLUSIVE);

	/*
	 * If we already have a valid data pointer no further action is
	 * necessary.
	 */
	if (chain->data)
		return (0);

	/*
	 * Do we have to resolve the data?
	 */
	switch(how & HAMMER2_RESOLVE_MASK) {
	case HAMMER2_RESOLVE_NEVER:
		return(0);
	case HAMMER2_RESOLVE_MAYBE:
		if (chain->flags & HAMMER2_CHAIN_INITIAL)
			return(0);
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
			return(0);
		/* fall through */
	case HAMMER2_RESOLVE_ALWAYS:
		break;
	}

	/*
	 * Upgrade to an exclusive lock so we can safely manipulate the
	 * buffer cache.  If another thread got to it before us we
	 * can just return.
	 */
	ostate = ccms_thread_lock_upgrade(&chain->cst);
	if (chain->data) {
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (0);
	}

	/*
	 * We must resolve to a device buffer, either by issuing I/O or
	 * by creating a zero-fill element.  We do not mark the buffer
	 * dirty when creating a zero-fill element (the hammer2_chain_modify()
	 * API must still be used to do that).
	 *
	 * The device buffer is variable-sized in powers of 2 down
	 * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
	 * chunk always contains buffers of the same size. (XXX)
	 *
	 * The minimum physical IO size may be larger than the variable
	 * block size.
	 */
	bref = &chain->bref;

	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
	peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
	boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
	KKASSERT(pbase != 0);

	/*
	 * The getblk() optimization can only be used on newly created
	 * elements if the physical block size matches the request.
	 */
	if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
	    chain->bytes == bbytes) {
		chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
		error = 0;
	} else if (hammer2_cluster_enable) {
		error = cluster_read(hmp->devvp, peof, pbase, bbytes,
				     HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
				     &chain->bp);
	} else {
		error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
	}

	if (error) {
		kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
			(intmax_t)pbase, error);
		bqrelse(chain->bp);
		chain->bp = NULL;
		ccms_thread_lock_restore(&chain->cst, ostate);
		return (error);
	}

	/*
	 * Zero the data area if the chain is in the INITIAL-create state.
	 * Mark the buffer for bdwrite().
	 */
	bdata = (char *)chain->bp->b_data + boff;
	if (chain->flags & HAMMER2_CHAIN_INITIAL) {
		bzero(bdata, chain->bytes);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	}

	/*
	 * Setup the data pointer, either pointing it to an embedded data
	 * structure and copying the data from the buffer, or pointing it
	 * into the buffer.
	 *
	 * The buffer is not retained when copying to an embedded data
	 * structure in order to avoid potential deadlocks or recursions
	 * on the same physical buffer.
	 */
	switch (bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * Copy data from bp to embedded buffer
		 */
		panic("hammer2_chain_lock: called on unresolved volume header");
#if 0
		/* NOT YET */
		KKASSERT(pbase == 0);
		KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
		bcopy(bdata, &hmp->voldata, chain->bytes);
		chain->data = (void *)&hmp->voldata;
		bqrelse(chain->bp);
		chain->bp = NULL;
#endif
		break;
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Copy data from bp to embedded buffer, do not retain the
		 * device buffer.
		 */
		bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
		chain->data = (void *)&chain->u.ip->ip_data;
		bqrelse(chain->bp);
		chain->bp = NULL;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
	case HAMMER2_BREF_TYPE_DATA:
	default:
		/*
		 * Point data at the device buffer and leave bp intact.
		 */
		chain->data = (void *)bdata;
		break;
	}

	/*
	 * Make sure the bp is not specifically owned by this thread before
	 * restoring to a possibly shared lock, so another hammer2 thread
	 * can release it.
	 */
	if (chain->bp)
		BUF_KERNPROC(chain->bp);
	ccms_thread_lock_restore(&chain->cst, ostate);
	return (0);
}

/*
 * Unlock and deref a chain element.
 *
 * On the last lock release any non-embedded data (chain->bp) will be
 * retired.
 */
void
hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
	long *counterp;

	/*
	 * Release the CST lock but with a special 1->0 transition case.
	 *
	 * Returns non-zero if lock references remain.  When zero is
	 * returned the last lock reference is retained and any shared
	 * lock is upgraded to an exclusive lock for final disposition.
	 */
	if (ccms_thread_unlock_zero(&chain->cst)) {
		KKASSERT(chain->refs > 1);
		atomic_add_int(&chain->refs, -1);
		return;
	}

	/*
	 * Shortcut the case if the data is embedded or not resolved.
	 *
	 * Do NOT null-out pointers to embedded data (e.g. inode).
	 *
	 * The DIRTYBP flag is non-applicable in this situation and can
	 * be cleared to keep the flags state clean.
	 */
	if (chain->bp == NULL) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		ccms_thread_unlock(&chain->cst);
		hammer2_chain_drop(hmp, chain);
		return;
	}

	/*
	 * Statistics - bump the appropriate write counter based on the
	 * bref type and whether this is an IOFLUSH (async) write.
	 */
	if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
		;
	} else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_ioa_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_ioa_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_ioa_indr_write;
			break;
		default:
			counterp = &hammer2_ioa_volu_write;
			break;
		}
		++*counterp;
	} else {
		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_DATA:
			counterp = &hammer2_iod_file_write;
			break;
		case HAMMER2_BREF_TYPE_INODE:
			counterp = &hammer2_iod_meta_write;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			counterp = &hammer2_iod_indr_write;
			break;
		default:
			counterp = &hammer2_iod_volu_write;
			break;
		}
		++*counterp;
	}

	/*
	 * Clean out the bp.
	 *
	 * If a device buffer was used for data be sure to destroy the
	 * buffer when we are done to avoid aliases (XXX what about the
	 * underlying VM pages?).
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
		chain->bp->b_flags |= B_RELBUF;

	/*
	 * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
	 * or not.  The flag will get re-set when chain_modify() is called,
	 * even if MODIFIED is already set, allowing the OS to retire the
	 * buffer independent of a hammer2 flush.
	 */
	chain->data = NULL;
	if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			cluster_awrite(chain->bp);
		} else {
			chain->bp->b_flags |= B_CLUSTEROK;
			bdwrite(chain->bp);
		}
	} else {
		if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_IOFLUSH);
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		} else {
			/* bp might still be dirty */
			bqrelse(chain->bp);
		}
	}
	chain->bp = NULL;
	ccms_thread_unlock(&chain->cst);
	hammer2_chain_drop(hmp, chain);
}

/*
 * Resize the chain's physical storage allocation.  Chains can be resized
 * smaller without reallocating the storage.  Resizing larger will reallocate
 * the storage.
 *
 * Must be passed a locked chain.
 *
 * If you want the resize code to copy the data to the new block then the
 * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
 *
 * If the caller already holds a logical buffer containing the data and
 * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
 * operation will then not copy the data.
 *
 * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
 * to avoid instantiating a device buffer that conflicts with the vnode
 * data buffer.
 *
 * XXX flags currently ignored, uses chain->bp to detect data/no-data.
 */
void
hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
		     int nradix, int flags)
{
	hammer2_mount_t *hmp = ip->hmp;
	struct buf *nbp;
	hammer2_off_t pbase;
	size_t obytes;
	size_t nbytes;
	size_t bbytes;
	int boff;
	char *bdata;
	int error;

	/*
	 * Only data and indirect blocks can be resized for now
	 */
	KKASSERT(chain != &hmp->vchain);
	KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
		 chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);

	/*
	 * Nothing to do if the element is already the proper size
	 * (nradix is the log2 of the new size).
	 */
	obytes = chain->bytes;
	nbytes = 1U << nradix;
	if (obytes == nbytes)
		return;

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 *
	 * If the chain is already marked MODIFIED then we can safely
	 * return the previous allocation to the pool without having to
	 * worry about snapshots.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					      HAMMER2_CHAIN_MODIFY_TID);
		hammer2_chain_ref(hmp, chain);
	} else {
		hammer2_freemap_free(hmp, chain->bref.data_off,
				     chain->bref.type);
	}

	/*
	 * Relocate the block, even if making it smaller (because different
	 * block sizes may be in different regions).
	 */
	chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
						     nbytes);
	chain->bytes = nbytes;
	ip->delta_dcount += (ssize_t)(nbytes - obytes);	/* XXX atomic */

	/*
	 * The device buffer may be larger than the allocation size.
	 */
	if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
		bbytes = HAMMER2_MINIOSIZE;
	pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
	boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

	/*
	 * Only copy the data if resolved, otherwise the caller is
	 * responsible.
	 */
	if (chain->bp) {
		KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
			 chain->bref.type == HAMMER2_BREF_TYPE_DATA);
		KKASSERT(chain != &hmp->vchain);	/* safety */

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (nbytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy to the new buffer, zero-filling any growth.
		 */
		if (nbytes < obytes) {
			bcopy(chain->data, bdata, nbytes);
		} else {
			bcopy(chain->data, bdata, obytes);
			bzero(bdata + obytes, nbytes - obytes);
		}

		/*
		 * NOTE: The INITIAL state of the chain is left intact.
		 *	 We depend on hammer2_chain_modify() to do the
		 *	 right thing.
		 *
		 * NOTE: We set B_NOCACHE to throw away the previous bp and
		 *	 any VM backing store, even if it was dirty.
		 *	 Otherwise we run the risk of a logical/device
		 *	 conflict on reallocation.
		 */
		chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
		brelse(chain->bp);
		chain->bp = nbp;
		chain->data = (void *)bdata;
		hammer2_chain_modify(hmp, chain, 0);
	}

	/*
	 * Make sure the chain is marked MOVED and SUBMOD is set in the
	 * parent(s) so the adjustments are picked up by flush.
	 */
	if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
		hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	hammer2_chain_parent_setsubmod(hmp, chain);
}

/*
 * Convert a locked chain that was retrieved read-only to read-write.
 *
 * If not already marked modified a new physical block will be allocated
 * and assigned to the bref.
 *
 * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
 *		     level or the COW operation will not work.
 *
 * Data blocks - The chain is usually locked RESOLVE_NEVER so as not to
 *		 run the data through the device buffers.
 */
void
hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
{
	struct buf *nbp;
	int error;
	hammer2_off_t pbase;
	size_t bbytes;
	size_t boff;
	void *bdata;

	/*
	 * Tells flush that modify_tid must be updated, otherwise only
	 * mirror_tid is updated.  This is the default.
	 */
	if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);

	/*
	 * If the chain is already marked MODIFIED we can just return.
	 *
	 * However, it is possible that a prior lock/modify sequence
	 * retired the buffer.  During this lock/modify sequence MODIFIED
	 * may still be set but the buffer could wind up clean.  Since
	 * the caller is going to modify the buffer further we have to
	 * be sure that DIRTYBP is set again.
	 */
	if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
		if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
		    chain->bp == NULL) {
			goto skip1;
		}
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
		return;
	}

	/*
	 * Set MODIFIED and add a chain ref to prevent destruction.  Both
	 * modified flags share the same ref.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
	hammer2_chain_ref(hmp, chain);

	/*
	 * We must allocate the copy-on-write block.
	 *
	 * If the data is embedded no other action is required.
	 *
	 * If the data is not embedded we acquire and clear the
	 * new block.  If chain->data is not NULL we then do the
	 * copy-on-write.  chain->data will then be repointed to the new
	 * buffer and the old buffer will be released.
	 *
	 * For newly created elements with no prior allocation we go
	 * through the copy-on-write steps except without the copying part.
	 */
	if (chain != &hmp->vchain) {
		if ((hammer2_debug & 0x0001) &&
		    (chain->bref.data_off & HAMMER2_OFF_MASK)) {
			kprintf("Replace %d\n", chain->bytes);
		}
		chain->bref.data_off =
			hammer2_freemap_alloc(hmp, chain->bref.type,
					      chain->bytes);
		/* XXX failed allocation */
	}

	/*
	 * If data instantiation is optional and the chain has no current
	 * data association (typical for DATA and newly-created INDIRECT
	 * elements), don't instantiate the buffer now.
	 */
	if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
		goto skip2;

skip1:
	/*
	 * Setting the DIRTYBP flag will cause the buffer to be dirtied or
	 * written-out on unlock.  This bit is independent of the MODIFIED
	 * bit because the chain may still need meta-data adjustments done
	 * by virtue of MODIFIED for its parent, and the buffer can be
	 * flushed out (possibly multiple times) by the OS before that.
	 *
	 * Clearing the INITIAL flag (for indirect blocks) indicates that
	 * a zero-fill buffer has been instantiated.
	 */
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

	/*
	 * We currently should never instantiate a device buffer for a
	 * data chain.
	 */
	KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);

	/*
	 * Execute COW operation
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * The data is embedded, no copy-on-write operation is
		 * needed.
		 */
		KKASSERT(chain->bp == NULL);
		break;
	case HAMMER2_BREF_TYPE_DATA:
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Perform the copy-on-write operation
		 */
		KKASSERT(chain != &hmp->vchain);	/* safety */
		/*
		 * The device buffer may be larger than the allocation size.
		 */
		if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
			bbytes = HAMMER2_MINIOSIZE;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		/*
		 * The getblk() optimization can only be used if the
		 * physical block size matches the request.
		 */
		if (chain->bytes == bbytes) {
			nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
			error = 0;
		} else {
			error = bread(hmp->devvp, pbase, bbytes, &nbp);
			KKASSERT(error == 0);
		}
		bdata = (char *)nbp->b_data + boff;

		/*
		 * Copy or zero-fill on write depending on whether
		 * chain->data exists or not.
		 */
		if (chain->data) {
			bcopy(chain->data, bdata, chain->bytes);
			KKASSERT(chain->bp != NULL);
		} else {
			bzero(bdata, chain->bytes);
		}
		if (chain->bp) {
			chain->bp->b_flags |= B_RELBUF;
			brelse(chain->bp);
		}
		chain->bp = nbp;
		chain->data = bdata;
		break;
	default:
		panic("hammer2_chain_modify: illegal non-embedded type %d",
		      chain->bref.type);
		break;

	}
skip2:
	if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
		hammer2_chain_parent_setsubmod(hmp, chain);
}

/*
 * Mark the volume as having been modified.  This short-cut version
 * does not have to lock the volume's chain, which allows the ioctl
 * code to make adjustments to connections without deadlocking.
 */
void
hammer2_modify_volume(hammer2_mount_t *hmp)
{
	hammer2_voldata_lock(hmp);
	atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
	hammer2_voldata_unlock(hmp);
}

/*
 * Locate an in-memory chain.  The parent must be locked.  The in-memory
 * chain is returned or NULL if no in-memory chain is present.
 *
 * NOTE: A chain on-media might exist for this index when NULL is returned.
 */
hammer2_chain_t *
hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
{
	hammer2_chain_t dummy;
	hammer2_chain_t *chain;

	/* keyed RB lookup via an on-stack dummy chain, no I/O is issued */
	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	return (chain);
}

/*
 * Return a locked chain structure with all associated data acquired.
 *
 * Caller must lock the parent on call, the returned child will be locked.
 */
hammer2_chain_t *
hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		  int index, int flags)
{
	hammer2_blockref_t *bref;
	hammer2_inode_t *ip;
	hammer2_chain_t *chain;
	hammer2_chain_t dummy;
	int how;
	ccms_state_t ostate;

	/*
	 * Figure out how to lock.  MAYBE can be used to optimized
	 * the initial-create state for indirect blocks.
	 */
	if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
		how = HAMMER2_RESOLVE_NEVER;
	else
		how = HAMMER2_RESOLVE_MAYBE;
	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how |= HAMMER2_RESOLVE_SHARED;

	/*
	 * First see if we have a (possibly modified) chain element cached
	 * for this (parent, index).  Acquire the data if necessary.
	 *
	 * If chain->data is non-NULL the chain should already be marked
	 * modified.
	 */
	dummy.index = index;
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		return(chain);
	}

	/*
	 * Upgrade our thread lock and handle any race that may have
	 * occurred.  Leave the lock upgraded for the rest of the get.
	 * We have to do this because we will be modifying the chain
	 * structure.
	 */
	ostate = ccms_thread_lock_upgrade(&parent->cst);
	chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy);
	if (chain) {
		/* lost the race, another thread instantiated the chain */
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_ref(hmp, chain);
		else
			hammer2_chain_lock(hmp, chain, how);
		ccms_thread_lock_restore(&parent->cst, ostate);
		return(chain);
	}

	/*
	 * The get function must always succeed, panic if there's no
	 * data to index.
	 */
	if (parent->flags & HAMMER2_CHAIN_INITIAL) {
		ccms_thread_lock_restore(&parent->cst, ostate);
		panic("hammer2_chain_get: Missing bref(1)");
		/* NOT REACHED */
	}

	/*
	 * Otherwise lookup the bref and issue I/O (switch on the parent)
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &parent->data->ipdata.u.blockset.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		KKASSERT(parent->data != NULL);
		KKASSERT(index >= 0 &&
			 index < parent->bytes / sizeof(hammer2_blockref_t));
		bref = &parent->data->npdata.blockref[index];
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
		bref = &hmp->voldata.sroot_blockset.blockref[index];
		break;
	default:
		bref = NULL;
		panic("hammer2_chain_get: unrecognized blockref type: %d",
		      parent->bref.type);
	}
	if (bref->type == 0) {
		panic("hammer2_chain_get: Missing bref(2)");
		/* NOT REACHED */
	}

	/*
	 * Allocate a chain structure representing the existing media
	 * entry.
	 *
	 * The locking operation we do later will issue I/O to read it.
	 */
	chain = hammer2_chain_alloc(hmp, bref);

	/*
	 * Link the chain into its parent.  Caller is expected to hold an
	 * exclusive lock on the parent.
	 */
	chain->parent = parent;
	chain->index = index;
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
		panic("hammer2_chain_link: collision");
	KKASSERT(parent->refs > 0);
	atomic_add_int(&parent->refs, 1);	/* for red-black entry */
	ccms_thread_lock_restore(&parent->cst, ostate);

	/*
	 * Additional linkage for inodes.  Reuse the parent pointer to
	 * find the parent directory.
	 *
	 * The ccms_inode is initialized from its parent directory.  The
	 * chain of ccms_inode's is seeded by the mount code.
	 */
	if (bref->type == HAMMER2_BREF_TYPE_INODE) {
		ip = chain->u.ip;
		/* skip intervening indirect blocks to find the directory */
		while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
			parent = parent->parent;
		if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
			ip->pip = parent->u.ip;
			ip->pmp = parent->u.ip->pmp;
			ip->depth = parent->u.ip->depth + 1;
			ccms_cst_init(&ip->topo_cst, &ip->chain);
		}
	}

	/*
	 * Our new chain structure has already been referenced and locked
	 * but the lock code handles the I/O so call it to resolve the data.
	 * Then release one of our two exclusive locks.
	 *
	 * If NOLOCK is set the release will release the one-and-only lock.
	 */
	if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
		hammer2_chain_lock(hmp, chain, how);	/* recursive lock */
		hammer2_chain_drop(hmp, chain);		/* excess ref */
	}
	ccms_thread_unlock(&chain->cst);		/* from alloc */

	return (chain);
}

/*
 * Locate any key between key_beg and key_end inclusive.  (*parentp)
 * typically points to an inode but can also point to a related indirect
 * block and this function will recurse upwards and find the inode again.
 *
 * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
 *	     WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
 *	     WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
 *
 * (*parentp) must be exclusively locked and referenced and can be an inode
 * or an existing indirect block within the inode.
 *
 * On return (*parentp) will be modified to point at the deepest parent chain
 * element encountered during the search, as a helper for an insertion or
 * deletion.  The new (*parentp) will be locked and referenced and the old
 * will be unlocked and dereferenced (no change if they are both the same).
 *
 * The matching chain will be returned exclusively locked and referenced.
 *
 * NULL is returned if no match was found, but (*parentp) will still
 * potentially be adjusted.
 *
 * This function will also recurse up the chain if the key is not within the
 * current parent's range.  (*parentp) can never be set to NULL.  An iteration
 * can simply allow (*parentp) to float inside the loop.
 */
hammer2_chain_t *
hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		     hammer2_key_t key_beg, hammer2_key_t key_end,
		     int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int count = 0;
	int i;
	int how_always = HAMMER2_RESOLVE_ALWAYS;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK)) {
		how_maybe |= HAMMER2_RESOLVE_SHARED;
		how_always |= HAMMER2_RESOLVE_SHARED;
	}

	/*
	 * Recurse (*parentp) upward if necessary until the parent completely
	 * encloses the key range or we hit the inode.
	 */
	parent = *parentp;
	while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			break;
		/*
		 * NOTE: hold a ref across the unlock so the old parent
		 * cannot be freed before we drop it below.
		 */
		hammer2_chain_ref(hmp, parent);		/* ref old parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
		parent = parent->parent;
						/* lock new parent */
		hammer2_chain_lock(hmp, parent, how_maybe);
		hammer2_chain_drop(hmp, *parentp);	/* drop old parent */
		*parentp = parent;			/* new parent */
	}

again:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Special shortcut for embedded data returns the inode
		 * itself.  Callers must detect this condition and access
		 * the embedded data (the strategy code does this for us).
		 *
		 * This is only applicable to regular files and softlinks.
		 */
		if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
			if (flags & HAMMER2_LOOKUP_NOLOCK)
				hammer2_chain_ref(hmp, parent);
			else
				hammer2_chain_lock(hmp, parent, how_always);
			return (parent);
		}
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Optimize indirect blocks in the INITIAL state to avoid
		 * I/O.
		 */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			if (parent->data == NULL)
				panic("parent->data is NULL");
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_lookup: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
	}

	/*
	 * If the element and key overlap we use the element.
	 */
	bref = NULL;
	for (i = 0; i < count; ++i) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			/* in-memory chain overrides the media blockref */
			bref = &tmp->bref;
			KKASSERT(bref->type != 0);
		} else if (base == NULL || base[i].type == 0) {
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
	}
	if (i == count) {
		if (key_beg == key_end)
			return (NULL);
		return (hammer2_chain_next(hmp, parentp, NULL,
					   key_beg, key_end, flags));
	}

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_drop(hmp, chain);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, chain, how_maybe);
			hammer2_chain_unlock(hmp, chain);
		}
		goto again;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}

/*
 * After having issued a lookup we can iterate all matching keys.
 *
 * If chain is non-NULL we continue the iteration from just after it's index.
 *
 * If chain is NULL we assume the parent was exhausted and continue the
 * iteration at the next parent.
 *
 * parent must be locked on entry and remains locked throughout.  chain's
 * lock status must match flags.
 */
hammer2_chain_t *
hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
		   hammer2_chain_t *chain,
		   hammer2_key_t key_beg, hammer2_key_t key_end,
		   int flags)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *tmp;
	hammer2_blockref_t *base;
	hammer2_blockref_t *bref;
	hammer2_key_t scan_beg;
	hammer2_key_t scan_end;
	int i;
	int how_maybe = HAMMER2_RESOLVE_MAYBE;
	int count;

	if (flags & (HAMMER2_LOOKUP_SHARED | HAMMER2_LOOKUP_NOLOCK))
		how_maybe |= HAMMER2_RESOLVE_SHARED;

	parent = *parentp;

again:
	/*
	 * Calculate the next index and recalculate the parent if necessary.
	 */
	if (chain) {
		/*
		 * Continue iteration within current parent.  If not NULL
		 * the passed-in chain may or may not be locked, based on
		 * the LOOKUP_NOLOCK flag (passed in as returned from lookup
		 * or a prior next).
		 */
		i = chain->index + 1;
		if (flags & HAMMER2_LOOKUP_NOLOCK)
			hammer2_chain_drop(hmp, chain);
		else
			hammer2_chain_unlock(hmp, chain);

		/*
		 * Any scan where the lookup returned degenerate data embedded
		 * in the inode has an invalid index and must terminate.
		 */
		if (chain == parent)
			return(NULL);
		chain = NULL;
	} else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
		/*
		 * We reached the end of the iteration.
		 */
		return (NULL);
	} else {
		/*
		 * Continue iteration with next parent unless the current
		 * parent covers the range.
		 */
		hammer2_chain_t *nparent;

		scan_beg = parent->bref.key;
		scan_end = scan_beg +
			   ((hammer2_key_t)1 << parent->bref.keybits) - 1;
		if (key_beg >= scan_beg && key_end <= scan_end)
			return (NULL);

		i = parent->index + 1;
		nparent = parent->parent;
		hammer2_chain_ref(hmp, nparent);	/* ref new parent */
		hammer2_chain_unlock(hmp, parent);	/* unlock old parent */
						/* lock new parent */
		hammer2_chain_lock(hmp, nparent, how_maybe);
		hammer2_chain_drop(hmp, nparent);	/* drop excess ref */
		*parentp = parent = nparent;
	}

again2:
	/*
	 * Locate the blockref array.  Currently we do a fully associative
	 * search through the array.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/* INITIAL-state indirect blocks have no media array yet */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_next: unrecognized blockref type: %d",
		      parent->bref.type);
		base = NULL;	/* safety */
		count = 0;	/* safety */
		break;
	}
	KKASSERT(i <= count);

	/*
	 * Look for the key.  If we are unable to find a match and an exact
	 * match was requested we return NULL.  If a range was requested we
	 * run hammer2_chain_next() to iterate.
	 */
	bref = NULL;
	while (i < count) {
		tmp = hammer2_chain_find(hmp, parent, i);
		if (tmp) {
			/* in-memory chain overrides the media blockref */
			bref = &tmp->bref;
		} else if (base == NULL || base[i].type == 0) {
			++i;
			continue;
		} else {
			bref = &base[i];
		}
		scan_beg = bref->key;
		scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
		if (key_beg <= scan_end && key_end >= scan_beg)
			break;
		++i;
	}

	/*
	 * If we couldn't find a match recurse up a parent to continue the
	 * search.
	 */
	if (i == count)
		goto again;

	/*
	 * Acquire the new chain element.  If the chain element is an
	 * indirect block we must search recursively.
	 */
	chain = hammer2_chain_get(hmp, parent, i, flags);
	if (chain == NULL)
		return (NULL);

	/*
	 * If the chain element is an indirect block it becomes the new
	 * parent and we loop on it.
	 *
	 * The parent always has to be locked with at least RESOLVE_MAYBE,
	 * so it might need a fixup if the caller passed incompatible flags.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
		hammer2_chain_unlock(hmp, parent);
		*parentp = parent = chain;
		chain = NULL;
		if (flags & HAMMER2_LOOKUP_NOLOCK) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_drop(hmp, parent);	/* excess ref */
		} else if (flags & HAMMER2_LOOKUP_NODATA) {
			hammer2_chain_lock(hmp, parent, how_maybe);
			hammer2_chain_unlock(hmp, parent);
		}
		i = 0;
		goto again2;
	}

	/*
	 * All done, return chain
	 */
	return (chain);
}

/*
 * Create and return a new hammer2 system memory structure of the specified
 * key, type and size and insert it RELATIVE TO (PARENT).
 *
 * (parent) is typically either an inode or an indirect block, acquired
 * as a side effect of issuing a prior failed lookup.  parent
 * must be locked and held.  Do not pass the inode chain to this function
 * unless that is the chain returned by the failed lookup.
 *
 * Non-indirect types will automatically allocate indirect blocks as required
 * if the new item does not fit in the current (parent).
 *
 * Indirect types will move a portion of the existing blockref array in
 * (parent) into the new indirect type and then use one of the free slots
 * to emplace the new indirect type.
 *
 * A new locked, referenced chain element is returned of the specified type.
 * The element may or may not have a data area associated with it:
 *
 *	VOLUME		not allowed here
 *	INODE		embedded data area will be set-up
 *	INDIRECT	not allowed here
 *	DATA		no data area will be set-up (caller is expected
 *			to have logical buffers, we don't want to alias
 *			the data onto device buffers!).
 *
 * Requires an exclusively locked parent.
 */
hammer2_chain_t *
hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		     hammer2_chain_t *chain,
		     hammer2_key_t key, int keybits, int type, size_t bytes)
{
	hammer2_blockref_t dummy;
	hammer2_blockref_t *base;
	hammer2_chain_t dummy_chain;
	int unlock_parent = 0;
	int allocated = 0;
	int count;
	int i;

	KKASSERT(ccms_thread_lock_owned(&parent->cst));

	if (chain == NULL) {
		/*
		 * First allocate media space and construct the dummy bref,
		 * then allocate the in-memory chain structure.
		 */
		bzero(&dummy, sizeof(dummy));
		dummy.type = type;
		dummy.key = key;
		dummy.keybits = keybits;
		dummy.data_off = hammer2_bytes_to_radix(bytes);
		chain = hammer2_chain_alloc(hmp, &dummy);
		allocated = 1;

		/*
		 * We do NOT set INITIAL here (yet).  INITIAL is only
		 * used for indirect blocks.
		 *
		 * Recalculate bytes to reflect the actual media block
		 * allocation.
		 */
		bytes = (hammer2_off_t)1 <<
			(int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
		chain->bytes = bytes;

		switch(type) {
		case HAMMER2_BREF_TYPE_VOLUME:
			panic("hammer2_chain_create: called with volume type");
			break;
		case HAMMER2_BREF_TYPE_INODE:
			KKASSERT(bytes == HAMMER2_INODE_BYTES);
			chain->data = (void *)&chain->u.ip->ip_data;
			break;
		case HAMMER2_BREF_TYPE_INDIRECT:
			panic("hammer2_chain_create: cannot be used to"
			      "create indirect block");
			break;
		case HAMMER2_BREF_TYPE_DATA:
		default:
			/* leave chain->data NULL */
			KKASSERT(chain->data == NULL);
			break;
		}
	} else {
		/*
		 * Potentially update the chain's key/keybits.
		 */
		chain->bref.key = key;
		chain->bref.keybits = keybits;
	}

again:
	/*
	 * Locate a free blockref in the parent's array
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		KKASSERT((parent->u.ip->ip_data.op_flags &
			  HAMMER2_OPFLAG_DIRECTDATA) == 0);
		KKASSERT(parent->data != NULL);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/* INITIAL-state indirect blocks have no media array yet */
		if (parent->flags & HAMMER2_CHAIN_INITIAL) {
			base = NULL;
		} else {
			KKASSERT(parent->data != NULL);
			base = &parent->data->npdata.blockref[0];
		}
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(parent->data != NULL);
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_create: unrecognized blockref type: %d",
		      parent->bref.type);
		count = 0;
		break;
	}

	/*
	 * Scan for an unallocated bref, also skipping any slots occupied
	 * by in-memory chain elements that may not yet have been updated
	 * in the parent's bref array.
	 */
	bzero(&dummy_chain, sizeof(dummy_chain));
	for (i = 0; i < count; ++i) {
		if (base == NULL) {
			dummy_chain.index = i;
			if (RB_FIND(hammer2_chain_tree,
				    &parent->rbhead, &dummy_chain) == NULL) {
				break;
			}
		} else if (base[i].type == 0) {
			dummy_chain.index = i;
			if (RB_FIND(hammer2_chain_tree,
				    &parent->rbhead, &dummy_chain) == NULL) {
				break;
			}
		}
	}

	/*
	 * If no free blockref could be found we must create an indirect
	 * block and move a number of blockrefs into it.  With the parent
	 * locked we can safely lock each child in order to move it without
	 * causing a deadlock.
	 *
	 * This may return the new indirect block or the old parent depending
	 * on where the key falls.
	 */
	if (i == count) {
		hammer2_chain_t *nparent;

		nparent = hammer2_chain_create_indirect(hmp, parent,
							key, keybits);
		if (nparent == NULL) {
			/* indirect block creation failed, back out */
			if (allocated)
				hammer2_chain_free(hmp, chain);
			chain = NULL;
			goto done;
		}
		if (parent != nparent) {
			if (unlock_parent)
				hammer2_chain_unlock(hmp, parent);
			parent = nparent;
			unlock_parent = 1;
		}
		goto again;
	}

	/*
	 * Link the chain into its parent.  Later on we will have to set
	 * the MOVED bit in situations where we don't mark the new chain
	 * as being modified.
	 */
	if (chain->parent != NULL)
		panic("hammer2: hammer2_chain_create: chain already connected");
	KKASSERT(chain->parent == NULL);
	chain->parent = parent;
	chain->index = i;
	if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, chain))
		panic("hammer2_chain_link: collision");
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	KKASSERT(parent->refs > 0);
	atomic_add_int(&parent->refs, 1);	/* for red-black entry */

	/*
	 * Additional linkage for inodes.  Reuse the parent pointer to
	 * find the parent directory.
	 *
	 * Cumulative adjustments are inherited on [re]attach and will
	 * propagate up the tree on the next flush.
	 *
	 * The ccms_inode is initialized from its parent directory.  The
	 * chain of ccms_inode's is seeded by the mount code.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		hammer2_chain_t *scan = parent;
		hammer2_inode_t *ip = chain->u.ip;

		/* skip intervening indirect blocks to find the directory */
		while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
			scan = scan->parent;
		if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
			ip->pip = scan->u.ip;
			ip->pmp = scan->u.ip->pmp;
			ip->depth = scan->u.ip->depth + 1;
			ip->pip->delta_icount += ip->ip_data.inode_count;
			ip->pip->delta_dcount += ip->ip_data.data_count;
			++ip->pip->delta_icount;
			ccms_cst_init(&ip->topo_cst, &ip->chain);
		}
	}

	/*
	 * (allocated) indicates that this is a newly-created chain element
	 * rather than a renamed chain element.  In this situation we want
	 * to place the chain element in the MODIFIED state.
	 *
	 * The data area will be set up as follows:
	 *
	 *	VOLUME		not allowed here.
	 *
	 *	INODE		embedded data area will be set-up.
	 *
	 *	INDIRECT	not allowed here.
	 *
	 *	DATA		no data area will be set-up (caller is expected
	 *			to have logical buffers, we don't want to alias
	 *			the data onto device buffers!).
	 */
	if (allocated) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_modify(hmp, chain,
					     HAMMER2_MODIFY_OPTDATA);
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
			/* not supported in this function */
			panic("hammer2_chain_create: bad type");
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
			hammer2_chain_modify(hmp, chain,
					     HAMMER2_MODIFY_OPTDATA);
		} else {
			hammer2_chain_modify(hmp, chain, 0);
		}
	} else {
		/*
		 * When reconnecting inodes we have to call setsubmod()
		 * to ensure that its state propagates up the newly
		 * connected parent.
		 *
		 * Make sure MOVED is set but do not update bref_flush.  If
		 * the chain is undergoing modification bref_flush will be
		 * updated when it gets flushed.  If it is not then the
		 * bref may not have been flushed yet and we do not want to
		 * set MODIFIED here as this could result in unnecessary
		 * reallocations.
		 */
		if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
			hammer2_chain_ref(hmp, chain);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
		}
		hammer2_chain_parent_setsubmod(hmp, chain);
	}

done:
	if (unlock_parent)
		hammer2_chain_unlock(hmp, parent);
	return (chain);
}

/*
 * Create an indirect block that covers one or more of the elements in the
 * current parent.  Either returns the existing parent with no locking or
 * ref changes or returns the new indirect block locked and referenced
 * and leaving the original parent lock/ref intact as well.
 *
 * The returned chain depends on where the specified key falls.
 *
 * The key/keybits for the indirect mode only needs to follow four rules:
 *
 * (1) That all elements underneath it fit within its key space and
 *
 * (2) That all elements outside it are outside its key space.
 *
 * (3) When creating the new indirect block any elements in the current
 *     parent that fit within the new indirect block's keyspace must be
 *     moved into the new indirect block.
 *
 * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
 *     keyspace than the current parent, but lookup/iteration rules will
 *     ensure (and must ensure) that rule (2) for all parents leading up
 *     to the nearest inode or the root volume header is adhered to.  This
 *     is accomplished by always recursing through matching keyspaces in
 *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1883 * 1884 * The current implementation calculates the current worst-case keyspace by 1885 * iterating the current parent and then divides it into two halves, choosing 1886 * whichever half has the most elements (not necessarily the half containing 1887 * the requested key). 1888 * 1889 * We can also opt to use the half with the least number of elements. This 1890 * causes lower-numbered keys (aka logical file offsets) to recurse through 1891 * fewer indirect blocks and higher-numbered keys to recurse through more. 1892 * This also has the risk of not moving enough elements to the new indirect 1893 * block and being forced to create several indirect blocks before the element 1894 * can be inserted. 1895 * 1896 * Must be called with an exclusively locked parent 1897 */ 1898 static 1899 hammer2_chain_t * 1900 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent, 1901 hammer2_key_t create_key, int create_bits) 1902 { 1903 hammer2_blockref_t *base; 1904 hammer2_blockref_t *bref; 1905 hammer2_chain_t *chain; 1906 hammer2_chain_t *ichain; 1907 hammer2_chain_t dummy; 1908 hammer2_key_t key = create_key; 1909 int keybits = create_bits; 1910 int locount = 0; 1911 int hicount = 0; 1912 int count; 1913 int nbytes; 1914 int i; 1915 1916 /* 1917 * Calculate the base blockref pointer or NULL if the chain 1918 * is known to be empty. We need to calculate the array count 1919 * for RB lookups either way. 
1920 */ 1921 KKASSERT(ccms_thread_lock_owned(&parent->cst)); 1922 1923 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA); 1924 if (parent->flags & HAMMER2_CHAIN_INITIAL) { 1925 base = NULL; 1926 1927 switch(parent->bref.type) { 1928 case HAMMER2_BREF_TYPE_INODE: 1929 count = HAMMER2_SET_COUNT; 1930 break; 1931 case HAMMER2_BREF_TYPE_INDIRECT: 1932 count = parent->bytes / sizeof(hammer2_blockref_t); 1933 break; 1934 case HAMMER2_BREF_TYPE_VOLUME: 1935 count = HAMMER2_SET_COUNT; 1936 break; 1937 default: 1938 panic("hammer2_chain_create_indirect: " 1939 "unrecognized blockref type: %d", 1940 parent->bref.type); 1941 count = 0; 1942 break; 1943 } 1944 } else { 1945 switch(parent->bref.type) { 1946 case HAMMER2_BREF_TYPE_INODE: 1947 base = &parent->data->ipdata.u.blockset.blockref[0]; 1948 count = HAMMER2_SET_COUNT; 1949 break; 1950 case HAMMER2_BREF_TYPE_INDIRECT: 1951 base = &parent->data->npdata.blockref[0]; 1952 count = parent->bytes / sizeof(hammer2_blockref_t); 1953 break; 1954 case HAMMER2_BREF_TYPE_VOLUME: 1955 base = &hmp->voldata.sroot_blockset.blockref[0]; 1956 count = HAMMER2_SET_COUNT; 1957 break; 1958 default: 1959 panic("hammer2_chain_create_indirect: " 1960 "unrecognized blockref type: %d", 1961 parent->bref.type); 1962 count = 0; 1963 break; 1964 } 1965 } 1966 1967 /* 1968 * Scan for an unallocated bref, also skipping any slots occupied 1969 * by in-memory chain elements which may not yet have been updated 1970 * in the parent's bref array. 1971 */ 1972 bzero(&dummy, sizeof(dummy)); 1973 for (i = 0; i < count; ++i) { 1974 int nkeybits; 1975 1976 dummy.index = i; 1977 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy); 1978 if (chain) { 1979 bref = &chain->bref; 1980 } else if (base && base[i].type) { 1981 bref = &base[i]; 1982 } else { 1983 continue; 1984 } 1985 1986 /* 1987 * Expand our calculated key range (key, keybits) to fit 1988 * the scanned key. 
nkeybits represents the full range 1989 * that we will later cut in half (two halves @ nkeybits - 1). 1990 */ 1991 nkeybits = keybits; 1992 if (nkeybits < bref->keybits) 1993 nkeybits = bref->keybits; 1994 while (nkeybits < 64 && 1995 (~(((hammer2_key_t)1 << nkeybits) - 1) & 1996 (key ^ bref->key)) != 0) { 1997 ++nkeybits; 1998 } 1999 2000 /* 2001 * If the new key range is larger we have to determine 2002 * which side of the new key range the existing keys fall 2003 * under by checking the high bit, then collapsing the 2004 * locount into the hicount or vise-versa. 2005 */ 2006 if (keybits != nkeybits) { 2007 if (((hammer2_key_t)1 << (nkeybits - 1)) & key) { 2008 hicount += locount; 2009 locount = 0; 2010 } else { 2011 locount += hicount; 2012 hicount = 0; 2013 } 2014 keybits = nkeybits; 2015 } 2016 2017 /* 2018 * The newly scanned key will be in the lower half or the 2019 * higher half of the (new) key range. 2020 */ 2021 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key) 2022 ++hicount; 2023 else 2024 ++locount; 2025 } 2026 2027 /* 2028 * Adjust keybits to represent half of the full range calculated 2029 * above (radix 63 max) 2030 */ 2031 --keybits; 2032 2033 /* 2034 * Select whichever half contains the most elements. Theoretically 2035 * we can select either side as long as it contains at least one 2036 * element (in order to ensure that a free slot is present to hold 2037 * the indirect block). 2038 */ 2039 key &= ~(((hammer2_key_t)1 << keybits) - 1); 2040 if (hammer2_indirect_optimize) { 2041 /* 2042 * Insert node for least number of keys, this will arrange 2043 * the first few blocks of a large file or the first few 2044 * inodes in a directory with fewer indirect blocks when 2045 * created linearly. 2046 */ 2047 if (hicount < locount && hicount != 0) 2048 key |= (hammer2_key_t)1 << keybits; 2049 else 2050 key &= ~(hammer2_key_t)1 << keybits; 2051 } else { 2052 /* 2053 * Insert node for most number of keys, best for heavily 2054 * fragmented files. 
2055 */ 2056 if (hicount > locount) 2057 key |= (hammer2_key_t)1 << keybits; 2058 else 2059 key &= ~(hammer2_key_t)1 << keybits; 2060 } 2061 2062 /* 2063 * How big should our new indirect block be? It has to be at least 2064 * as large as its parent. 2065 */ 2066 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) 2067 nbytes = HAMMER2_IND_BYTES_MIN; 2068 else 2069 nbytes = HAMMER2_IND_BYTES_MAX; 2070 if (nbytes < count * sizeof(hammer2_blockref_t)) 2071 nbytes = count * sizeof(hammer2_blockref_t); 2072 2073 /* 2074 * Ok, create our new indirect block 2075 */ 2076 dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT; 2077 dummy.bref.key = key; 2078 dummy.bref.keybits = keybits; 2079 dummy.bref.data_off = hammer2_bytes_to_radix(nbytes); 2080 ichain = hammer2_chain_alloc(hmp, &dummy.bref); 2081 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL); 2082 2083 /* 2084 * Iterate the original parent and move the matching brefs into 2085 * the new indirect block. 2086 */ 2087 for (i = 0; i < count; ++i) { 2088 /* 2089 * For keying purposes access the bref from the media or 2090 * from our in-memory cache. In cases where the in-memory 2091 * cache overrides the media the keyrefs will be the same 2092 * anyway so we can avoid checking the cache when the media 2093 * has a key. 2094 */ 2095 dummy.index = i; 2096 chain = RB_FIND(hammer2_chain_tree, &parent->rbhead, &dummy); 2097 if (chain) { 2098 bref = &chain->bref; 2099 } else if (base && base[i].type) { 2100 bref = &base[i]; 2101 } else { 2102 if (ichain->index < 0) 2103 ichain->index = i; 2104 continue; 2105 } 2106 2107 /* 2108 * Skip keys not in the chosen half (low or high), only bit 2109 * (keybits - 1) needs to be compared but for safety we 2110 * will compare all msb bits plus that bit again. 2111 */ 2112 if ((~(((hammer2_key_t)1 << keybits) - 1) & 2113 (key ^ bref->key)) != 0) { 2114 continue; 2115 } 2116 2117 /* 2118 * This element is being moved from the parent, its slot 2119 * is available for our new indirect block. 
2120 */ 2121 if (ichain->index < 0) 2122 ichain->index = i; 2123 2124 /* 2125 * Load the new indirect block by acquiring or allocating 2126 * the related chain entries, then simply move them to the 2127 * new parent (ichain). 2128 * 2129 * When adjusting the parent/child relationship we must 2130 * set the MOVED bit but we do NOT update bref_flush 2131 * because otherwise we might synchronize a bref that has 2132 * not yet been flushed. We depend on chain's bref_flush 2133 * either being correct or the chain being in a MODIFIED 2134 * state. 2135 * 2136 * We do not want to set MODIFIED here as this would result 2137 * in unnecessary reallocations. 2138 * 2139 * We must still set SUBMODIFIED in the parent but we do 2140 * that after the loop. 2141 * 2142 * XXX we really need a lock here but we don't need the 2143 * data. NODATA feature needed. 2144 */ 2145 chain = hammer2_chain_get(hmp, parent, i, 2146 HAMMER2_LOOKUP_NODATA); 2147 RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain); 2148 if (RB_INSERT(hammer2_chain_tree, &ichain->rbhead, chain)) 2149 panic("hammer2_chain_create_indirect: collision"); 2150 chain->parent = ichain; 2151 if (base) 2152 bzero(&base[i], sizeof(base[i])); 2153 atomic_add_int(&parent->refs, -1); 2154 atomic_add_int(&ichain->refs, 1); 2155 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) { 2156 hammer2_chain_ref(hmp, chain); 2157 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED); 2158 } 2159 hammer2_chain_unlock(hmp, chain); 2160 KKASSERT(parent->refs > 0); 2161 chain = NULL; 2162 } 2163 2164 /* 2165 * Insert the new indirect block into the parent now that we've 2166 * cleared out some entries in the parent. We calculated a good 2167 * insertion index in the loop above (ichain->index). 2168 * 2169 * We don't have to set MOVED here because we mark ichain modified 2170 * down below (so the normal modified -> flush -> set-moved sequence 2171 * applies). 
2172 */ 2173 KKASSERT(ichain->index >= 0); 2174 if (RB_INSERT(hammer2_chain_tree, &parent->rbhead, ichain)) 2175 panic("hammer2_chain_create_indirect: ichain insertion"); 2176 ichain->parent = parent; 2177 atomic_add_int(&parent->refs, 1); 2178 2179 /* 2180 * Mark the new indirect block modified after insertion, which 2181 * will propagate up through parent all the way to the root and 2182 * also allocate the physical block in ichain for our caller, 2183 * and assign ichain->data to a pre-zero'd space (because there 2184 * is not prior data to copy into it). 2185 * 2186 * We have to set SUBMODIFIED in ichain's flags manually so the 2187 * flusher knows it has to recurse through it to get to all of 2188 * our moved blocks, then call setsubmod() to set the bit 2189 * recursively. 2190 */ 2191 hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA); 2192 hammer2_chain_parent_setsubmod(hmp, ichain); 2193 atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED); 2194 2195 /* 2196 * Figure out what to return. 2197 */ 2198 if (create_bits > keybits) { 2199 /* 2200 * Key being created is way outside the key range, 2201 * return the original parent. 2202 */ 2203 hammer2_chain_unlock(hmp, ichain); 2204 } else if (~(((hammer2_key_t)1 << keybits) - 1) & 2205 (create_key ^ key)) { 2206 /* 2207 * Key being created is outside the key range, 2208 * return the original parent. 2209 */ 2210 hammer2_chain_unlock(hmp, ichain); 2211 } else { 2212 /* 2213 * Otherwise its in the range, return the new parent. 2214 * (leave both the new and old parent locked). 2215 */ 2216 parent = ichain; 2217 } 2218 2219 return(parent); 2220 } 2221 2222 /* 2223 * Physically delete the specified chain element. Note that inodes with 2224 * open descriptors should not be deleted (as with other filesystems) until 2225 * the last open descriptor is closed. 
 *
 * This routine will remove the chain element from its parent and potentially
 * also recurse upward and delete indirect blocks which become empty as a
 * side effect.
 *
 * The caller must pass a pointer to the chain's parent, also locked and
 * referenced.  (*parentp) will be modified in a manner similar to a lookup
 * or iteration when indirect blocks are also deleted as a side effect.
 *
 * @param hmp		mount we are operating on
 * @param parent	chain's parent, exclusively locked by the caller
 * @param chain		element to delete; must be a child of parent
 * @param retain	non-zero to retain the MODIFIED/MOVED flag refs for
 *			a still-open (removed) inode; 0 for permanent deletion
 *
 * XXX This currently does not adhere to the MOVED flag protocol in that
 *     the removal is immediately indicated in the parent's blockref[]
 *     array.
 *
 * Must be called with an exclusively locked parent.
 */
void
hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
		     hammer2_chain_t *chain, int retain)
{
	hammer2_blockref_t *base;
	hammer2_inode_t *ip;
	int count;

	if (chain->parent != parent)
		panic("hammer2_chain_delete: parent mismatch");
	KKASSERT(ccms_thread_lock_owned(&parent->cst));

	/*
	 * Mark the parent modified so our base[] pointer remains valid
	 * while we move entries.  For the optimized indirect block
	 * case mark the parent moved instead.
	 *
	 * Calculate the blockref reference in the parent.  An INITIAL
	 * indirect block has no media data yet, hence base == NULL.
	 */
	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &parent->data->ipdata.u.blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
						  HAMMER2_MODIFY_NO_MODIFY_TID);
		if (parent->flags & HAMMER2_CHAIN_INITIAL)
			base = NULL;
		else
			base = &parent->data->npdata.blockref[0];
		count = parent->bytes / sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	default:
		panic("hammer2_chain_delete: unrecognized blockref type: %d",
		      parent->bref.type);
		count = 0;
		break;
	}

	/*
	 * Disconnect the bref in the parent, remove the chain, and
	 * disconnect in-memory fields from the parent.
	 */
	KKASSERT(chain->index >= 0 && chain->index < count);
	if (base)
		bzero(&base[chain->index], sizeof(*base));

	RB_REMOVE(hammer2_chain_tree, &parent->rbhead, chain);
	atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
	atomic_add_int(&parent->refs, -1);	/* for red-black entry */
	chain->index = -1;
	chain->parent = NULL;

	/*
	 * Cumulative adjustments must be propagated to the parent inode
	 * when deleting and synchronized to ip.
	 *
	 * NOTE:  We do not propagate ip->delta_*count to the parent because
	 *	  these represent adjustments that have not yet been
	 *	  propagated upward, so we don't need to remove them from
	 *	  the parent.
	 *
	 * Clear the pointer to the parent inode.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		ip = chain->u.ip;
		if (ip->pip) {
			ip->pip->delta_icount -= ip->ip_data.inode_count;
			ip->pip->delta_dcount -= ip->ip_data.data_count;
			ip->ip_data.inode_count += ip->delta_icount;
			ip->ip_data.data_count += ip->delta_dcount;
			ip->delta_icount = 0;
			ip->delta_dcount = 0;
			/*
			 * Presumably accounts for the deleted inode itself
			 * no longer being counted under the parent —
			 * TODO(review): confirm against the accounting model.
			 */
			--ip->pip->delta_icount;
			ip->pip = NULL;
		}
		chain->u.ip->depth = 0;
	}

	/*
	 * If retain is 0 the deletion is permanent.  Because the chain is
	 * no longer connected to the topology a flush will have no
	 * visibility into it.  We must dispose of the references related
	 * to the MODIFIED and MOVED flags, otherwise the ref count will
	 * never transition to 0.
	 *
	 * If retain is non-zero the deleted element is likely an inode
	 * which the vnops frontend will mark DESTROYED and flush.  In that
	 * situation we must retain the flags for any open file descriptors
	 * on the (removed) inode.  The final close will destroy the
	 * disconnected chain.
	 */
	if (retain == 0) {
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
			hammer2_chain_drop(hmp, chain);
		}
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(hmp, chain);
		}
	}

	/*
	 * The chain is still likely referenced, possibly even by a vnode
	 * (if an inode), so defer further action until the chain gets
	 * dropped.
	 */
}

/*
 * Recursively flush the specified chain.  The chain is locked and
 * referenced by the caller and will remain so on return.  The chain
 * will remain referenced throughout but can temporarily lose its
 * lock during the recursion to avoid unnecessarily stalling user
 * processes.
 */
struct hammer2_flush_info {
	struct flush_deferral_list flush_list;	/* deferred chains, retried
						 * after the stack unwinds */
	int		depth;			/* current recursion depth */
	hammer2_tid_t	modify_tid;		/* TID stamped on flushed brefs */
};

typedef struct hammer2_flush_info hammer2_flush_info_t;

/*
 * Primary flush recursion.  Recursively flushes modified children of
 * (chain), synchronizes their updated blockrefs into (chain), and then
 * flushes (chain) itself if it is marked MODIFIED/MODIFIED_AUX.
 *
 * (chain) is locked by the caller.  The lock may be temporarily cycled
 * for buffer instantiation but is held on return.
 */
static void
hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
			  hammer2_flush_info_t *info)
{
	hammer2_blockref_t *bref;
	hammer2_off_t pbase;
	size_t bbytes;
	size_t boff;
	char *bdata;
	struct buf *bp;
	int error;
	int wasmodified;

	/*
	 * If we hit the stack recursion depth limit defer the operation.
	 * The controller of the info structure will execute the deferral
	 * list and then retry.
	 *
	 * This is only applicable if SUBMODIFIED is set.  After a reflush
	 * SUBMODIFIED will probably be cleared and we want to drop through
	 * to finish processing the current element so our direct parent
	 * can process the results.
	 */
	if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
	    (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
		if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
			/* ref owned by the chain's presence on flush_list */
			hammer2_chain_ref(hmp, chain);
			TAILQ_INSERT_TAIL(&info->flush_list,
					  chain, flush_node);
			atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
		}
		return;
	}

	if (hammer2_debug & 0x0008)
		kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
			info->depth, info->depth, "",
			chain->bref.type, chain->bref.data_off,
			chain, chain->refs, chain->flags);

	/*
	 * If SUBMODIFIED is set we recurse the flush and adjust the
	 * blockrefs accordingly.
	 *
	 * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
	 *	 finishing in the face of filesystem activity.
	 */
	if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
		hammer2_chain_t *child;
		hammer2_chain_t *next;
		hammer2_blockref_t *base;
		int count;

		/*
		 * Clear SUBMODIFIED to catch races.  Note that if any
		 * child has to be flushed SUBMODIFIED will wind up being
		 * set again (for next time), but this does not stop us from
		 * synchronizing block updates which occurred.
		 *
		 * We don't want to set our chain to MODIFIED gratuitously.
		 */
		/* XXX SUBMODIFIED not interlocked, can race */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);

		/*
		 * Flush the children and update the blockrefs in the chain.
		 * Be careful of ripouts during the loop: (next) is ref'd
		 * before the iteration body so it survives manipulation
		 * of the tree while (child) is being processed.
		 */
		next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
		if (next)
			hammer2_chain_ref(hmp, next);
		while ((child = next) != NULL) {
			next = RB_NEXT(hammer2_chain_tree,
				       &chain->rbhead, child);
			if (next)
				hammer2_chain_ref(hmp, next);
			/*
			 * We only recurse if SUBMODIFIED (internal node)
			 * or MODIFIED (internal node or leaf) is set.
			 * However, we must still track whether any MOVED
			 * entries are present to determine if the chain's
			 * blockref's need updating or not.
			 */
			if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
					     HAMMER2_CHAIN_MODIFIED |
					     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
				hammer2_chain_drop(hmp, child);
				continue;
			}
			hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_drop(hmp, child);
			/* re-test after acquiring the lock; state may race */
			if (child->parent != chain ||
			    (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
					     HAMMER2_CHAIN_MODIFIED |
					     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
				hammer2_chain_unlock(hmp, child);
				continue;
			}

			/*
			 * Propagate the DESTROYED flag if found set, then
			 * recurse the flush.
			 */
			if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
			    (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
				atomic_set_int(&child->flags,
					       HAMMER2_CHAIN_DESTROYED |
					       HAMMER2_CHAIN_SUBMODIFIED);
			}
			++info->depth;
			hammer2_chain_flush_pass1(hmp, child, info);
			--info->depth;
			hammer2_chain_unlock(hmp, child);
		}

		/*
		 * Now synchronize any block updates (children flagged
		 * MOVED have a bref_flush waiting to be copied into our
		 * blockref array).
		 */
		next = RB_MIN(hammer2_chain_tree, &chain->rbhead);
		if (next)
			hammer2_chain_ref(hmp, next);
		while ((child = next) != NULL) {
			next = RB_NEXT(hammer2_chain_tree,
				       &chain->rbhead, child);
			if (next)
				hammer2_chain_ref(hmp, next);
			if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
				hammer2_chain_drop(hmp, child);
				continue;
			}
			hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
			hammer2_chain_drop(hmp, child);
			if (child->parent != chain ||
			    (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
				hammer2_chain_unlock(hmp, child);
				continue;
			}

			/* instantiate our data so base[] below is valid */
			hammer2_chain_modify(hmp, chain,
					     HAMMER2_MODIFY_NO_MODIFY_TID);

			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_INODE:
				KKASSERT((chain->data->ipdata.op_flags &
					  HAMMER2_OPFLAG_DIRECTDATA) == 0);
				base = &chain->data->ipdata.u.blockset.
					blockref[0];
				count = HAMMER2_SET_COUNT;
				break;
			case HAMMER2_BREF_TYPE_INDIRECT:
				base = &chain->data->npdata.blockref[0];
				count = chain->bytes /
					sizeof(hammer2_blockref_t);
				break;
			case HAMMER2_BREF_TYPE_VOLUME:
				base = &hmp->voldata.sroot_blockset.blockref[0];
				count = HAMMER2_SET_COUNT;
				break;
			default:
				base = NULL;
				/*
				 * NOTE(review): message names
				 * "hammer2_chain_get" — looks like a
				 * copy/paste from another function; confirm.
				 */
				panic("hammer2_chain_get: "
				      "unrecognized blockref type: %d",
				      chain->bref.type);
			}

			KKASSERT(child->index >= 0);
			base[child->index] = child->bref_flush;

			/* propagate the child's mirror_tid upward */
			if (chain->bref.mirror_tid <
			    child->bref_flush.mirror_tid) {
				chain->bref.mirror_tid =
					child->bref_flush.mirror_tid;
			}

			if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
			    hmp->voldata.mirror_tid <
			    child->bref_flush.mirror_tid) {
				hmp->voldata.mirror_tid =
					child->bref_flush.mirror_tid;
			}
			atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(hmp, child);	/* MOVED flag */
			hammer2_chain_unlock(hmp, child);
		}
	}

	/*
	 * If destroying the object we unconditionally clear the MODIFIED
	 * and MOVED bits, and we destroy the buffer without writing it
	 * out.
	 *
	 * We don't bother updating the hash/crc or the chain bref.
	 *
	 * NOTE: The destroy'd object's bref has already been updated.
	 *	 so we can clear MOVED without propagating mirror_tid
	 *	 or modify_tid upward.
	 *
	 * XXX allocations for unflushed data can be returned to the
	 *     free pool.
	 */
	if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
		if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
			if (chain->bp) {
				chain->bp->b_flags |= B_INVAL|B_RELBUF;
			}
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_MODIFIED |
					 HAMMER2_CHAIN_MODIFY_TID);
			hammer2_chain_drop(hmp, chain);
		}
		if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_MODIFIED_AUX);
		}
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			atomic_clear_int(&chain->flags,
					 HAMMER2_CHAIN_MOVED);
			hammer2_chain_drop(hmp, chain);
		}
		return;
	}

	/*
	 * Flush this chain entry only if it is marked modified.
	 */
	if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
		goto done;
	}

	/*
	 * Synchronize cumulative data and inode count adjustments to
	 * the inode and propagate the deltas upward to the parent.
	 */
	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		hammer2_inode_t *ip;

		ip = chain->u.ip;
		ip->ip_data.inode_count += ip->delta_icount;
		ip->ip_data.data_count += ip->delta_dcount;
		if (ip->pip) {
			ip->pip->delta_icount += ip->delta_icount;
			ip->pip->delta_dcount += ip->delta_dcount;
		}
		ip->delta_icount = 0;
		ip->delta_dcount = 0;
	}

	/*
	 * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
	 * used by the volume header (&hmp->vchain).
	 */
	if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
		goto done;
	}
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);

	/*
	 * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
	 * will re-test the MOVED bit.  We must also update the mirror_tid
	 * and modify_tid fields as appropriate.
	 *
	 * bits own a single chain ref and the MOVED bit owns its own
	 * chain ref.
	 */
	chain->bref.mirror_tid = info->modify_tid;
	if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
		chain->bref.modify_tid = info->modify_tid;
	wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
	atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
					HAMMER2_CHAIN_MODIFY_TID);

	if (chain->flags & HAMMER2_CHAIN_MOVED) {
		/*
		 * Drop the ref from the MODIFIED bit we cleared.
		 */
		if (wasmodified)
			hammer2_chain_drop(hmp, chain);
	} else {
		/*
		 * If we were MODIFIED we inherit the ref from clearing
		 * that bit, otherwise we need another ref.
		 */
		if (wasmodified == 0)
			hammer2_chain_ref(hmp, chain);
		atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
	}
	chain->bref_flush = chain->bref;

	/*
	 * If this is part of a recursive flush we can go ahead and write
	 * out the buffer cache buffer and pass a new bref back up the chain.
	 *
	 * This will never be a volume header.
	 */
	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		/*
		 * The volume header is flushed manually by the syncer, not
		 * here.
		 */
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Data elements have already been flushed via the logical
		 * file buffer cache.  Their hash was set in the bref by
		 * the vop_write code.
		 *
		 * Make sure the buffer(s) have been flushed out here.
		 */
		bbytes = chain->bytes;
		pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
		boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);

		bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
		if (bp) {
			if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
			    (B_CACHE | B_DIRTY)) {
				kprintf("x");
				cluster_awrite(bp);
			} else {
				bp->b_flags |= B_RELBUF;
				brelse(bp);
			}
		}
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		/*
		 * Indirect blocks may be in an INITIAL state.  Use the
		 * chain_lock() call to ensure that the buffer has been
		 * instantiated (even though it is already locked the buffer
		 * might not have been instantiated).
		 *
		 * Only write the buffer out if it is dirty, it is possible
		 * the operating system had already written out the buffer.
		 */
		hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
		KKASSERT(chain->bp != NULL);

		bp = chain->bp;
		if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
		    (bp->b_flags & B_DIRTY)) {
			bawrite(chain->bp);
		} else {
			brelse(chain->bp);
		}
		chain->bp = NULL;
		chain->data = NULL;
		hammer2_chain_unlock(hmp, chain);
		break;
	default:
		/*
		 * Embedded elements have to be flushed out.
		 *
		 * NOTE(review): KKASSERT compiles out in non-debug
		 * kernels, so the (chain->bp == NULL) test below is not
		 * dead code despite the assertion — confirm intent.
		 */
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);
		bref = &chain->bref;

		KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);

		if (chain->bp == NULL) {
			/*
			 * The data is embedded, we have to acquire the
			 * buffer cache buffer and copy the data into it.
			 */
			if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
				bbytes = HAMMER2_MINIOSIZE;
			pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
			boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);

			/*
			 * The getblk() optimization can only be used if the
			 * physical block size matches the request.
			 */
			if (chain->bytes == bbytes) {
				bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
				error = 0;
			} else {
				error = bread(hmp->devvp, pbase, bbytes, &bp);
				KKASSERT(error == 0);
			}
			bdata = (char *)bp->b_data + boff;

			/*
			 * Copy the data to the buffer, mark the buffer
			 * dirty, and convert the chain to unmodified.
			 *
			 * We expect we might have to make adjustments to
			 * non-data delayed-write buffers when doing an
			 * actual flush so use bawrite() instead of
			 * cluster_awrite() here.
			 */
			bcopy(chain->data, bdata, chain->bytes);
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
			bp = NULL;
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
			if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
				++hammer2_iod_meta_write;
			else
				++hammer2_iod_indr_write;
		} else {
			chain->bref.check.iscsi32.value =
				hammer2_icrc32(chain->data, chain->bytes);
		}
	}

	/*
	 * Adjustments to the bref.  The caller will use this to adjust
	 * our chain's pointer to this chain element.  For the volume
	 * header, recompute the section and whole-header icrcs.
	 */
	bref = &chain->bref;

	switch(bref->type) {
	case HAMMER2_BREF_TYPE_VOLUME:
		KKASSERT(chain->data != NULL);
		KKASSERT(chain->bp == NULL);

		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC1_OFF,
				HAMMER2_VOLUME_ICRC1_SIZE);
		hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRC0_OFF,
				HAMMER2_VOLUME_ICRC0_SIZE);
		hmp->voldata.icrc_volheader =
			hammer2_icrc32(
				(char *)&hmp->voldata +
				 HAMMER2_VOLUME_ICRCVH_OFF,
				HAMMER2_VOLUME_ICRCVH_SIZE);
		break;
	default:
		break;

	}
done:
	if (hammer2_debug & 0x0008) {
		/*
		 * NOTE(review): unlike the opening "{" print this lacks
		 * a terminating \n, so debug output lines may run
		 * together — confirm whether that is intended.
		 */
		kprintf("%*.*s} %p/%d %04x ",
			info->depth, info->depth, "",
			chain, chain->refs, chain->flags);
	}
}

#if 0
/*
 * PASS2 - not yet implemented (should be called only with the root chain?)
 */
static void
hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
{
}
#endif

/*
 * Stand-alone flush.  If the chain is unable to completely flush we have
 * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
 * clear the MOVED bit after flushing in this situation or our desynchronized
 * bref will not properly update in the parent.
 *
 * This routine can be called from several places but the most important
 * is from the hammer2_vop_reclaim() function.  We want to try to completely
 * clean out the inode structure to prevent disconnected inodes from
 * building up and blowing out the kmalloc pool.
 *
 * If modify_tid is 0 (usual case), a new modify_tid is allocated and
 * applied to the flush.  The depth-limit handling code is the only
 * code which passes a non-zero modify_tid to hammer2_chain_flush().
 *
 * @param hmp		mount we are operating on
 * @param chain		chain to flush, locked and referenced by the caller
 * @param modify_tid	0 to allocate a fresh TID, else TID to reuse (only
 *			passed by the deferral/reflush path below)
 */
void
hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
		    hammer2_tid_t modify_tid)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *scan;
	hammer2_blockref_t *base;
	hammer2_flush_info_t info;
	int count;
	int reflush;

	/*
	 * Execute the recursive flush and handle deferrals.
	 *
	 * Chains can be ridiculously long (thousands deep), so to
	 * avoid blowing out the kernel stack the recursive flush has a
	 * depth limit.  Elements at the limit are placed on a list
	 * for re-execution after the stack has been popped.
	 */
	bzero(&info, sizeof(info));
	TAILQ_INIT(&info.flush_list);

	if (modify_tid == 0) {
		hammer2_voldata_lock(hmp);
		info.modify_tid = hmp->voldata.alloc_tid++;
		atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
		hammer2_voldata_unlock(hmp);
	} else {
		info.modify_tid = modify_tid;
	}
	reflush = 1;

	while (reflush) {
		/*
		 * Primary recursion
		 */
		hammer2_chain_flush_pass1(hmp, chain, &info);
		reflush = 0;

		while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
			/*
			 * Secondary recursion.  Note that a reference is
			 * retained from the element's presence on the
			 * deferral list.
			 */
			KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
			TAILQ_REMOVE(&info.flush_list, scan, flush_node);
			atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);

			/*
			 * Now that we've popped back up we can do a secondary
			 * recursion on the deferred elements.
			 */
			if (hammer2_debug & 0x0040)
				kprintf("defered flush %p\n", scan);
			hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
			hammer2_chain_flush(hmp, scan, info.modify_tid);
			hammer2_chain_unlock(hmp, scan);

			/*
			 * Only flag a reflush if SUBMODIFIED is no longer
			 * set.  If SUBMODIFIED is set the element will just
			 * wind up on our flush_list again.
			 */
			if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
					    HAMMER2_CHAIN_MODIFIED |
					    HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
				reflush = 1;
			}
			hammer2_chain_drop(hmp, scan);	/* deferral list ref */
		}
		if ((hammer2_debug & 0x0040) && reflush)
			kprintf("reflush %p\n", chain);
	}

	/*
	 * The SUBMODIFIED bit must propagate upward if the chain could not
	 * be completely flushed.
	 */
	if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			    HAMMER2_CHAIN_MODIFIED |
			    HAMMER2_CHAIN_MODIFIED_AUX |
			    HAMMER2_CHAIN_MOVED)) {
		hammer2_chain_parent_setsubmod(hmp, chain);
	}

	/*
	 * If the only thing left is a simple bref update try to
	 * pro-actively update the parent, otherwise return early.
	 */
	parent = chain->parent;
	if (parent == NULL) {
		return;
	}
	if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
	    (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
			     HAMMER2_CHAIN_MODIFIED |
			     HAMMER2_CHAIN_MODIFIED_AUX |
			     HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
		return;
	}

	/*
	 * We are locking backwards so allow the lock to fail.
	 */
	if (ccms_thread_lock_nonblock(&parent->cst, CCMS_STATE_EXCLUSIVE))
		return;

	/*
	 * We are updating brefs but we have to call chain_modify()
	 * because our caller is not being run from a recursive flush.
	 *
	 * This will also chain up the parent list and set the SUBMODIFIED
	 * flag.
	 *
	 * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
	 * modification is only related to updating a bref in the parent.
	 *
	 * When updating the blockset embedded in the volume header we must
	 * also update voldata.mirror_tid.
	 */
	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
	hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);

	switch(parent->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		base = &parent->data->ipdata.u.blockset.
			blockref[0];
		count = HAMMER2_SET_COUNT;
		break;
	case HAMMER2_BREF_TYPE_INDIRECT:
		base = &parent->data->npdata.blockref[0];
		count = parent->bytes /
			sizeof(hammer2_blockref_t);
		break;
	case HAMMER2_BREF_TYPE_VOLUME:
		base = &hmp->voldata.sroot_blockset.blockref[0];
		count = HAMMER2_SET_COUNT;
		if (chain->flags & HAMMER2_CHAIN_MOVED) {
			if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
				hmp->voldata.mirror_tid =
					chain->bref.mirror_tid;
			}
		}
		break;
	default:
		base = NULL;
		panic("hammer2_chain_flush: "
		      "unrecognized blockref type: %d",
		      parent->bref.type);
	}

	/*
	 * Update the blockref in the parent.  We do not have to set
	 * MOVED in the parent because the parent has been marked modified,
	 * so the flush sequence will pick up the bref change.
	 *
	 * We do have to propagate mirror_tid upward.
	 */
	KKASSERT(chain->index >= 0 &&
		 chain->index < count);
	KKASSERT(chain->parent == parent);
	if (chain->flags & HAMMER2_CHAIN_MOVED) {
		base[chain->index] = chain->bref_flush;
		if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
			parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
		hammer2_chain_drop(hmp, chain);
	} else if (bcmp(&base[chain->index], &chain->bref_flush,
			sizeof(chain->bref)) != 0) {
		panic("hammer2: unflagged bref update(2)");
	}
	ccms_thread_unlock(&parent->cst);	/* release manual op */
	hammer2_chain_unlock(hmp, parent);
}