1 /* 2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@dragonflybsd.org> 6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org> 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 3. Neither the name of The DragonFly Project nor the names of its 19 * contributors may be used to endorse or promote products derived 20 * from this software without specific, prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>

#include "hammer2.h"

/*
 * Freemap allocation iterator.
 *
 * bpref is the preferred (starting) device offset for the search, bnext
 * is the current iteration point, and loops counts wrap-arounds over the
 * volume so the iterator can return ENOSPC after one full pass.
 */
struct hammer2_fiterate {
        hammer2_off_t   bpref;
        hammer2_off_t   bnext;
        int             loops;
};

typedef struct hammer2_fiterate hammer2_fiterate_t;

static int hammer2_freemap_try_alloc(hammer2_trans_t *trans,
                        hammer2_chain_t **parentp, hammer2_blockref_t *bref,
                        int radix, hammer2_fiterate_t *iter);
static void hammer2_freemap_init(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                        hammer2_key_t key, hammer2_chain_t *chain);
static int hammer2_bmap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                        hammer2_bmap_data_t *bmap, uint16_t class,
                        int n, int radix, hammer2_key_t *basep);
static int hammer2_freemap_iterate(hammer2_trans_t *trans,
                        hammer2_chain_t **parentp, hammer2_chain_t **chainp,
                        hammer2_fiterate_t *iter);

/*
 * Map an allocation radix to the freemap radix.  Currently an identity
 * mapping; kept as a function so the mapping can change in one place.
 */
static __inline
int
hammer2_freemapradix(int radix)
{
        return(radix);
}

/*
 * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
 * bref.  Return a combined media offset and physical size radix.  Freemap
 * chains use fixed storage offsets in the 4MB reserved area at the
 * beginning of each 2GB zone
 *
 * Rotate between four possibilities.  Theoretically this means we have three
 * good freemaps in case of a crash which we can use as a base for the fixup
 * scan at mount-time.
 */
#define H2FMBASE(key, radix)    ((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
#define H2FMSHIFT(radix)        ((hammer2_off_t)1 << (radix))

/*
 * Assign a reserved-area device offset to a freemap node or leaf chain,
 * rotating among the HAMMER2_ZONE_FREEMAP_COPIES backup slots so crash
 * recovery always has consistent prior copies to fall back on.
 * Stores the result in chain->bref.data_off and returns 0.
 */
static
int
hammer2_freemap_reserve(hammer2_trans_t *trans, hammer2_chain_t *chain,
                        int radix)
{
        hammer2_blockref_t *bref = &chain->bref;
        hammer2_off_t off;
        int index;
        size_t bytes;

        /*
         * Physical allocation size -> radix.  Typically either 256 for
         * a level 0 freemap leaf or 65536 for a level N freemap node.
         *
         * NOTE: A 256 byte bitmap represents 256 x 8 x 1024 = 2MB of storage.
         *       Do not use hammer2_allocsize() here as it has a min cap.
         */
        bytes = 1 << radix;

        /*
         * Calculate block selection index 0..7 of current block.
         * A zeroed data_off means the chain has never been assigned
         * media storage yet, so start at slot 0.
         */
        if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
                index = 0;
        } else {
                /* offset within the 2GB zone, in PBUF units */
                off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
                      (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
                off = off / HAMMER2_PBUFSIZE;
                KKASSERT(off >= HAMMER2_ZONE_FREEMAP_00 &&
                         off < HAMMER2_ZONE_FREEMAP_END);
                /* each freemap copy occupies 4 PBUFs in the reserved area */
                index = (int)(off - HAMMER2_ZONE_FREEMAP_00) / 4;
                KKASSERT(index >= 0 && index < HAMMER2_ZONE_FREEMAP_COPIES);
        }

        /*
         * Calculate new index (our 'allocation').  We have to be careful
         * here as there can be two different transaction ids running
         * concurrently when a flush is in-progress.
         *
         * We also want to make sure, for algorithmic repeatability, that
         * the index sequences are monotonic with transaction ids.  Some
         * skipping is allowed as long as we ensure that all four volume
         * header backups have consistent freemaps.
         *
         * FLUSH    NORMAL  FLUSH   NORMAL  FLUSH   NORMAL  FLUSH   NORMAL
         * N+=1     N+=2
         * (0->1)   (1->3)  (3->4)  (4->6)  (6->7)  (7->9)  (9->10) (10->12)
         *
         * [-concurrent-][-concurrent-][-concurrent-][-concurrent-]
         *
         * (alternative first NORMAL might be 0->2 if flush had not yet
         * modified the chain, this is the worst case).
         */
        if ((trans->flags & HAMMER2_TRANS_ISFLUSH) == 0) {
                /*
                 * Normal transactions always run with the highest TID.
                 * But if a flush is in-progress we want to reserve a slot
                 * for the flush with a lower TID running concurrently to
                 * do a delete-duplicate.
                 */
                index = (index + 2) % HAMMER2_ZONE_FREEMAP_COPIES;
        } else if (trans->flags & HAMMER2_TRANS_ISALLOCATING) {
                /*
                 * Flush transaction, hammer2_freemap.c itself is doing a
                 * delete-duplicate during an allocation within the freemap.
                 */
                index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
        } else {
                /*
                 * Flush transaction, hammer2_flush.c is doing a
                 * delete-duplicate on the freemap while flushing
                 * hmp->fchain.
                 */
                index = (index + 1) % HAMMER2_ZONE_FREEMAP_COPIES;
        }

        /*
         * Calculate the block offset of the reserved block.  This will
         * point into the 4MB reserved area at the base of the appropriate
         * 2GB zone, once added to the FREEMAP_x selection above.
         *
         * The chain's keybits selects which freemap level it represents;
         * each level has a dedicated PBUF slot within a freemap copy.
         */
        switch(bref->keybits) {
        /* case HAMMER2_FREEMAP_LEVEL5_RADIX: not applicable */
        case HAMMER2_FREEMAP_LEVEL4_RADIX:      /* 2EB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
                off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
                      (index * 4 + HAMMER2_ZONE_FREEMAP_00 +
                       HAMMER2_ZONEFM_LEVEL4) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL3_RADIX:      /* 2PB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
                off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
                      (index * 4 + HAMMER2_ZONE_FREEMAP_00 +
                       HAMMER2_ZONEFM_LEVEL3) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL2_RADIX:      /* 2TB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
                off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
                      (index * 4 + HAMMER2_ZONE_FREEMAP_00 +
                       HAMMER2_ZONEFM_LEVEL2) * HAMMER2_PBUFSIZE;
                break;
        case HAMMER2_FREEMAP_LEVEL1_RADIX:      /* 2GB */
                KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
                KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
                off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
                      (index * 4 + HAMMER2_ZONE_FREEMAP_00 +
                       HAMMER2_ZONEFM_LEVEL1) * HAMMER2_PBUFSIZE;
                break;
        default:
                panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
                /* NOT REACHED */
                off = (hammer2_off_t)-1;
                break;
        }
        bref->data_off = off | radix;
#if 0
        kprintf("-> %016jx\n", bref->data_off);
#endif
        return (0);
}

/*
 * Normal freemap allocator
 *
 * Use available hints to allocate space using the freemap.  Create missing
 * freemap infrastructure on-the-fly as needed (including marking initial
 * allocations using the iterator as allocated, instantiating new 2GB zones,
 * and dealing with the end-of-media edge case).
219 * 220 * ip and bpref are only used as a heuristic to determine locality of 221 * reference. bref->key may also be used heuristically. 222 * 223 * WARNING! When called from a flush we have to use the 'live' sync_tid 224 * and not the flush sync_tid. The live sync_tid is the flush 225 * sync_tid + 1. That is, freemap allocations which occur during 226 * a flush are not part of the flush. Crash-recovery will restore 227 * any lost allocations. 228 */ 229 int 230 hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *chain, 231 size_t bytes) 232 { 233 hammer2_mount_t *hmp = chain->hmp; 234 hammer2_blockref_t *bref = &chain->bref; 235 hammer2_chain_t *parent; 236 int radix; 237 int error; 238 unsigned int hindex; 239 hammer2_fiterate_t iter; 240 241 /* 242 * Validate the allocation size. It must be a power of 2. 243 * 244 * For now require that the caller be aware of the minimum 245 * allocation (1K). 246 */ 247 radix = hammer2_getradix(bytes); 248 KKASSERT((size_t)1 << radix == bytes); 249 250 /* 251 * Freemap blocks themselves are simply assigned from the reserve 252 * area, not allocated from the freemap. 253 */ 254 if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE || 255 bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { 256 return (hammer2_freemap_reserve(trans, chain, radix)); 257 } 258 259 #if 0 260 /* 261 * (this mechanic is no longer used, DOMAYFREE is used only by 262 * the bulk freemap scan now). 263 * 264 * Mark previously allocated block as possibly freeable. There might 265 * be snapshots and other races so we can't just mark it fully free. 266 * (XXX optimize this for the current-transaction create+delete case) 267 */ 268 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) { 269 hammer2_freemap_adjust(trans, hmp, bref, 270 HAMMER2_FREEMAP_DOMAYFREE); 271 } 272 #endif 273 274 /* 275 * Setting ISALLOCATING ensures correct operation even when the 276 * flusher itself is making allocations. 
277 */ 278 KKASSERT(bytes >= HAMMER2_MIN_ALLOC && bytes <= HAMMER2_MAX_ALLOC); 279 KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0); 280 atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING); 281 if (trans->flags & HAMMER2_TRANS_ISFLUSH) 282 ++trans->sync_tid; 283 284 /* 285 * Calculate the starting point for our allocation search. 286 * 287 * Each freemap leaf is dedicated to a specific freemap_radix. 288 * The freemap_radix can be more fine-grained than the device buffer 289 * radix which results in inodes being grouped together in their 290 * own segment, terminal-data (16K or less) and initial indirect 291 * block being grouped together, and then full-indirect and full-data 292 * blocks (64K) being grouped together. 293 * 294 * The single most important aspect of this is the inode grouping 295 * because that is what allows 'find' and 'ls' and other filesystem 296 * topology operations to run fast. 297 */ 298 #if 0 299 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) 300 bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; 301 else if (trans->tmp_bpref) 302 bpref = trans->tmp_bpref; 303 else if (trans->tmp_ip) 304 bpref = trans->tmp_ip->chain->bref.data_off; 305 else 306 #endif 307 /* 308 * Heuristic tracking index. We would like one for each distinct 309 * bref type if possible. heur_freemap[] has room for two classes 310 * for each type. At a minimum we have to break-up our heuristic 311 * by device block sizes. 312 */ 313 hindex = hammer2_devblkradix(radix) - HAMMER2_MINIORADIX; 314 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_NRADIX); 315 hindex += bref->type * HAMMER2_FREEMAP_HEUR_NRADIX; 316 hindex &= HAMMER2_FREEMAP_HEUR_TYPES * HAMMER2_FREEMAP_HEUR_NRADIX - 1; 317 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR); 318 319 iter.bpref = hmp->heur_freemap[hindex]; 320 321 /* 322 * Make sure bpref is in-bounds. It's ok if bpref covers a zone's 323 * reserved area, the try code will iterate past it. 
324 */ 325 if (iter.bpref > hmp->voldata.volu_size) 326 iter.bpref = hmp->voldata.volu_size - 1; 327 328 /* 329 * Iterate the freemap looking for free space before and after. 330 */ 331 parent = &hmp->fchain; 332 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); 333 error = EAGAIN; 334 iter.bnext = iter.bpref; 335 iter.loops = 0; 336 337 while (error == EAGAIN) { 338 error = hammer2_freemap_try_alloc(trans, &parent, bref, 339 radix, &iter); 340 } 341 hmp->heur_freemap[hindex] = iter.bnext; 342 hammer2_chain_unlock(parent); 343 344 atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING); 345 if (trans->flags & HAMMER2_TRANS_ISFLUSH) 346 --trans->sync_tid; 347 348 return (error); 349 } 350 351 static int 352 hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, 353 hammer2_blockref_t *bref, int radix, 354 hammer2_fiterate_t *iter) 355 { 356 hammer2_mount_t *hmp = (*parentp)->hmp; 357 hammer2_off_t l0size; 358 hammer2_off_t l1size; 359 hammer2_off_t l1mask; 360 hammer2_key_t key_dummy; 361 hammer2_chain_t *chain; 362 hammer2_off_t key; 363 size_t bytes; 364 uint16_t class; 365 int error = 0; 366 int cache_index = -1; 367 368 369 /* 370 * Calculate the number of bytes being allocated, the number 371 * of contiguous bits of bitmap being allocated, and the bitmap 372 * mask. 373 * 374 * WARNING! cpu hardware may mask bits == 64 -> 0 and blow up the 375 * mask calculation. 376 */ 377 bytes = (size_t)1 << radix; 378 class = (bref->type << 8) | hammer2_devblkradix(radix); 379 380 /* 381 * Lookup the level1 freemap chain, creating and initializing one 382 * if necessary. Intermediate levels will be created automatically 383 * when necessary by hammer2_chain_create(). 
384 */ 385 key = H2FMBASE(iter->bnext, HAMMER2_FREEMAP_LEVEL1_RADIX); 386 l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 387 l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX); 388 l1mask = l1size - 1; 389 390 chain = hammer2_chain_lookup(parentp, &key_dummy, key, key + l1mask, 391 &cache_index, 392 HAMMER2_LOOKUP_ALWAYS | 393 HAMMER2_LOOKUP_MATCHIND); 394 395 if (chain == NULL) { 396 /* 397 * Create the missing leaf, be sure to initialize 398 * the auxillary freemap tracking information in 399 * the bref.check.freemap structure. 400 */ 401 #if 0 402 kprintf("freemap create L1 @ %016jx bpref %016jx\n", 403 key, iter->bpref); 404 #endif 405 error = hammer2_chain_create(trans, parentp, &chain, 406 key, HAMMER2_FREEMAP_LEVEL1_RADIX, 407 HAMMER2_BREF_TYPE_FREEMAP_LEAF, 408 HAMMER2_FREEMAP_LEVELN_PSIZE); 409 KKASSERT(error == 0); 410 if (error == 0) { 411 hammer2_chain_modify(trans, &chain, 0); 412 bzero(&chain->data->bmdata[0], 413 HAMMER2_FREEMAP_LEVELN_PSIZE); 414 chain->bref.check.freemap.bigmask = (uint32_t)-1; 415 chain->bref.check.freemap.avail = l1size; 416 /* bref.methods should already be inherited */ 417 418 hammer2_freemap_init(trans, hmp, key, chain); 419 } 420 } else if ((chain->bref.check.freemap.bigmask & (1 << radix)) == 0) { 421 /* 422 * Already flagged as not having enough space 423 */ 424 error = ENOSPC; 425 } else { 426 /* 427 * Modify existing chain to setup for adjustment. 428 */ 429 hammer2_chain_modify(trans, &chain, 0); 430 } 431 432 /* 433 * Scan 2MB entries. 
434 */ 435 if (error == 0) { 436 hammer2_bmap_data_t *bmap; 437 hammer2_key_t base_key; 438 int count; 439 int start; 440 int n; 441 442 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF); 443 start = (int)((iter->bnext - key) >> 444 HAMMER2_FREEMAP_LEVEL0_RADIX); 445 KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT); 446 hammer2_chain_modify(trans, &chain, 0); 447 448 error = ENOSPC; 449 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) { 450 if (start + count >= HAMMER2_FREEMAP_COUNT && 451 start - count < 0) { 452 break; 453 } 454 n = start + count; 455 bmap = &chain->data->bmdata[n]; 456 if (n < HAMMER2_FREEMAP_COUNT && bmap->avail && 457 (bmap->class == 0 || bmap->class == class)) { 458 base_key = key + n * l0size; 459 error = hammer2_bmap_alloc(trans, hmp, bmap, 460 class, n, radix, 461 &base_key); 462 if (error != ENOSPC) { 463 key = base_key; 464 break; 465 } 466 } 467 n = start - count; 468 bmap = &chain->data->bmdata[n]; 469 if (n >= 0 && bmap->avail && 470 (bmap->class == 0 || bmap->class == class)) { 471 base_key = key + n * l0size; 472 error = hammer2_bmap_alloc(trans, hmp, bmap, 473 class, n, radix, 474 &base_key); 475 if (error != ENOSPC) { 476 key = base_key; 477 break; 478 } 479 } 480 } 481 if (error == ENOSPC) 482 chain->bref.check.freemap.bigmask &= ~(1 << radix); 483 /* XXX also scan down from original count */ 484 } 485 486 if (error == 0) { 487 /* 488 * Assert validity. Must be beyond the static allocator used 489 * by newfs_hammer2 (and thus also beyond the aux area), 490 * not go past the volume size, and must not be in the 491 * reserved segment area for a zone. 
492 */ 493 KKASSERT(key >= hmp->voldata.allocator_beg && 494 key + bytes <= hmp->voldata.volu_size); 495 KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG); 496 bref->data_off = key | radix; 497 498 #if 0 499 kprintf("alloc cp=%p %016jx %016jx using %016jx\n", 500 chain, 501 bref->key, bref->data_off, chain->bref.data_off); 502 #endif 503 } else if (error == ENOSPC) { 504 /* 505 * Return EAGAIN with next iteration in iter->bnext, or 506 * return ENOSPC if the allocation map has been exhausted. 507 */ 508 error = hammer2_freemap_iterate(trans, parentp, &chain, iter); 509 } 510 511 /* 512 * Cleanup 513 */ 514 if (chain) 515 hammer2_chain_unlock(chain); 516 return (error); 517 } 518 519 /* 520 * Allocate (1<<radix) bytes from the bmap whos base data offset is (*basep). 521 * 522 * If the linear iterator is mid-block we use it directly (the bitmap should 523 * already be marked allocated), otherwise we search for a block in the bitmap 524 * that fits the allocation request. 525 * 526 * A partial bitmap allocation sets the minimum bitmap granularity (16KB) 527 * to fully allocated and adjusts the linear allocator to allow the 528 * remaining space to be allocated. 529 */ 530 static 531 int 532 hammer2_bmap_alloc(hammer2_trans_t *trans, hammer2_mount_t *hmp, 533 hammer2_bmap_data_t *bmap, 534 uint16_t class, int n, int radix, hammer2_key_t *basep) 535 { 536 hammer2_io_t *dio; 537 size_t size; 538 size_t bsize; 539 int bmradix; 540 uint32_t bmmask; 541 int offset; 542 int error; 543 int i; 544 int j; 545 546 /* 547 * Take into account 2-bits per block when calculating bmradix. 
548 */ 549 size = (size_t)1 << radix; 550 551 if (radix <= HAMMER2_FREEMAP_BLOCK_RADIX) { 552 bmradix = 2; 553 bsize = HAMMER2_FREEMAP_BLOCK_SIZE; 554 /* (16K) 2 bits per allocation block */ 555 } else { 556 bmradix = 2 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX); 557 bsize = size; 558 /* (32K-256K) 4, 8, 16, 32 bits per allocation block */ 559 } 560 561 /* 562 * Use the linear iterator to pack small allocations, otherwise 563 * fall-back to finding a free 16KB chunk. The linear iterator 564 * is only valid when *NOT* on a freemap chunking boundary (16KB). 565 * If it is the bitmap must be scanned. It can become invalid 566 * once we pack to the boundary. We adjust it after a bitmap 567 * allocation only for sub-16KB allocations (so the perfectly good 568 * previous value can still be used for fragments when 16KB+ 569 * allocations are made). 570 * 571 * Beware of hardware artifacts when bmradix == 32 (intermediate 572 * result can wind up being '1' instead of '0' if hardware masks 573 * bit-count & 31). 574 * 575 * NOTE: j needs to be even in the j= calculation. As an artifact 576 * of the /2 division, our bitmask has to clear bit 0. 577 * 578 * NOTE: TODO this can leave little unallocatable fragments lying 579 * around. 580 */ 581 if (((uint32_t)bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) + size <= 582 HAMMER2_FREEMAP_BLOCK_SIZE && 583 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) && 584 bmap->linear < HAMMER2_SEGSIZE) { 585 KKASSERT(bmap->linear >= 0 && 586 bmap->linear + size <= HAMMER2_SEGSIZE && 587 (bmap->linear & (HAMMER2_MIN_ALLOC - 1)) == 0); 588 offset = bmap->linear; 589 i = offset / (HAMMER2_SEGSIZE / 8); 590 j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 30; 591 bmmask = (bmradix == 32) ? 592 0xFFFFFFFFU : (1 << bmradix) - 1; 593 bmmask <<= j; 594 bmap->linear = offset + size; 595 } else { 596 for (i = 0; i < 8; ++i) { 597 bmmask = (bmradix == 32) ? 
598 0xFFFFFFFFU : (1 << bmradix) - 1; 599 for (j = 0; j < 32; j += bmradix) { 600 if ((bmap->bitmap[i] & bmmask) == 0) 601 goto success; 602 bmmask <<= bmradix; 603 } 604 } 605 /*fragments might remain*/ 606 /*KKASSERT(bmap->avail == 0);*/ 607 return (ENOSPC); 608 success: 609 offset = i * (HAMMER2_SEGSIZE / 8) + 610 (j * (HAMMER2_FREEMAP_BLOCK_SIZE / 2)); 611 if (size & HAMMER2_FREEMAP_BLOCK_MASK) 612 bmap->linear = offset + size; 613 } 614 615 KKASSERT(i >= 0 && i < 8); /* 8 x 16 -> 128 x 16K -> 2MB */ 616 617 /* 618 * Optimize the buffer cache to avoid unnecessary read-before-write 619 * operations. 620 * 621 * The device block size could be larger than the allocation size 622 * so the actual bitmap test is somewhat more involved. We have 623 * to use a compatible buffer size for this operation. 624 */ 625 if ((bmap->bitmap[i] & bmmask) == 0 && 626 hammer2_devblksize(size) != size) { 627 size_t psize = hammer2_devblksize(size); 628 hammer2_off_t pmask = (hammer2_off_t)psize - 1; 629 int pbmradix = 2 << (hammer2_devblkradix(radix) - 630 HAMMER2_FREEMAP_BLOCK_RADIX); 631 uint32_t pbmmask; 632 int pradix = hammer2_getradix(psize); 633 634 pbmmask = (pbmradix == 32) ? 0xFFFFFFFFU : (1 << pbmradix) - 1; 635 while ((pbmmask & bmmask) == 0) 636 pbmmask <<= pbmradix; 637 638 #if 0 639 kprintf("%016jx mask %08x %08x %08x (%zd/%zd)\n", 640 *basep + offset, bmap->bitmap[i], 641 pbmmask, bmmask, size, psize); 642 #endif 643 644 if ((bmap->bitmap[i] & pbmmask) == 0) { 645 error = hammer2_io_newq(hmp, 646 (*basep + (offset & ~pmask)) | 647 pradix, 648 psize, &dio); 649 hammer2_io_bqrelse(&dio); 650 } 651 } 652 653 #if 0 654 /* 655 * When initializing a new inode segment also attempt to initialize 656 * an adjacent segment. Be careful not to index beyond the array 657 * bounds. 658 * 659 * We do this to try to localize inode accesses to improve 660 * directory scan rates. XXX doesn't improve scan rates. 
661 */ 662 if (size == HAMMER2_INODE_BYTES) { 663 if (n & 1) { 664 if (bmap[-1].radix == 0 && bmap[-1].avail) 665 bmap[-1].radix = radix; 666 } else { 667 if (bmap[1].radix == 0 && bmap[1].avail) 668 bmap[1].radix = radix; 669 } 670 } 671 #endif 672 673 /* 674 * Adjust the linear iterator, set the radix if necessary (might as 675 * well just set it unconditionally), adjust *basep to return the 676 * allocated data offset. 677 */ 678 bmap->bitmap[i] |= bmmask; 679 bmap->class = class; 680 bmap->avail -= size; 681 *basep += offset; 682 683 hammer2_voldata_lock(hmp); 684 hmp->voldata.allocator_free -= size; /* XXX */ 685 hammer2_voldata_unlock(hmp, 1); 686 687 return(0); 688 } 689 690 static 691 void 692 hammer2_freemap_init(hammer2_trans_t *trans, hammer2_mount_t *hmp, 693 hammer2_key_t key, hammer2_chain_t *chain) 694 { 695 hammer2_off_t l1size; 696 hammer2_off_t lokey; 697 hammer2_off_t hikey; 698 hammer2_bmap_data_t *bmap; 699 int count; 700 701 l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX); 702 703 /* 704 * Calculate the portion of the 2GB map that should be initialized 705 * as free. Portions below or after will be initialized as allocated. 706 * SEGMASK-align the areas so we don't have to worry about sub-scans 707 * or endianess when using memset. 708 * 709 * (1) Ensure that all statically allocated space from newfs_hammer2 710 * is marked allocated. 711 * 712 * (2) Ensure that the reserved area is marked allocated (typically 713 * the first 4MB of the 2GB area being represented). 714 * 715 * (3) Ensure that any trailing space at the end-of-volume is marked 716 * allocated. 717 * 718 * WARNING! It is possible for lokey to be larger than hikey if the 719 * entire 2GB segment is within the static allocation. 
720 */ 721 lokey = (hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) & 722 ~HAMMER2_SEGMASK64; 723 724 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) + 725 HAMMER2_ZONE_SEG64) { 726 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) + 727 HAMMER2_ZONE_SEG64; 728 } 729 730 hikey = key + H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX); 731 if (hikey > hmp->voldata.volu_size) { 732 hikey = hmp->voldata.volu_size & ~HAMMER2_SEGMASK64; 733 } 734 735 chain->bref.check.freemap.avail = 736 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX); 737 bmap = &chain->data->bmdata[0]; 738 739 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) { 740 if (key < lokey || key >= hikey) { 741 memset(bmap->bitmap, -1, 742 sizeof(bmap->bitmap)); 743 bmap->avail = 0; 744 bmap->linear = HAMMER2_SEGSIZE; 745 chain->bref.check.freemap.avail -= 746 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 747 } else { 748 bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 749 } 750 key += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX); 751 ++bmap; 752 } 753 } 754 755 /* 756 * The current Level 1 freemap has been exhausted, iterate to the next 757 * one, return ENOSPC if no freemaps remain. 758 * 759 * XXX this should rotate back to the beginning to handle freed-up space 760 * XXX or use intermediate entries to locate free space. TODO 761 */ 762 static int 763 hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp, 764 hammer2_chain_t **chainp, hammer2_fiterate_t *iter) 765 { 766 hammer2_mount_t *hmp = (*parentp)->hmp; 767 768 iter->bnext &= ~(H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX) - 1); 769 iter->bnext += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX); 770 if (iter->bnext >= hmp->voldata.volu_size) { 771 iter->bnext = 0; 772 if (++iter->loops == 2) 773 return (ENOSPC); 774 } 775 return(EAGAIN); 776 } 777 778 /* 779 * Adjust the bit-pattern for data in the freemap bitmap according to 780 * (how). 
This code is called from on-mount recovery to fixup (mark
 * as allocated) blocks whose freemap updates might not have been committed
 * in the last crash and is used by the bulk freemap scan to stage frees.
 *
 * XXX currently disabled when how == 0 (the normal real-time case).  At
 * the moment we depend on the bulk freescan to actually free blocks.  It
 * will still call this routine with a non-zero how to stage possible frees
 * and to do the actual free.
 *
 * WARNING! When called from a flush we have to use the 'live' sync_tid
 *          and not the flush sync_tid.  The live sync_tid is the flush
 *          sync_tid + 1.  That is, freemap allocations which occur during
 *          a flush are not part of the flush.  Crash-recovery will restore
 *          any lost allocations.
 */
void
hammer2_freemap_adjust(hammer2_trans_t *trans, hammer2_mount_t *hmp,
                       hammer2_blockref_t *bref, int how)
{
        hammer2_off_t data_off = bref->data_off;
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;
        hammer2_bmap_data_t *bmap;
        hammer2_key_t key;
        hammer2_key_t key_dummy;
        hammer2_off_t l0size;
        hammer2_off_t l1size;
        hammer2_off_t l1mask;
        uint32_t *bitmap;
        const uint32_t bmmask00 = 0;    /* 2-bit state: fully free */
        uint32_t bmmask01;              /* 2-bit state: (unused/illegal) */
        uint32_t bmmask10;              /* 2-bit state: possibly free */
        uint32_t bmmask11;              /* 2-bit state: fully allocated */
        size_t bytes;
        uint16_t class;
        int radix;
        int start;
        int count;
        int modified = 0;
        int cache_index = -1;
        int error;

        /* split data_off into media offset and size radix */
        radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
        data_off &= ~HAMMER2_OFF_MASK_RADIX;
        KKASSERT(radix <= HAMMER2_MAX_RADIX);

        bytes = (size_t)1 << radix;
        class = (bref->type << 8) | hammer2_devblkradix(radix);

        /*
         * We can't adjust the freemap for data allocations made by
         * newfs_hammer2.
         */
        if (data_off < hmp->voldata.allocator_beg)
                return;

        KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
        KKASSERT((trans->flags & HAMMER2_TRANS_ISALLOCATING) == 0);
        atomic_set_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH)
                ++trans->sync_tid;

        /*
         * Lookup the level1 freemap chain.  The chain must exist.
         */
        key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL1_RADIX);
        l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
        l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
        l1mask = l1size - 1;

        parent = &hmp->fchain;
        hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);

        chain = hammer2_chain_lookup(&parent, &key_dummy, key, key + l1mask,
                                     &cache_index,
                                     HAMMER2_LOOKUP_ALWAYS |
                                     HAMMER2_LOOKUP_MATCHIND);

        /*
         * Stop early if we are trying to free something but no leaf exists.
         */
        if (chain == NULL && how != HAMMER2_FREEMAP_DORECOVER) {
                kprintf("hammer2_freemap_adjust: %016jx: no chain\n",
                        (intmax_t)bref->data_off);
                goto done;
        }

        /*
         * Create any missing leaf(s) if we are doing a recovery (marking
         * the block(s) as being allocated instead of being freed).  Be sure
         * to initialize the auxiliary freemap tracking info in the
         * bref.check.freemap structure.
         */
        if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
                error = hammer2_chain_create(trans, &parent, &chain,
                                     key, HAMMER2_FREEMAP_LEVEL1_RADIX,
                                     HAMMER2_BREF_TYPE_FREEMAP_LEAF,
                                     HAMMER2_FREEMAP_LEVELN_PSIZE);

                if (hammer2_debug & 0x0040) {
                        kprintf("fixup create chain %p %016jx:%d\n",
                                chain, chain->bref.key, chain->bref.keybits);
                }

                if (error == 0) {
                        hammer2_chain_modify(trans, &chain, 0);
                        bzero(&chain->data->bmdata[0],
                              HAMMER2_FREEMAP_LEVELN_PSIZE);
                        chain->bref.check.freemap.bigmask = (uint32_t)-1;
                        chain->bref.check.freemap.avail = l1size;
                        /* bref.methods should already be inherited */

                        hammer2_freemap_init(trans, hmp, key, chain);
                }
                /* XXX handle error */
        }

        /*
         * Calculate the bitmask (runs in 2-bit pairs).
         */
        start = ((int)(data_off >> HAMMER2_FREEMAP_BLOCK_RADIX) & 15) * 2;
        bmmask01 = 1 << start;
        bmmask10 = 2 << start;
        bmmask11 = 3 << start;

        /*
         * Fixup the bitmap.  Partial blocks cannot be fully freed unless
         * a bulk scan is able to roll them up.
         */
        if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
                count = 1;
                /* sub-16KB frees can only be staged, never fully freed */
                if (how == HAMMER2_FREEMAP_DOREALFREE)
                        how = HAMMER2_FREEMAP_DOMAYFREE;
        } else {
                count = 1 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
        }

        /*
         * [re]load the bmap and bitmap pointers.  Each bmap entry covers
         * a 2MB swath.  The bmap itself (LEVEL1) covers 2GB.
         *
         * The pointers are reloaded at the 'again' label because
         * hammer2_chain_modify() can COW the chain, relocating the data.
         */
again:
        bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) &
                                    (HAMMER2_FREEMAP_COUNT - 1)];
        bitmap = &bmap->bitmap[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7];

        while (count) {
                KKASSERT(bmmask11);
                if (how == HAMMER2_FREEMAP_DORECOVER) {
                        /*
                         * Recovery request, mark as allocated.
                         */
                        if ((*bitmap & bmmask11) != bmmask11) {
                                if (modified == 0) {
                                        hammer2_chain_modify(trans, &chain, 0);
                                        modified = 1;
                                        goto again;
                                }
                                if ((*bitmap & bmmask11) == bmmask00)
                                        bmap->avail -= 1 << radix;
                                if (bmap->class == 0)
                                        bmap->class = class;
                                *bitmap |= bmmask11;
                                if (hammer2_debug & 0x0040) {
                                        kprintf("hammer2_freemap_recover: "
                                                "fixup type=%02x "
                                                "block=%016jx/%zd\n",
                                                bref->type, data_off, bytes);
                                }
                        } else {
                                /*
                                kprintf("hammer2_freemap_recover:  good "
                                        "type=%02x block=%016jx/%zd\n",
                                        bref->type, data_off, bytes);
                                */
                        }
                } else if ((*bitmap & bmmask11) == bmmask11) {
                        /*
                         * Mayfree/Realfree request and bitmap is currently
                         * marked as being fully allocated.
                         */
                        if (!modified) {
                                hammer2_chain_modify(trans, &chain, 0);
                                modified = 1;
                                goto again;
                        }
                        if (how == HAMMER2_FREEMAP_DOREALFREE)
                                *bitmap &= ~bmmask11;
                        else
                                *bitmap = (*bitmap & ~bmmask11) | bmmask10;
                } else if ((*bitmap & bmmask11) == bmmask10) {
                        /*
                         * Mayfree/Realfree request and bitmap is currently
                         * marked as being possibly freeable.
                         */
                        if (how == HAMMER2_FREEMAP_DOREALFREE) {
                                if (!modified) {
                                        hammer2_chain_modify(trans, &chain, 0);
                                        modified = 1;
                                        goto again;
                                }
                                *bitmap &= ~bmmask11;
                        }
                } else {
                        /*
                         * 01 - Not implemented, currently illegal state
                         * 00 - Not allocated at all, illegal free.
                         */
                        panic("hammer2_freemap_adjust: "
                              "Illegal state %08x(%08x)",
                              *bitmap, *bitmap & bmmask11);
                }
                --count;
                bmmask01 <<= 2;
                bmmask10 <<= 2;
                bmmask11 <<= 2;
        }
        if (how == HAMMER2_FREEMAP_DOREALFREE && modified) {
                bmap->avail += 1 << radix;
                KKASSERT(bmap->avail <= HAMMER2_SEGSIZE);
                /* if the whole 2MB segment is now free, clear its class */
                if (bmap->avail == HAMMER2_SEGSIZE &&
                    bmap->bitmap[0] == 0 &&
                    bmap->bitmap[1] == 0 &&
                    bmap->bitmap[2] == 0 &&
                    bmap->bitmap[3] == 0 &&
                    bmap->bitmap[4] == 0 &&
                    bmap->bitmap[5] == 0 &&
                    bmap->bitmap[6] == 0 &&
                    bmap->bitmap[7] == 0) {
                        key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL0_RADIX);
                        kprintf("Freeseg %016jx\n", (intmax_t)key);
                        bmap->class = 0;
                }
        }

        /*
         * chain->bref.check.freemap.bigmask (XXX)
         *
         * Setting bigmask is a hint to the allocation code that there might
         * be something allocatable.  We also set this in recovery... it
         * doesn't hurt and we might want to use the hint for other validation
         * operations later on.
         */
        if (modified)
                chain->bref.check.freemap.bigmask |= 1 << radix;

        hammer2_chain_unlock(chain);
done:
        hammer2_chain_unlock(parent);
        atomic_clear_int(&trans->flags, HAMMER2_TRANS_ISALLOCATING);
        if (trans->flags & HAMMER2_TRANS_ISFLUSH)
                --trans->sync_tid;
}