1 /* Copyright (c) Mark Harmstone 2016-17 2 * 3 * This file is part of WinBtrfs. 4 * 5 * WinBtrfs is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public Licence as published by 7 * the Free Software Foundation, either version 3 of the Licence, or 8 * (at your option) any later version. 9 * 10 * WinBtrfs is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public Licence for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public Licence 16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include "btrfs_drv.h" 19 #include "btrfsioctl.h" 20 #include "crc32c.h" 21 #include <ntddstor.h> 22 23 typedef struct { 24 uint64_t address; 25 uint64_t new_address; 26 tree_header* data; 27 EXTENT_ITEM* ei; 28 tree* t; 29 bool system; 30 LIST_ENTRY refs; 31 LIST_ENTRY list_entry; 32 } metadata_reloc; 33 34 typedef struct { 35 uint8_t type; 36 uint64_t hash; 37 38 union { 39 TREE_BLOCK_REF tbr; 40 SHARED_BLOCK_REF sbr; 41 }; 42 43 metadata_reloc* parent; 44 bool top; 45 LIST_ENTRY list_entry; 46 } metadata_reloc_ref; 47 48 typedef struct { 49 uint64_t address; 50 uint64_t size; 51 uint64_t new_address; 52 chunk* newchunk; 53 EXTENT_ITEM* ei; 54 LIST_ENTRY refs; 55 LIST_ENTRY list_entry; 56 } data_reloc; 57 58 typedef struct { 59 uint8_t type; 60 uint64_t hash; 61 62 union { 63 EXTENT_DATA_REF edr; 64 SHARED_DATA_REF sdr; 65 }; 66 67 metadata_reloc* parent; 68 LIST_ENTRY list_entry; 69 } data_reloc_ref; 70 71 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time 72 73 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp, 74 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) { 75 NTSTATUS Status; 76 metadata_reloc* mr; 77 EXTENT_ITEM* ei; 78 uint16_t len; 79 uint64_t inline_rc; 80 uint8_t* ptr; 81 82 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG); 83 if (!mr) { 84 ERR("out of memory\n"); 85 return STATUS_INSUFFICIENT_RESOURCES; 86 } 87 88 mr->address = tp->item->key.obj_id; 89 mr->data = NULL; 90 mr->ei = (EXTENT_ITEM*)tp->item->data; 91 mr->system = false; 92 InitializeListHead(&mr->refs); 93 94 Status = delete_tree_item(Vcb, tp); 95 if (!NT_SUCCESS(Status)) { 96 ERR("delete_tree_item returned %08lx\n", Status); 97 ExFreePool(mr); 98 return Status; 99 } 100 101 if (!c) 102 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 103 104 if (c) { 105 acquire_chunk_lock(c, Vcb); 106 107 c->used -= Vcb->superblock.node_size; 108 109 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback); 110 111 release_chunk_lock(c, Vcb); 112 } 113 114 ei = (EXTENT_ITEM*)tp->item->data; 115 inline_rc = 0; 116 117 len = tp->item->size - sizeof(EXTENT_ITEM); 118 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 119 if (!skinny) { 120 len -= sizeof(EXTENT_ITEM2); 121 ptr += sizeof(EXTENT_ITEM2); 122 } 123 124 while (len > 0) { 125 uint8_t secttype = *ptr; 126 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0); 127 metadata_reloc_ref* ref; 128 129 len--; 130 131 if (sectlen > len) { 132 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 133 return STATUS_INTERNAL_ERROR; 134 } 135 136 if (sectlen == 0) { 137 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 138 return STATUS_INTERNAL_ERROR; 139 } 140 141 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 142 if (!ref) { 143 ERR("out of memory\n"); 144 return STATUS_INSUFFICIENT_RESOURCES; 145 } 146 147 if (secttype == TYPE_TREE_BLOCK_REF) { 148 ref->type = TYPE_TREE_BLOCK_REF; 149 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF)); 150 inline_rc++; 151 } else if (secttype == TYPE_SHARED_BLOCK_REF) { 152 ref->type = TYPE_SHARED_BLOCK_REF; 153 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF)); 154 inline_rc++; 155 } else { 156 ERR("unexpected tree type %x\n", secttype); 157 ExFreePool(ref); 158 return STATUS_INTERNAL_ERROR; 159 } 160 161 ref->parent = NULL; 162 ref->top = false; 163 InsertTailList(&mr->refs, &ref->list_entry); 164 165 len -= sectlen; 166 ptr += sizeof(uint8_t) + sectlen; 167 } 168 169 if (inline_rc < ei->refcount) { // look for non-inline entries 170 traverse_ptr tp2 = *tp, next_tp; 171 172 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 173 tp2 = next_tp; 174 175 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 176 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) { 177 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 178 if (!ref) { 179 ERR("out of memory\n"); 180 return STATUS_INSUFFICIENT_RESOURCES; 181 } 182 183 ref->type = TYPE_TREE_BLOCK_REF; 184 ref->tbr.offset = tp2.item->key.offset; 185 ref->parent = NULL; 186 ref->top = false; 187 InsertTailList(&mr->refs, &ref->list_entry); 188 189 Status = delete_tree_item(Vcb, &tp2); 190 if (!NT_SUCCESS(Status)) { 191 ERR("delete_tree_item returned %08lx\n", Status); 192 return Status; 193 } 194 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) { 195 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 196 if (!ref) { 197 ERR("out of memory\n"); 198 return STATUS_INSUFFICIENT_RESOURCES; 199 } 200 201 ref->type = TYPE_SHARED_BLOCK_REF; 202 ref->sbr.offset = tp2.item->key.offset; 203 ref->parent = NULL; 204 ref->top = false; 205 InsertTailList(&mr->refs, &ref->list_entry); 206 207 Status = delete_tree_item(Vcb, &tp2); 208 if (!NT_SUCCESS(Status)) { 209 ERR("delete_tree_item returned %08lx\n", Status); 210 return Status; 211 } 212 } 213 } else 214 break; 215 } 216 } 217 218 InsertTailList(items, &mr->list_entry); 219 220 if (mr2) 221 *mr2 = mr; 222 223 return STATUS_SUCCESS; 224 } 225 226 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 227 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) { 228 LIST_ENTRY* le; 229 KEY searchkey; 230 traverse_ptr tp; 231 bool skinny = false; 232 NTSTATUS Status; 233 234 le = items->Flink; 235 while (le != items) { 236 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 237 238 if (mr->address == address) { 239 *mr2 = mr; 240 return STATUS_SUCCESS; 241 } 242 243 le = le->Flink; 244 } 245 246 searchkey.obj_id = address; 247 searchkey.obj_type = TYPE_METADATA_ITEM; 248 searchkey.offset = 0xffffffffffffffff; 249 250 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 251 if (!NT_SUCCESS(Status)) { 252 ERR("find_item returned %08lx\n", Status); 253 return Status; 254 } 255 256 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) 257 skinny = true; 258 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 259 tp.item->size >= sizeof(EXTENT_ITEM)) { 260 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 261 262 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) { 263 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address); 264 return STATUS_INTERNAL_ERROR; 265 } 266 } else { 267 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address); 268 return STATUS_INTERNAL_ERROR; 269 } 270 271 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback); 272 if (!NT_SUCCESS(Status)) { 273 ERR("add_metadata_reloc returned %08lx\n", Status); 274 return Status; 275 } 276 277 return STATUS_SUCCESS; 278 } 279 280 static void sort_metadata_reloc_refs(metadata_reloc* mr) { 281 LIST_ENTRY newlist, *le; 282 283 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items 284 return; 285 286 // insertion sort 287 288 InitializeListHead(&newlist); 289 290 while (!IsListEmpty(&mr->refs)) { 291 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 292 bool inserted = false; 293 294 if (ref->type == TYPE_TREE_BLOCK_REF) 295 ref->hash = ref->tbr.offset; 296 else if (ref->type == TYPE_SHARED_BLOCK_REF) 297 ref->hash = ref->parent->new_address; 298 299 le = newlist.Flink; 300 while (le != &newlist) { 301 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 302 303 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 304 InsertHeadList(le->Blink, &ref->list_entry); 305 inserted = true; 306 break; 307 } 308 309 le = le->Flink; 310 } 311 312 if (!inserted) 313 InsertTailList(&newlist, &ref->list_entry); 314 } 315 316 newlist.Flink->Blink = &mr->refs; 317 newlist.Blink->Flink = &mr->refs; 318 mr->refs.Flink = newlist.Flink; 319 mr->refs.Blink = newlist.Blink; 320 } 321 322 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) { 323 NTSTATUS Status; 324 LIST_ENTRY* le; 325 uint64_t rc = 0; 326 uint16_t inline_len; 327 bool all_inline = true; 328 metadata_reloc_ref* first_noninline = NULL; 329 EXTENT_ITEM* ei; 330 uint8_t* ptr; 331 332 inline_len = sizeof(EXTENT_ITEM); 333 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) 334 inline_len += sizeof(EXTENT_ITEM2); 335 336 sort_metadata_reloc_refs(mr); 337 338 le = mr->refs.Flink; 339 while (le != &mr->refs) { 340 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 341 uint16_t extlen = 0; 342 343 rc++; 344 345 if (ref->type == TYPE_TREE_BLOCK_REF) 346 extlen += sizeof(TREE_BLOCK_REF); 347 else if (ref->type == TYPE_SHARED_BLOCK_REF) 348 extlen += sizeof(SHARED_BLOCK_REF); 349 350 if (all_inline) { 351 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 352 all_inline = false; 353 first_noninline = ref; 354 } else 355 inline_len += extlen + 1; 356 } 357 358 le = le->Flink; 359 } 360 361 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 362 if (!ei) { 363 ERR("out of memory\n"); 364 return STATUS_INSUFFICIENT_RESOURCES; 365 } 366 367 ei->refcount = rc; 368 ei->generation = mr->ei->generation; 369 ei->flags = mr->ei->flags; 370 ptr = (uint8_t*)&ei[1]; 371 372 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) { 373 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr; 374 375 ei2->firstitem = *(KEY*)&mr->data[1]; 376 ei2->level = mr->data->level; 377 378 ptr += sizeof(EXTENT_ITEM2); 379 } 380 381 le = mr->refs.Flink; 382 while (le != &mr->refs) { 383 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 384 385 if (ref == first_noninline) 386 break; 387 388 *ptr = ref->type; 389 ptr++; 390 391 if (ref->type == TYPE_TREE_BLOCK_REF) { 392 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr; 393 394 tbr->offset = ref->tbr.offset; 395 396 ptr += sizeof(TREE_BLOCK_REF); 397 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 398 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr; 399 400 sbr->offset = ref->parent->new_address; 401 402 ptr += sizeof(SHARED_BLOCK_REF); 403 } 404 405 le = le->Flink; 406 } 407 408 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA) 409 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL); 410 else 411 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL); 412 413 if (!NT_SUCCESS(Status)) { 414 ERR("insert_tree_item returned %08lx\n", Status); 415 ExFreePool(ei); 416 return Status; 417 } 418 419 if (!all_inline) { 420 le = &first_noninline->list_entry; 421 422 while (le != &mr->refs) { 423 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 424 425 if (ref->type == TYPE_TREE_BLOCK_REF) { 426 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL); 427 if (!NT_SUCCESS(Status)) { 428 ERR("insert_tree_item returned %08lx\n", Status); 429 return Status; 430 } 431 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 432 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL); 433 if (!NT_SUCCESS(Status)) { 434 ERR("insert_tree_item returned %08lx\n", Status); 435 return Status; 436 } 437 } 438 439 le = le->Flink; 440 } 441 } 442 443 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) { 444 if (mr->data->level > 0) { 445 uint16_t i; 446 internal_node* in = (internal_node*)&mr->data[1]; 447 448 for (i = 0; i < mr->data->num_items; i++) { 449 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL); 450 451 if (sbrrc > 0) { 452 SHARED_BLOCK_REF sbr; 453 454 sbr.offset = mr->new_address; 455 456 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL); 457 if (!NT_SUCCESS(Status)) { 458 ERR("increase_extent_refcount returned %08lx\n", Status); 459 return Status; 460 } 461 462 sbr.offset = mr->address; 463 464 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, 465 sbr.offset, false, NULL); 466 if (!NT_SUCCESS(Status)) { 467 ERR("decrease_extent_refcount returned %08lx\n", Status); 468 return Status; 469 } 470 } 471 } 472 } else { 473 uint16_t i; 474 leaf_node* ln = (leaf_node*)&mr->data[1]; 475 476 for (i = 0; i < mr->data->num_items; i++) { 477 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 478 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 479 480 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 481 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 482 483 if (ed2->size > 0) { // not sparse 484 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL); 485 486 if (sdrrc > 0) { 487 SHARED_DATA_REF sdr; 488 chunk* c; 489 490 sdr.offset = mr->new_address; 491 sdr.count = sdrrc; 492 493 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL); 494 if (!NT_SUCCESS(Status)) { 495 ERR("increase_extent_refcount returned %08lx\n", Status); 496 return Status; 497 } 498 499 sdr.offset = mr->address; 500 501 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, 502 sdr.offset, false, NULL); 503 if (!NT_SUCCESS(Status)) { 504 ERR("decrease_extent_refcount returned %08lx\n", Status); 505 return Status; 506 } 507 508 c = get_chunk_from_address(Vcb, ed2->address); 509 510 if (c) { 511 // check changed_extents 512 513 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true); 514 515 le = c->changed_extents.Flink; 516 517 while (le != &c->changed_extents) { 518 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 519 520 if (ce->address == ed2->address) { 521 LIST_ENTRY* le2; 522 523 le2 = ce->refs.Flink; 524 while (le2 != &ce->refs) { 525 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 526 527 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 528 cer->sdr.offset = mr->new_address; 529 break; 530 } 531 532 le2 = le2->Flink; 533 } 534 535 le2 = ce->old_refs.Flink; 536 while (le2 != &ce->old_refs) { 537 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 538 539 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 540 cer->sdr.offset = mr->new_address; 541 break; 542 } 543 544 le2 = le2->Flink; 545 } 546 547 break; 548 } 549 550 le = le->Flink; 551 } 552 553 ExReleaseResourceLite(&c->changed_extents_lock); 554 } 555 } 556 } 557 } 558 } 559 } 560 } 561 } 562 563 return STATUS_SUCCESS; 564 } 565 566 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 567 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) { 568 LIST_ENTRY tree_writes, *le; 569 NTSTATUS Status; 570 traverse_ptr tp; 571 uint8_t level, max_level = 0; 572 chunk* newchunk = NULL; 573 574 InitializeListHead(&tree_writes); 575 576 le = items->Flink; 577 while (le != items) { 578 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 579 LIST_ENTRY* le2; 580 chunk* pc; 581 582 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG); 583 if (!mr->data) { 584 ERR("out of memory\n"); 585 return STATUS_INSUFFICIENT_RESOURCES; 586 } 587 588 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data, 589 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority); 590 if (!NT_SUCCESS(Status)) { 591 ERR("read_data returned %08lx\n", Status); 592 return Status; 593 } 594 595 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM) 596 mr->system = true; 597 598 if (data_items && mr->data->level == 0) { 599 le2 = data_items->Flink; 600 while (le2 != data_items) { 601 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 602 leaf_node* ln = (leaf_node*)&mr->data[1]; 603 uint16_t i; 604 605 for (i = 0; i < mr->data->num_items; i++) { 606 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 607 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 608 609 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 610 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 611 612 if (ed2->address == dr->address) 613 ed2->address = dr->new_address; 614 } 615 } 616 } 617 618 le2 = le2->Flink; 619 } 620 } 621 622 if (mr->data->level > max_level) 623 max_level = mr->data->level; 624 625 le2 = mr->refs.Flink; 626 while (le2 != &mr->refs) { 627 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 628 629 if (ref->type == TYPE_TREE_BLOCK_REF) { 630 KEY* firstitem; 631 root* r = NULL; 632 LIST_ENTRY* le3; 633 tree* t; 634 635 firstitem = (KEY*)&mr->data[1]; 636 637 le3 = Vcb->roots.Flink; 638 while (le3 != &Vcb->roots) { 639 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 640 641 if (r2->id == ref->tbr.offset) { 642 r = r2; 643 break; 644 } 645 646 le3 = le3->Flink; 647 } 648 649 if (!r) { 650 ERR("could not find subvol with id %I64x\n", ref->tbr.offset); 651 return STATUS_INTERNAL_ERROR; 652 } 653 654 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL); 655 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 656 ERR("find_item_to_level returned %08lx\n", Status); 657 return Status; 658 } 659 660 t = tp.tree; 661 while (t && t->header.level < mr->data->level + 1) { 662 t = t->parent; 663 } 664 665 if (!t) 666 ref->top = true; 667 else { 668 metadata_reloc* mr2; 669 670 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback); 671 if (!NT_SUCCESS(Status)) { 672 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 673 return Status; 674 } 675 676 ref->parent = mr2; 677 } 678 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 679 metadata_reloc* mr2; 680 681 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback); 682 if (!NT_SUCCESS(Status)) { 683 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 684 return Status; 685 } 686 687 ref->parent = mr2; 688 } 689 690 le2 = le2->Flink; 691 } 692 693 le = le->Flink; 694 } 695 696 le = items->Flink; 697 while (le != items) { 698 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 699 LIST_ENTRY* le2; 700 uint32_t hash; 701 702 mr->t = NULL; 703 704 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t)); 705 706 le2 = Vcb->trees_ptrs[hash >> 24]; 707 708 if (le2) { 709 while (le2 != &Vcb->trees_hash) { 710 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash); 711 712 if (t->header.address == mr->address) { 713 mr->t = t; 714 break; 715 } else if (t->hash > hash) 716 break; 717 718 le2 = le2->Flink; 719 } 720 } 721 722 le = le->Flink; 723 } 724 725 for (level = 0; level <= max_level; level++) { 726 le = items->Flink; 727 while (le != items) { 728 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 729 730 if (mr->data->level == level) { 731 bool done = false; 732 LIST_ENTRY* le2; 733 tree_write* tw; 734 uint64_t flags; 735 tree* t3; 736 737 if (mr->system) 738 flags = Vcb->system_flags; 739 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) 740 flags = Vcb->data_flags; 741 else 742 flags = Vcb->metadata_flags; 743 744 if (newchunk) { 745 acquire_chunk_lock(newchunk, Vcb); 746 747 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 748 newchunk->used += Vcb->superblock.node_size; 749 space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback); 750 done = true; 751 } 752 753 release_chunk_lock(newchunk, Vcb); 754 } 755 756 if (!done) { 757 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 758 759 le2 = Vcb->chunks.Flink; 760 while (le2 != &Vcb->chunks) { 761 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 762 763 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) { 764 acquire_chunk_lock(c2, Vcb); 765 766 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) { 767 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) { 768 c2->used += Vcb->superblock.node_size; 769 space_list_subtract(c2, mr->new_address, Vcb->superblock.node_size, rollback); 770 release_chunk_lock(c2, Vcb); 771 newchunk = c2; 772 done = true; 773 break; 774 } 775 } 776 777 release_chunk_lock(c2, Vcb); 778 } 779 780 le2 = le2->Flink; 781 } 782 783 // allocate new chunk if necessary 784 if (!done) { 785 Status = alloc_chunk(Vcb, flags, &newchunk, false); 786 787 if (!NT_SUCCESS(Status)) { 788 ERR("alloc_chunk returned %08lx\n", Status); 789 ExReleaseResourceLite(&Vcb->chunk_lock); 790 goto end; 791 } 792 793 acquire_chunk_lock(newchunk, Vcb); 794 795 newchunk->balance_num = Vcb->balance.balance_num; 796 797 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 798 release_chunk_lock(newchunk, Vcb); 799 ExReleaseResourceLite(&Vcb->chunk_lock); 800 ERR("could not find address in new chunk\n"); 801 Status = STATUS_DISK_FULL; 802 goto end; 803 } else { 804 newchunk->used += Vcb->superblock.node_size; 805 space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback); 806 } 807 808 release_chunk_lock(newchunk, Vcb); 809 } 810 811 ExReleaseResourceLite(&Vcb->chunk_lock); 812 } 813 814 // update parents 815 le2 = mr->refs.Flink; 816 while (le2 != &mr->refs) { 817 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 818 819 if (ref->parent) { 820 uint16_t i; 821 internal_node* in = (internal_node*)&ref->parent->data[1]; 822 823 for (i = 0; i < ref->parent->data->num_items; i++) { 824 if (in[i].address == mr->address) { 825 in[i].address = mr->new_address; 826 break; 827 } 828 } 829 830 if (ref->parent->t) { 831 LIST_ENTRY* le3; 832 833 le3 = ref->parent->t->itemlist.Flink; 834 while (le3 != &ref->parent->t->itemlist) { 835 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 836 837 if (!td->inserted && td->treeholder.address == mr->address) 838 td->treeholder.address = mr->new_address; 839 840 le3 = le3->Flink; 841 } 842 } 843 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) { 844 LIST_ENTRY* le3; 845 root* r = NULL; 846 847 // alter ROOT_ITEM 848 849 le3 = Vcb->roots.Flink; 850 while (le3 != &Vcb->roots) { 851 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 852 853 if (r2->id == ref->tbr.offset) { 854 r = r2; 855 break; 856 } 857 858 le3 = le3->Flink; 859 } 860 861 if (r) { 862 r->treeholder.address = mr->new_address; 863 864 if (r == Vcb->root_root) 865 Vcb->superblock.root_tree_addr = mr->new_address; 866 else if (r == Vcb->chunk_root) 867 Vcb->superblock.chunk_tree_addr = mr->new_address; 868 else if (r->root_item.block_number == mr->address) { 869 KEY searchkey; 870 ROOT_ITEM* ri; 871 872 r->root_item.block_number = mr->new_address; 873 874 searchkey.obj_id = r->id; 875 searchkey.obj_type = TYPE_ROOT_ITEM; 876 searchkey.offset = 0xffffffffffffffff; 877 878 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 879 if (!NT_SUCCESS(Status)) { 880 ERR("find_item returned %08lx\n", Status); 881 goto end; 882 } 883 884 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) { 885 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id); 886 Status = STATUS_INTERNAL_ERROR; 887 goto end; 888 } 889 890 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG); 891 if (!ri) { 892 ERR("out of memory\n"); 893 Status = STATUS_INSUFFICIENT_RESOURCES; 894 goto end; 895 } 896 897 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM)); 898 899 Status = delete_tree_item(Vcb, &tp); 900 if (!NT_SUCCESS(Status)) { 901 ERR("delete_tree_item returned %08lx\n", Status); 902 goto end; 903 } 904 905 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL); 906 if (!NT_SUCCESS(Status)) { 907 ERR("insert_tree_item returned %08lx\n", Status); 908 goto end; 909 } 910 } 911 } 912 } 913 914 le2 = le2->Flink; 915 } 916 917 mr->data->address = mr->new_address; 918 919 t3 = mr->t; 920 921 while (t3) { 922 uint8_t h; 923 bool inserted; 924 tree* t4 = NULL; 925 926 // check if tree loaded more than once 927 if (t3->list_entry.Flink != &Vcb->trees_hash) { 928 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 929 930 if (nt->header.address == t3->header.address) 931 t4 = nt; 932 } 933 934 t3->header.address = mr->new_address; 935 936 h = t3->hash >> 24; 937 938 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) { 939 if (t3->list_entry_hash.Flink == &Vcb->trees_hash) 940 Vcb->trees_ptrs[h] = NULL; 941 else { 942 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 943 944 if (t2->hash >> 24 == h) 945 Vcb->trees_ptrs[h] = &t2->list_entry_hash; 946 else 947 Vcb->trees_ptrs[h] = NULL; 948 } 949 } 950 951 RemoveEntryList(&t3->list_entry_hash); 952 953 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t)); 954 h = t3->hash >> 24; 955 956 if (!Vcb->trees_ptrs[h]) { 957 uint8_t h2 = h; 958 959 le2 = Vcb->trees_hash.Flink; 960 961 if (h2 > 0) { 962 h2--; 963 do { 964 if (Vcb->trees_ptrs[h2]) { 965 le2 = Vcb->trees_ptrs[h2]; 966 break; 967 } 968 969 h2--; 970 } while (h2 > 0); 971 } 972 } else 973 le2 = Vcb->trees_ptrs[h]; 974 975 inserted = false; 976 while (le2 != &Vcb->trees_hash) { 977 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash); 978 979 if (t2->hash >= t3->hash) { 980 InsertHeadList(le2->Blink, &t3->list_entry_hash); 981 inserted = true; 982 break; 983 } 984 985 le2 = le2->Flink; 986 } 987 988 if (!inserted) 989 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash); 990 991 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h]) 992 Vcb->trees_ptrs[h] = &t3->list_entry_hash; 993 994 if (data_items && level == 0) { 995 le2 = data_items->Flink; 996 997 while (le2 != data_items) { 998 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 999 LIST_ENTRY* le3 = t3->itemlist.Flink; 1000 1001 while (le3 != &t3->itemlist) { 1002 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 1003 1004 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 1005 EXTENT_DATA* ed = (EXTENT_DATA*)td->data; 1006 1007 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 1008 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1009 1010 if (ed2->address == dr->address) 1011 ed2->address = dr->new_address; 1012 } 1013 } 1014 1015 le3 = le3->Flink; 1016 } 1017 1018 le2 = le2->Flink; 1019 } 1020 } 1021 1022 t3 = t4; 1023 } 1024 1025 calc_tree_checksum(Vcb, mr->data); 1026 1027 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG); 1028 if (!tw) { 1029 ERR("out of memory\n"); 1030 Status = STATUS_INSUFFICIENT_RESOURCES; 1031 goto end; 1032 } 1033 1034 tw->address = mr->new_address; 1035 tw->length = Vcb->superblock.node_size; 1036 tw->data = (uint8_t*)mr->data; 1037 tw->allocated = false; 1038 1039 if (IsListEmpty(&tree_writes)) 1040 InsertTailList(&tree_writes, &tw->list_entry); 1041 else { 1042 bool inserted = false; 1043 1044 le2 = tree_writes.Flink; 1045 while (le2 != &tree_writes) { 1046 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry); 1047 1048 if (tw2->address > tw->address) { 1049 InsertHeadList(le2->Blink, &tw->list_entry); 1050 inserted = true; 1051 break; 1052 } 1053 1054 le2 = le2->Flink; 1055 } 1056 1057 if (!inserted) 1058 InsertTailList(&tree_writes, &tw->list_entry); 1059 } 1060 } 1061 1062 le = le->Flink; 1063 } 1064 } 1065 1066 Status = do_tree_writes(Vcb, &tree_writes, true); 1067 if (!NT_SUCCESS(Status)) { 1068 ERR("do_tree_writes returned %08lx\n", Status); 1069 goto end; 1070 } 1071 1072 le = items->Flink; 1073 while (le != items) { 1074 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 1075 1076 Status = add_metadata_reloc_extent_item(Vcb, mr); 1077 if (!NT_SUCCESS(Status)) { 1078 ERR("add_metadata_reloc_extent_item returned %08lx\n", Status); 1079 goto end; 1080 } 1081 1082 le = le->Flink; 1083 } 1084 1085 Status = STATUS_SUCCESS; 1086 1087 end: 1088 while (!IsListEmpty(&tree_writes)) { 1089 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry); 1090 1091 if (tw->allocated) 1092 ExFreePool(tw->data); 1093 1094 ExFreePool(tw); 1095 } 1096 1097 return Status; 1098 } 1099 1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1101 KEY searchkey; 1102 traverse_ptr tp; 1103 NTSTATUS Status; 1104 bool b; 1105 LIST_ENTRY items, rollback; 1106 uint32_t loaded = 0; 1107 1108 TRACE("chunk %I64x\n", c->offset); 1109 1110 InitializeListHead(&rollback); 1111 InitializeListHead(&items); 1112 1113 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1114 1115 searchkey.obj_id = c->offset; 1116 searchkey.obj_type = TYPE_METADATA_ITEM; 1117 searchkey.offset = 0xffffffffffffffff; 1118 1119 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1120 if (!NT_SUCCESS(Status)) { 1121 ERR("find_item returned %08lx\n", Status); 1122 goto end; 1123 } 1124 1125 do { 1126 traverse_ptr next_tp; 1127 1128 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1129 break; 1130 1131 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { 1132 bool tree = false, skinny = false; 1133 1134 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1135 tree = true; 1136 skinny = true; 1137 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 1138 tp.item->size >= sizeof(EXTENT_ITEM)) { 1139 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1140 1141 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1142 tree = true; 1143 } 1144 1145 if (tree) { 1146 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback); 1147 1148 if (!NT_SUCCESS(Status)) { 1149 ERR("add_metadata_reloc returned %08lx\n", Status); 1150 goto end; 1151 } 1152 1153 loaded++; 1154 1155 if (loaded >= 64) // only do 64 at a time 1156 break; 1157 } 1158 } 1159 1160 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1161 1162 if (b) 1163 tp = next_tp; 1164 } while (b); 1165 1166 if (IsListEmpty(&items)) { 1167 *changed = false; 1168 Status = STATUS_SUCCESS; 1169 goto end; 1170 } else 1171 *changed = true; 1172 1173 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback); 1174 if (!NT_SUCCESS(Status)) { 1175 ERR("write_metadata_items returned %08lx\n", Status); 1176 goto end; 1177 } 1178 1179 Status = STATUS_SUCCESS; 1180 1181 Vcb->need_write = true; 1182 1183 end: 1184 if (NT_SUCCESS(Status)) { 1185 Status = do_write(Vcb, NULL); 1186 if (!NT_SUCCESS(Status)) 1187 ERR("do_write returned %08lx\n", Status); 1188 } 1189 1190 if (NT_SUCCESS(Status)) 1191 clear_rollback(&rollback); 1192 else 1193 do_rollback(Vcb, &rollback); 1194 1195 free_trees(Vcb); 1196 1197 ExReleaseResourceLite(&Vcb->tree_lock); 1198 1199 while (!IsListEmpty(&items)) { 1200 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry); 1201 1202 while (!IsListEmpty(&mr->refs)) { 1203 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 1204 1205 ExFreePool(ref); 1206 } 1207 1208 if (mr->data) 1209 ExFreePool(mr->data); 1210 1211 ExFreePool(mr); 1212 } 1213 1214 return Status; 1215 } 1216 1217 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items, 1218 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) { 1219 NTSTATUS Status; 1220 LIST_ENTRY* le; 1221 KEY searchkey; 1222 traverse_ptr tp; 1223 root* r = NULL; 1224 metadata_reloc* mr; 1225 uint64_t last_tree = 0; 1226 data_reloc_ref* ref; 1227 1228 le = Vcb->roots.Flink; 1229 while (le != &Vcb->roots) { 1230 root* r2 = CONTAINING_RECORD(le, root, list_entry); 1231 1232 if (r2->id == edr->root) { 1233 r = r2; 1234 break; 1235 } 1236 1237 le = le->Flink; 1238 } 1239 1240 if (!r) { 1241 ERR("could not find subvol %I64x\n", edr->root); 1242 return STATUS_INTERNAL_ERROR; 1243 } 1244 1245 searchkey.obj_id = edr->objid; 1246 searchkey.obj_type = TYPE_EXTENT_DATA; 1247 searchkey.offset = 0; 1248 1249 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL); 1250 if (!NT_SUCCESS(Status)) { 1251 ERR("find_item returned %08lx\n", Status); 1252 return Status; 1253 } 1254 1255 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) { 1256 traverse_ptr tp2; 1257 1258 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1259 tp = tp2; 1260 else { 1261 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id); 1262 return STATUS_INTERNAL_ERROR; 1263 } 1264 } 1265 1266 ref = NULL; 1267 1268 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) { 1269 traverse_ptr tp2; 1270 1271 if (tp.item->size >= sizeof(EXTENT_DATA)) { 1272 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data; 1273 1274 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) { 1275 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1276 1277 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) { 1278 if (ref && last_tree == tp.tree->header.address) 1279 ref->edr.count++; 1280 else { 1281 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1282 if (!ref) { 1283 ERR("out of memory\n"); 1284 return STATUS_INSUFFICIENT_RESOURCES; 1285 } 1286 1287 ref->type = TYPE_EXTENT_DATA_REF; 1288 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF)); 1289 ref->edr.count = 1; 1290 1291 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback); 1292 if (!NT_SUCCESS(Status)) { 1293 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1294 ExFreePool(ref); 1295 return Status; 1296 } 1297 1298 last_tree = tp.tree->header.address; 1299 ref->parent = mr; 1300 1301 InsertTailList(&dr->refs, &ref->list_entry); 1302 } 1303 } 1304 } 1305 } 1306 1307 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1308 tp = tp2; 1309 else 1310 break; 1311 } 1312 1313 return STATUS_SUCCESS; 1314 } 1315 1316 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items, 1317 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) { 1318 NTSTATUS Status; 1319 data_reloc* dr; 1320 EXTENT_ITEM* ei; 1321 uint16_t len; 1322 uint64_t inline_rc; 1323 uint8_t* ptr; 1324 1325 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG); 1326 if (!dr) { 1327 ERR("out of memory\n"); 1328 return STATUS_INSUFFICIENT_RESOURCES; 1329 } 1330 1331 dr->address = tp->item->key.obj_id; 1332 dr->size = tp->item->key.offset; 1333 dr->ei = (EXTENT_ITEM*)tp->item->data; 1334 InitializeListHead(&dr->refs); 1335 1336 Status = delete_tree_item(Vcb, tp); 1337 if (!NT_SUCCESS(Status)) { 1338 ERR("delete_tree_item returned %08lx\n", Status); 1339 return Status; 1340 } 1341 1342 if (!c) 1343 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 1344 1345 if (c) { 1346 acquire_chunk_lock(c, Vcb); 1347 1348 c->used -= tp->item->key.offset; 1349 1350 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback); 1351 1352 release_chunk_lock(c, Vcb); 1353 } 1354 1355 ei = (EXTENT_ITEM*)tp->item->data; 1356 inline_rc = 0; 1357 1358 len = tp->item->size - sizeof(EXTENT_ITEM); 1359 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 1360 1361 while (len > 0) { 1362 uint8_t secttype = *ptr; 1363 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0); 1364 1365 len--; 1366 1367 if (sectlen > len) { 1368 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 1369 return STATUS_INTERNAL_ERROR; 1370 } 1371 1372 if (sectlen == 0) { 1373 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 1374 return STATUS_INTERNAL_ERROR; 1375 } 1376 1377 if (secttype == TYPE_EXTENT_DATA_REF) { 1378 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t)); 1379 1380 inline_rc += edr->count; 1381 1382 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback); 1383 if (!NT_SUCCESS(Status)) { 1384 ERR("data_reloc_add_tree_edr returned %08lx\n", Status); 1385 return Status; 1386 } 1387 } else if (secttype == TYPE_SHARED_DATA_REF) { 1388 metadata_reloc* mr; 1389 data_reloc_ref* ref; 1390 1391 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1392 if (!ref) { 1393 ERR("out of memory\n"); 1394 return STATUS_INSUFFICIENT_RESOURCES; 1395 } 1396 1397 ref->type = TYPE_SHARED_DATA_REF; 1398 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF)); 1399 inline_rc += ref->sdr.count; 1400 1401 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1402 if (!NT_SUCCESS(Status)) { 1403 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1404 ExFreePool(ref); 1405 return Status; 1406 } 1407 1408 ref->parent = mr; 1409 1410 InsertTailList(&dr->refs, &ref->list_entry); 1411 } else { 1412 ERR("unexpected tree type %x\n", secttype); 1413 return STATUS_INTERNAL_ERROR; 1414 } 1415 1416 1417 len -= sectlen; 1418 ptr += sizeof(uint8_t) + sectlen; 1419 } 1420 1421 if (inline_rc < ei->refcount) { // look for non-inline entries 1422 traverse_ptr tp2 = *tp, next_tp; 1423 1424 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 1425 tp2 = next_tp; 1426 1427 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 1428 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) { 1429 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback); 1430 if (!NT_SUCCESS(Status)) { 1431 ERR("data_reloc_add_tree_edr returned %08lx\n", Status); 1432 return Status; 1433 } 1434 1435 Status = delete_tree_item(Vcb, &tp2); 1436 if (!NT_SUCCESS(Status)) { 1437 ERR("delete_tree_item returned %08lx\n", Status); 1438 return Status; 1439 } 1440 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) { 1441 metadata_reloc* mr; 1442 data_reloc_ref* ref; 1443 1444 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1445 if (!ref) { 1446 ERR("out of memory\n"); 1447 return STATUS_INSUFFICIENT_RESOURCES; 1448 } 1449 1450 ref->type = TYPE_SHARED_DATA_REF; 1451 ref->sdr.offset = tp2.item->key.offset; 1452 ref->sdr.count = *((uint32_t*)tp2.item->data); 1453 1454 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1455 if (!NT_SUCCESS(Status)) { 1456 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1457 ExFreePool(ref); 1458 return Status; 1459 } 1460 1461 ref->parent = mr; 1462 InsertTailList(&dr->refs, &ref->list_entry); 1463 1464 Status = delete_tree_item(Vcb, &tp2); 1465 if (!NT_SUCCESS(Status)) { 1466 ERR("delete_tree_item returned %08lx\n", Status); 1467 return Status; 1468 } 1469 } 1470 } else 1471 break; 1472 } 1473 } 1474 1475 InsertTailList(items, &dr->list_entry); 1476 1477 return STATUS_SUCCESS; 1478 } 1479 1480 static void sort_data_reloc_refs(data_reloc* dr) { 1481 LIST_ENTRY newlist, *le; 1482 1483 if (IsListEmpty(&dr->refs)) 1484 return; 1485 1486 // insertion sort 1487 1488 InitializeListHead(&newlist); 1489 1490 while (!IsListEmpty(&dr->refs)) { 1491 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 1492 bool inserted = false; 1493 1494 if (ref->type == TYPE_EXTENT_DATA_REF) 1495 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset); 1496 else if (ref->type == TYPE_SHARED_DATA_REF) 1497 ref->hash = ref->parent->new_address; 1498 1499 le = newlist.Flink; 1500 while (le != &newlist) { 1501 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1502 1503 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 1504 InsertHeadList(le->Blink, &ref->list_entry); 1505 inserted = true; 1506 break; 1507 } 1508 1509 le = le->Flink; 1510 } 1511 1512 if (!inserted) 1513 InsertTailList(&newlist, &ref->list_entry); 1514 } 1515 1516 le = newlist.Flink; 1517 while (le != &newlist) { 1518 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1519 1520 if (le->Flink != &newlist) { 1521 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry); 1522 1523 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root && 1524 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) { 1525 RemoveEntryList(&ref2->list_entry); 1526 ref->edr.count += ref2->edr.count; 1527 ExFreePool(ref2); 1528 continue; 1529 } 1530 } 1531 1532 le = le->Flink; 1533 } 1534 1535 newlist.Flink->Blink = &dr->refs; 1536 newlist.Blink->Flink = &dr->refs; 1537 dr->refs.Flink = newlist.Flink; 1538 dr->refs.Blink = newlist.Blink; 1539 } 1540 1541 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) { 1542 NTSTATUS Status; 1543 LIST_ENTRY* le; 1544 uint64_t rc = 0; 1545 uint16_t inline_len; 1546 bool all_inline = true; 1547 data_reloc_ref* first_noninline = NULL; 1548 EXTENT_ITEM* ei; 1549 uint8_t* ptr; 1550 1551 inline_len = sizeof(EXTENT_ITEM); 1552 1553 sort_data_reloc_refs(dr); 1554 1555 le = dr->refs.Flink; 1556 while (le != &dr->refs) { 1557 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1558 uint16_t extlen = 0; 1559 1560 if (ref->type == TYPE_EXTENT_DATA_REF) { 1561 extlen += sizeof(EXTENT_DATA_REF); 1562 rc += ref->edr.count; 1563 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1564 extlen += sizeof(SHARED_DATA_REF); 1565 rc++; 1566 } 1567 1568 if (all_inline) { 1569 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 1570 all_inline = false; 1571 first_noninline = ref; 1572 } else 1573 inline_len += extlen + 1; 1574 } 1575 1576 le = le->Flink; 1577 } 1578 1579 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 1580 if (!ei) { 1581 ERR("out of memory\n"); 1582 return STATUS_INSUFFICIENT_RESOURCES; 1583 } 1584 1585 ei->refcount = rc; 1586 ei->generation = dr->ei->generation; 1587 ei->flags = dr->ei->flags; 1588 ptr = (uint8_t*)&ei[1]; 1589 1590 le = dr->refs.Flink; 1591 while (le != &dr->refs) { 1592 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1593 1594 if (ref == first_noninline) 1595 break; 1596 1597 *ptr = ref->type; 1598 ptr++; 1599 1600 if (ref->type == TYPE_EXTENT_DATA_REF) { 1601 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr; 1602 1603 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1604 1605 ptr += sizeof(EXTENT_DATA_REF); 1606 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1607 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr; 1608 1609 sdr->offset = ref->parent->new_address; 1610 sdr->count = ref->sdr.count; 1611 1612 ptr += sizeof(SHARED_DATA_REF); 1613 } 1614 1615 le = le->Flink; 1616 } 1617 1618 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL); 1619 if (!NT_SUCCESS(Status)) { 1620 ERR("insert_tree_item returned %08lx\n", Status); 1621 return Status; 1622 } 1623 1624 if (!all_inline) { 1625 le = &first_noninline->list_entry; 1626 1627 while (le != &dr->refs) { 1628 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1629 1630 if (ref->type == TYPE_EXTENT_DATA_REF) { 1631 EXTENT_DATA_REF* edr; 1632 1633 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG); 1634 if (!edr) { 1635 ERR("out of memory\n"); 1636 return STATUS_INSUFFICIENT_RESOURCES; 1637 } 1638 1639 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1640 1641 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL); 1642 if (!NT_SUCCESS(Status)) { 1643 ERR("insert_tree_item returned %08lx\n", Status); 1644 return Status; 1645 } 1646 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1647 uint32_t* sdr; 1648 1649 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG); 1650 if (!sdr) { 1651 ERR("out of memory\n"); 1652 return STATUS_INSUFFICIENT_RESOURCES; 1653 } 1654 1655 *sdr = ref->sdr.count; 1656 1657 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL); 1658 if (!NT_SUCCESS(Status)) { 1659 ERR("insert_tree_item returned %08lx\n", Status); 1660 return Status; 1661 } 1662 } 1663 1664 le = le->Flink; 1665 } 1666 } 1667 1668 return STATUS_SUCCESS; 1669 } 1670 1671 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1672 KEY searchkey; 1673 traverse_ptr tp; 1674 NTSTATUS Status; 1675 bool b; 1676 LIST_ENTRY items, metadata_items, rollback, *le; 1677 uint64_t loaded = 0, num_loaded = 0; 1678 chunk* newchunk = NULL; 1679 uint8_t* data = NULL; 1680 1681 TRACE("chunk %I64x\n", c->offset); 1682 1683 InitializeListHead(&rollback); 1684 InitializeListHead(&items); 1685 InitializeListHead(&metadata_items); 1686 1687 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1688 1689 searchkey.obj_id = c->offset; 1690 searchkey.obj_type = TYPE_EXTENT_ITEM; 1691 searchkey.offset = 0xffffffffffffffff; 1692 1693 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1694 if (!NT_SUCCESS(Status)) { 1695 ERR("find_item returned %08lx\n", Status); 1696 goto end; 1697 } 1698 1699 do { 1700 traverse_ptr next_tp; 1701 1702 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1703 break; 1704 1705 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) { 1706 bool tree = false; 1707 1708 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1709 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1710 1711 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1712 tree = true; 1713 } 1714 1715 if (!tree) { 1716 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback); 1717 1718 if (!NT_SUCCESS(Status)) { 1719 ERR("add_data_reloc returned %08lx\n", Status); 1720 goto end; 1721 } 1722 1723 loaded += tp.item->key.offset; 1724 num_loaded++; 1725 1726 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously 1727 break; 1728 } 1729 } 1730 1731 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1732 1733 if (b) 1734 tp = next_tp; 1735 } while (b); 1736 1737 if (IsListEmpty(&items)) { 1738 *changed = false; 1739 Status = STATUS_SUCCESS; 1740 goto end; 1741 } else 1742 *changed = true; 1743 1744 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG); 1745 if (!data) { 1746 ERR("out of memory\n"); 1747 Status = STATUS_INSUFFICIENT_RESOURCES; 1748 goto end; 1749 } 1750 1751 le = items.Flink; 1752 while (le != &items) { 1753 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 1754 bool done = false; 1755 LIST_ENTRY* le2; 1756 void* csum; 1757 RTL_BITMAP bmp; 1758 ULONG* bmparr; 1759 ULONG bmplen, runlength, index, lastoff; 1760 1761 if (newchunk) { 1762 acquire_chunk_lock(newchunk, Vcb); 1763 1764 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1765 newchunk->used += dr->size; 1766 space_list_subtract(newchunk, dr->new_address, dr->size, &rollback); 1767 done = true; 1768 } 1769 1770 release_chunk_lock(newchunk, Vcb); 1771 } 1772 1773 if (!done) { 1774 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 1775 1776 le2 = Vcb->chunks.Flink; 1777 while (le2 != &Vcb->chunks) { 1778 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 1779 1780 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) { 1781 acquire_chunk_lock(c2, Vcb); 1782 1783 if ((c2->chunk_item->size - c2->used) >= dr->size) { 1784 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) { 1785 c2->used += dr->size; 1786 space_list_subtract(c2, dr->new_address, dr->size, &rollback); 1787 release_chunk_lock(c2, Vcb); 1788 newchunk = c2; 1789 done = true; 1790 break; 1791 } 1792 } 1793 1794 release_chunk_lock(c2, Vcb); 1795 } 1796 1797 le2 = le2->Flink; 1798 } 1799 1800 // allocate new chunk if necessary 1801 if (!done) { 1802 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false); 1803 1804 if (!NT_SUCCESS(Status)) { 1805 ERR("alloc_chunk returned %08lx\n", Status); 1806 ExReleaseResourceLite(&Vcb->chunk_lock); 1807 goto end; 1808 } 1809 1810 acquire_chunk_lock(newchunk, Vcb); 1811 1812 newchunk->balance_num = Vcb->balance.balance_num; 1813 1814 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1815 release_chunk_lock(newchunk, Vcb); 1816 ExReleaseResourceLite(&Vcb->chunk_lock); 1817 ERR("could not find address in new chunk\n"); 1818 Status = STATUS_DISK_FULL; 1819 goto end; 1820 } else { 1821 newchunk->used += dr->size; 1822 space_list_subtract(newchunk, dr->new_address, dr->size, &rollback); 1823 } 1824 1825 release_chunk_lock(newchunk, Vcb); 1826 } 1827 1828 ExReleaseResourceLite(&Vcb->chunk_lock); 1829 } 1830 1831 dr->newchunk = newchunk; 1832 1833 bmplen = (ULONG)(dr->size >> Vcb->sector_shift); 1834 1835 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG); 1836 if (!bmparr) { 1837 ERR("out of memory\n"); 1838 Status = STATUS_INSUFFICIENT_RESOURCES; 1839 goto end; 1840 } 1841 1842 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((dr->size * Vcb->csum_size) >> Vcb->sector_shift), ALLOC_TAG); 1843 if (!csum) { 1844 ERR("out of memory\n"); 1845 ExFreePool(bmparr); 1846 Status = STATUS_INSUFFICIENT_RESOURCES; 1847 goto end; 1848 } 1849 1850 RtlInitializeBitMap(&bmp, bmparr, bmplen); 1851 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum 1852 1853 searchkey.obj_id = EXTENT_CSUM_ID; 1854 searchkey.obj_type = TYPE_EXTENT_CSUM; 1855 searchkey.offset = dr->address; 1856 1857 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL); 1858 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 1859 ERR("find_item returned %08lx\n", Status); 1860 ExFreePool(csum); 1861 ExFreePool(bmparr); 1862 goto end; 1863 } 1864 1865 if (Status != STATUS_NOT_FOUND) { 1866 do { 1867 traverse_ptr next_tp; 1868 1869 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) { 1870 if (tp.item->key.offset >= dr->address + dr->size) 1871 break; 1872 else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size) >= dr->address) { 1873 uint64_t cs = max(dr->address, tp.item->key.offset); 1874 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size)); 1875 1876 RtlCopyMemory((uint8_t*)csum + (((cs - dr->address) * Vcb->csum_size) >> Vcb->sector_shift), 1877 tp.item->data + (((cs - tp.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift), 1878 (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift)); 1879 1880 RtlClearBits(&bmp, (ULONG)((cs - dr->address) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift)); 1881 1882 if (ce == dr->address + dr->size) 1883 break; 1884 } 1885 } 1886 1887 if (find_next_item(Vcb, &tp, &next_tp, false, NULL)) 1888 tp = next_tp; 1889 else 1890 break; 1891 } while (true); 1892 } 1893 1894 lastoff = 0; 1895 runlength = RtlFindFirstRunClear(&bmp, &index); 1896 1897 while (runlength != 0) { 1898 if (index >= bmplen) 1899 break; 1900 1901 if (index + runlength >= bmplen) { 1902 runlength = bmplen - index; 1903 1904 if (runlength == 0) 1905 break; 1906 } 1907 1908 if (index > lastoff) { 1909 ULONG off = lastoff; 1910 ULONG size = index - lastoff; 1911 1912 // handle no csum run 1913 do { 1914 ULONG rl; 1915 1916 if (size << Vcb->sector_shift > BALANCE_UNIT) 1917 rl = BALANCE_UNIT >> Vcb->sector_shift; 1918 else 1919 rl = size; 1920 1921 Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data, 1922 c, NULL, NULL, 0, false, NormalPagePriority); 1923 if (!NT_SUCCESS(Status)) { 1924 ERR("read_data returned %08lx\n", Status); 1925 ExFreePool(csum); 1926 ExFreePool(bmparr); 1927 goto end; 1928 } 1929 1930 Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift, 1931 NULL, newchunk, false, 0, NormalPagePriority); 1932 if (!NT_SUCCESS(Status)) { 1933 ERR("write_data_complete returned %08lx\n", Status); 1934 ExFreePool(csum); 1935 ExFreePool(bmparr); 1936 goto end; 1937 } 1938 1939 size -= rl; 1940 off += rl; 1941 } while (size > 0); 1942 } 1943 1944 add_checksum_entry(Vcb, dr->new_address + (index << Vcb->sector_shift), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL); 1945 add_checksum_entry(Vcb, dr->address + (index << Vcb->sector_shift), runlength, NULL, NULL); 1946 1947 // handle csum run 1948 do { 1949 ULONG rl; 1950 1951 if (runlength << Vcb->sector_shift > BALANCE_UNIT) 1952 rl = BALANCE_UNIT >> Vcb->sector_shift; 1953 else 1954 rl = runlength; 1955 1956 Status = read_data(Vcb, dr->address + (index << Vcb->sector_shift), rl << Vcb->sector_shift, 1957 (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority); 1958 if (!NT_SUCCESS(Status)) { 1959 ERR("read_data returned %08lx\n", Status); 1960 ExFreePool(csum); 1961 ExFreePool(bmparr); 1962 goto end; 1963 } 1964 1965 Status = write_data_complete(Vcb, dr->new_address + (index << Vcb->sector_shift), data, rl << Vcb->sector_shift, 1966 NULL, newchunk, false, 0, NormalPagePriority); 1967 if (!NT_SUCCESS(Status)) { 1968 ERR("write_data_complete returned %08lx\n", Status); 1969 ExFreePool(csum); 1970 ExFreePool(bmparr); 1971 goto end; 1972 } 1973 1974 runlength -= rl; 1975 index += rl; 1976 } while (runlength > 0); 1977 1978 lastoff = index; 1979 runlength = RtlFindNextForwardRunClear(&bmp, index, &index); 1980 } 1981 1982 ExFreePool(csum); 1983 ExFreePool(bmparr); 1984 1985 // handle final nocsum run 1986 if (lastoff < dr->size >> Vcb->sector_shift) { 1987 ULONG off = lastoff; 1988 ULONG size = (ULONG)((dr->size >> Vcb->sector_shift) - lastoff); 1989 1990 do { 1991 ULONG rl; 1992 1993 if (size << Vcb->sector_shift > BALANCE_UNIT) 1994 rl = BALANCE_UNIT >> Vcb->sector_shift; 1995 else 1996 rl = size; 1997 1998 Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data, 1999 c, NULL, NULL, 0, false, NormalPagePriority); 2000 if (!NT_SUCCESS(Status)) { 2001 ERR("read_data returned %08lx\n", Status); 2002 goto end; 2003 } 2004 2005 Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift, 2006 NULL, newchunk, false, 0, NormalPagePriority); 2007 if (!NT_SUCCESS(Status)) { 2008 ERR("write_data_complete returned %08lx\n", Status); 2009 goto end; 2010 } 2011 2012 size -= rl; 2013 off += rl; 2014 } while (size > 0); 2015 } 2016 2017 le = le->Flink; 2018 } 2019 2020 ExFreePool(data); 2021 data = NULL; 2022 2023 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback); 2024 if (!NT_SUCCESS(Status)) { 2025 ERR("write_metadata_items returned %08lx\n", Status); 2026 goto end; 2027 } 2028 2029 le = items.Flink; 2030 while (le != &items) { 2031 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 2032 2033 Status = add_data_reloc_extent_item(Vcb, dr); 2034 if (!NT_SUCCESS(Status)) { 2035 ERR("add_data_reloc_extent_item returned %08lx\n", Status); 2036 goto end; 2037 } 2038 2039 le = le->Flink; 2040 } 2041 2042 le = c->changed_extents.Flink; 2043 while (le != &c->changed_extents) { 2044 LIST_ENTRY *le2, *le3; 2045 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 2046 2047 le3 = le->Flink; 2048 2049 le2 = items.Flink; 2050 while (le2 != &items) { 2051 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 2052 2053 if (ce->address == dr->address) { 2054 ce->address = dr->new_address; 2055 RemoveEntryList(&ce->list_entry); 2056 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry); 2057 break; 2058 } 2059 2060 le2 = le2->Flink; 2061 } 2062 2063 le = le3; 2064 } 2065 2066 Status = STATUS_SUCCESS; 2067 2068 Vcb->need_write = true; 2069 2070 end: 2071 if (NT_SUCCESS(Status)) { 2072 // update extents in cache inodes before we flush 2073 le = Vcb->chunks.Flink; 2074 while (le != &Vcb->chunks) { 2075 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry); 2076 2077 if (c2->cache) { 2078 LIST_ENTRY* le2; 2079 2080 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true); 2081 2082 le2 = c2->cache->extents.Flink; 2083 while (le2 != &c2->cache->extents) { 2084 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2085 2086 if (!ext->ignore) { 2087 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2088 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2089 2090 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2091 LIST_ENTRY* le3 = items.Flink; 2092 while (le3 != &items) { 2093 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2094 2095 if (ed2->address == dr->address) { 2096 ed2->address = dr->new_address; 2097 break; 2098 } 2099 2100 le3 = le3->Flink; 2101 } 2102 } 2103 } 2104 } 2105 2106 le2 = le2->Flink; 2107 } 2108 2109 ExReleaseResourceLite(c2->cache->Header.Resource); 2110 } 2111 2112 le = le->Flink; 2113 } 2114 2115 Status = do_write(Vcb, NULL); 2116 if (!NT_SUCCESS(Status)) 2117 ERR("do_write returned %08lx\n", Status); 2118 } 2119 2120 if (NT_SUCCESS(Status)) { 2121 clear_rollback(&rollback); 2122 2123 // update open FCBs 2124 // FIXME - speed this up(?) 2125 2126 le = Vcb->all_fcbs.Flink; 2127 while (le != &Vcb->all_fcbs) { 2128 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all); 2129 LIST_ENTRY* le2; 2130 2131 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true); 2132 2133 le2 = fcb->extents.Flink; 2134 while (le2 != &fcb->extents) { 2135 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2136 2137 if (!ext->ignore) { 2138 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2139 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2140 2141 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2142 LIST_ENTRY* le3 = items.Flink; 2143 while (le3 != &items) { 2144 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2145 2146 if (ed2->address == dr->address) { 2147 ed2->address = dr->new_address; 2148 break; 2149 } 2150 2151 le3 = le3->Flink; 2152 } 2153 } 2154 } 2155 } 2156 2157 le2 = le2->Flink; 2158 } 2159 2160 ExReleaseResourceLite(fcb->Header.Resource); 2161 2162 le = le->Flink; 2163 } 2164 } else 2165 do_rollback(Vcb, &rollback); 2166 2167 free_trees(Vcb); 2168 2169 ExReleaseResourceLite(&Vcb->tree_lock); 2170 2171 if (data) 2172 ExFreePool(data); 2173 2174 while (!IsListEmpty(&items)) { 2175 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry); 2176 2177 while (!IsListEmpty(&dr->refs)) { 2178 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 2179 2180 ExFreePool(ref); 2181 } 2182 2183 ExFreePool(dr); 2184 } 2185 2186 while (!IsListEmpty(&metadata_items)) { 2187 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry); 2188 2189 while (!IsListEmpty(&mr->refs)) { 2190 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 2191 2192 ExFreePool(ref); 2193 } 2194 2195 if (mr->data) 2196 ExFreePool(mr->data); 2197 2198 ExFreePool(mr); 2199 } 2200 2201 return Status; 2202 } 2203 2204 static __inline uint64_t get_chunk_dup_type(chunk* c) { 2205 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2206 return BLOCK_FLAG_RAID0; 2207 else if (c->chunk_item->type & BLOCK_FLAG_RAID1) 2208 return BLOCK_FLAG_RAID1; 2209 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE) 2210 return BLOCK_FLAG_DUPLICATE; 2211 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2212 return BLOCK_FLAG_RAID10; 2213 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2214 return BLOCK_FLAG_RAID5; 2215 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2216 return BLOCK_FLAG_RAID6; 2217 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3) 2218 return BLOCK_FLAG_RAID1C3; 2219 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4) 2220 return BLOCK_FLAG_RAID1C4; 2221 else 2222 return BLOCK_FLAG_SINGLE; 2223 } 2224 2225 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) { 2226 btrfs_balance_opts* opts; 2227 2228 opts = &Vcb->balance.opts[sort]; 2229 2230 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED)) 2231 return false; 2232 2233 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2234 uint64_t type = get_chunk_dup_type(c); 2235 2236 if (!(type & opts->profiles)) 2237 return false; 2238 } 2239 2240 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2241 uint16_t i; 2242 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2243 bool b = false; 2244 2245 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2246 if (cis[i].dev_id == opts->devid) { 2247 b = true; 2248 break; 2249 } 2250 } 2251 2252 if (!b) 2253 return false; 2254 } 2255 2256 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2257 uint16_t i, factor; 2258 uint64_t physsize; 2259 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2260 bool b = false; 2261 2262 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2263 factor = c->chunk_item->num_stripes; 2264 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2265 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 2266 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2267 factor = c->chunk_item->num_stripes - 1; 2268 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2269 factor = c->chunk_item->num_stripes - 2; 2270 else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4 2271 factor = 1; 2272 2273 physsize = c->chunk_item->size / factor; 2274 2275 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2276 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start && 2277 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) { 2278 b = true; 2279 break; 2280 } 2281 } 2282 2283 if (!b) 2284 return false; 2285 } 2286 2287 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2288 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end) 2289 return false; 2290 } 2291 2292 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2293 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end) 2294 return false; 2295 } 2296 2297 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2298 uint64_t usage = c->used * 100 / c->chunk_item->size; 2299 2300 // usage == 0 should mean completely empty, not just that usage rounds to 0% 2301 if (c->used > 0 && usage == 0) 2302 usage = 1; 2303 2304 if (usage < opts->usage_start || usage > opts->usage_end) 2305 return false; 2306 } 2307 2308 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) { 2309 uint64_t type = get_chunk_dup_type(c); 2310 2311 if (type == opts->convert) 2312 return false; 2313 } 2314 2315 return true; 2316 } 2317 2318 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2319 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2320 args->profiles = opts->profiles; 2321 args->flags |= BALANCE_ARGS_FLAGS_PROFILES; 2322 } 2323 2324 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2325 if (args->usage_start == 0) { 2326 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE; 2327 args->usage_start = opts->usage_start; 2328 args->usage_end = opts->usage_end; 2329 } else { 2330 args->flags |= BALANCE_ARGS_FLAGS_USAGE; 2331 args->usage = opts->usage_end; 2332 } 2333 } 2334 2335 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2336 args->devid = opts->devid; 2337 args->flags |= BALANCE_ARGS_FLAGS_DEVID; 2338 } 2339 2340 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2341 args->drange_start = opts->drange_start; 2342 args->drange_end = opts->drange_end; 2343 args->flags |= BALANCE_ARGS_FLAGS_DRANGE; 2344 } 2345 2346 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2347 args->vrange_start = opts->vrange_start; 2348 args->vrange_end = opts->vrange_end; 2349 args->flags |= BALANCE_ARGS_FLAGS_VRANGE; 2350 } 2351 2352 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) { 2353 args->convert = opts->convert; 2354 args->flags |= BALANCE_ARGS_FLAGS_CONVERT; 2355 2356 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT) 2357 args->flags |= BALANCE_ARGS_FLAGS_SOFT; 2358 } 2359 2360 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) { 2361 if (args->limit_start == 0) { 2362 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE; 2363 args->limit_start = (uint32_t)opts->limit_start; 2364 args->limit_end = (uint32_t)opts->limit_end; 2365 } else { 2366 args->flags |= BALANCE_ARGS_FLAGS_LIMIT; 2367 args->limit = opts->limit_end; 2368 } 2369 } 2370 2371 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2372 args->stripes_start = opts->stripes_start; 2373 args->stripes_end = opts->stripes_end; 2374 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE; 2375 } 2376 } 2377 2378 static NTSTATUS add_balance_item(device_extension* Vcb) { 2379 KEY searchkey; 2380 traverse_ptr tp; 2381 NTSTATUS Status; 2382 BALANCE_ITEM* bi; 2383 2384 searchkey.obj_id = BALANCE_ITEM_ID; 2385 searchkey.obj_type = TYPE_TEMP_ITEM; 2386 searchkey.offset = 0; 2387 2388 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2389 2390 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2391 if (!NT_SUCCESS(Status)) { 2392 ERR("find_item returned %08lx\n", Status); 2393 goto end; 2394 } 2395 2396 if (!keycmp(tp.item->key, searchkey)) { 2397 Status = delete_tree_item(Vcb, &tp); 2398 if (!NT_SUCCESS(Status)) { 2399 ERR("delete_tree_item returned %08lx\n", Status); 2400 goto end; 2401 } 2402 } 2403 2404 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG); 2405 if (!bi) { 2406 ERR("out of memory\n"); 2407 Status = STATUS_INSUFFICIENT_RESOURCES; 2408 goto end; 2409 } 2410 2411 RtlZeroMemory(bi, sizeof(BALANCE_ITEM)); 2412 2413 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2414 bi->flags |= BALANCE_FLAGS_DATA; 2415 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 2416 } 2417 2418 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2419 bi->flags |= BALANCE_FLAGS_METADATA; 2420 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 2421 } 2422 2423 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2424 bi->flags |= BALANCE_FLAGS_SYSTEM; 2425 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 2426 } 2427 2428 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL); 2429 if (!NT_SUCCESS(Status)) { 2430 ERR("insert_tree_item returned %08lx\n", Status); 2431 ExFreePool(bi); 2432 goto end; 2433 } 2434 2435 Status = STATUS_SUCCESS; 2436 2437 end: 2438 if (NT_SUCCESS(Status)) { 2439 Status = do_write(Vcb, NULL); 2440 if (!NT_SUCCESS(Status)) 2441 ERR("do_write returned %08lx\n", Status); 2442 } 2443 2444 free_trees(Vcb); 2445 2446 ExReleaseResourceLite(&Vcb->tree_lock); 2447 2448 return Status; 2449 } 2450 2451 static NTSTATUS remove_balance_item(device_extension* Vcb) { 2452 KEY searchkey; 2453 traverse_ptr tp; 2454 NTSTATUS Status; 2455 2456 searchkey.obj_id = BALANCE_ITEM_ID; 2457 searchkey.obj_type = TYPE_TEMP_ITEM; 2458 searchkey.offset = 0; 2459 2460 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2461 2462 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2463 if (!NT_SUCCESS(Status)) { 2464 ERR("find_item returned %08lx\n", Status); 2465 goto end; 2466 } 2467 2468 if (!keycmp(tp.item->key, searchkey)) { 2469 Status = delete_tree_item(Vcb, &tp); 2470 if (!NT_SUCCESS(Status)) { 2471 ERR("delete_tree_item returned %08lx\n", Status); 2472 goto end; 2473 } 2474 2475 Status = do_write(Vcb, NULL); 2476 if (!NT_SUCCESS(Status)) { 2477 ERR("do_write returned %08lx\n", Status); 2478 goto end; 2479 } 2480 2481 free_trees(Vcb); 2482 } 2483 2484 Status = STATUS_SUCCESS; 2485 2486 end: 2487 ExReleaseResourceLite(&Vcb->tree_lock); 2488 2489 return Status; 2490 } 2491 2492 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2493 opts->flags = BTRFS_BALANCE_OPTS_ENABLED; 2494 2495 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) { 2496 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES; 2497 opts->profiles = args->profiles; 2498 } 2499 2500 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) { 2501 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2502 2503 opts->usage_start = 0; 2504 opts->usage_end = (uint8_t)args->usage; 2505 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) { 2506 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2507 2508 opts->usage_start = (uint8_t)args->usage_start; 2509 opts->usage_end = (uint8_t)args->usage_end; 2510 } 2511 2512 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) { 2513 opts->flags |= BTRFS_BALANCE_OPTS_DEVID; 2514 opts->devid = args->devid; 2515 } 2516 2517 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) { 2518 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE; 2519 opts->drange_start = args->drange_start; 2520 opts->drange_end = args->drange_end; 2521 } 2522 2523 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) { 2524 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE; 2525 opts->vrange_start = args->vrange_start; 2526 opts->vrange_end = args->vrange_end; 2527 } 2528 2529 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) { 2530 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2531 2532 opts->limit_start = 0; 2533 opts->limit_end = args->limit; 2534 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) { 2535 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2536 2537 opts->limit_start = args->limit_start; 2538 opts->limit_end = args->limit_end; 2539 } 2540 2541 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) { 2542 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES; 2543 2544 opts->stripes_start = (uint16_t)args->stripes_start; 2545 opts->stripes_end = (uint16_t)args->stripes_end; 2546 } 2547 2548 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) { 2549 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT; 2550 opts->convert = args->convert; 2551 2552 if (args->flags & BALANCE_ARGS_FLAGS_SOFT) 2553 opts->flags |= BTRFS_BALANCE_OPTS_SOFT; 2554 } 2555 } 2556 2557 static NTSTATUS remove_superblocks(device* dev) { 2558 NTSTATUS Status; 2559 superblock* sb; 2560 int i = 0; 2561 2562 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG); 2563 if (!sb) { 2564 ERR("out of memory\n"); 2565 return STATUS_INSUFFICIENT_RESOURCES; 2566 } 2567 2568 RtlZeroMemory(sb, sizeof(superblock)); 2569 2570 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) { 2571 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock)); 2572 2573 if (!NT_SUCCESS(Status)) { 2574 ExFreePool(sb); 2575 return Status; 2576 } 2577 2578 i++; 2579 } 2580 2581 ExFreePool(sb); 2582 2583 return STATUS_SUCCESS; 2584 } 2585 2586 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2587 KEY searchkey; 2588 traverse_ptr tp; 2589 NTSTATUS Status; 2590 LIST_ENTRY* le; 2591 volume_device_extension* vde; 2592 2593 if (Vcb->need_write) { 2594 Status = do_write(Vcb, NULL); 2595 2596 if (!NT_SUCCESS(Status)) 2597 ERR("do_write returned %08lx\n", Status); 2598 } else 2599 Status = STATUS_SUCCESS; 2600 2601 free_trees(Vcb); 2602 2603 if (!NT_SUCCESS(Status)) 2604 return Status; 2605 2606 // remove entry in chunk tree 2607 2608 searchkey.obj_id = 1; 2609 searchkey.obj_type = TYPE_DEV_ITEM; 2610 searchkey.offset = dev->devitem.dev_id; 2611 2612 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL); 2613 if (!NT_SUCCESS(Status)) { 2614 ERR("find_item returned %08lx\n", Status); 2615 return Status; 2616 } 2617 2618 if (!keycmp(searchkey, tp.item->key)) { 2619 Status = delete_tree_item(Vcb, &tp); 2620 2621 if (!NT_SUCCESS(Status)) { 2622 ERR("delete_tree_item returned %08lx\n", Status); 2623 return Status; 2624 } 2625 } 2626 2627 // remove stats entry in device tree 2628 2629 searchkey.obj_id = 0; 2630 searchkey.obj_type = TYPE_DEV_STATS; 2631 searchkey.offset = dev->devitem.dev_id; 2632 2633 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2634 if (!NT_SUCCESS(Status)) { 2635 ERR("find_item returned %08lx\n", Status); 2636 return Status; 2637 } 2638 2639 if (!keycmp(searchkey, tp.item->key)) { 2640 Status = delete_tree_item(Vcb, &tp); 2641 2642 if (!NT_SUCCESS(Status)) { 2643 ERR("delete_tree_item returned %08lx\n", Status); 2644 return Status; 2645 } 2646 } 2647 2648 // update superblock 2649 2650 Vcb->superblock.num_devices--; 2651 Vcb->superblock.total_bytes -= dev->devitem.num_bytes; 2652 Vcb->devices_loaded--; 2653 2654 RemoveEntryList(&dev->list_entry); 2655 2656 // flush 2657 2658 Status = do_write(Vcb, NULL); 2659 if (!NT_SUCCESS(Status)) 2660 ERR("do_write returned %08lx\n", Status); 2661 2662 free_trees(Vcb); 2663 2664 if (!NT_SUCCESS(Status)) 2665 return Status; 2666 2667 if (!dev->readonly && dev->devobj) { 2668 Status = remove_superblocks(dev); 2669 if (!NT_SUCCESS(Status)) 2670 WARN("remove_superblocks returned %08lx\n", Status); 2671 } 2672 2673 // remove entry in volume list 2674 2675 vde = Vcb->vde; 2676 2677 if (dev->devobj) { 2678 pdo_device_extension* pdode = vde->pdode; 2679 2680 ExAcquireResourceExclusiveLite(&pdode->child_lock, true); 2681 2682 le = pdode->children.Flink; 2683 while (le != &pdode->children) { 2684 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2685 2686 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) { 2687 PFILE_OBJECT FileObject; 2688 PDEVICE_OBJECT mountmgr; 2689 UNICODE_STRING mmdevpath; 2690 2691 pdode->children_loaded--; 2692 2693 if (vc->had_drive_letter) { // re-add entry to mountmgr 2694 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME); 2695 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr); 2696 if (!NT_SUCCESS(Status)) 2697 ERR("IoGetDeviceObjectPointer returned %08lx\n", Status); 2698 else { 2699 MOUNTDEV_NAME mdn; 2700 2701 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL); 2702 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW) 2703 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status); 2704 else { 2705 MOUNTDEV_NAME* mdn2; 2706 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength; 2707 2708 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG); 2709 if (!mdn2) 2710 ERR("out of memory\n"); 2711 else { 2712 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL); 2713 if (!NT_SUCCESS(Status)) 2714 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status); 2715 else { 2716 UNICODE_STRING name; 2717 2718 name.Buffer = mdn2->Name; 2719 name.Length = name.MaximumLength = mdn2->NameLength; 2720 2721 Status = mountmgr_add_drive_letter(mountmgr, &name); 2722 if (!NT_SUCCESS(Status)) 2723 WARN("mountmgr_add_drive_letter returned %08lx\n", Status); 2724 } 2725 2726 ExFreePool(mdn2); 2727 } 2728 } 2729 2730 ObDereferenceObject(FileObject); 2731 } 2732 } 2733 2734 ExFreePool(vc->pnp_name.Buffer); 2735 RemoveEntryList(&vc->list_entry); 2736 ExFreePool(vc); 2737 2738 ObDereferenceObject(vc->fileobj); 2739 2740 break; 2741 } 2742 2743 le = le->Flink; 2744 } 2745 2746 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) { 2747 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA; 2748 2749 le = pdode->children.Flink; 2750 while (le != &pdode->children) { 2751 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2752 2753 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) { 2754 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA; 2755 break; 2756 } 2757 2758 le = le->Flink; 2759 } 2760 } 2761 2762 pdode->num_children = Vcb->superblock.num_devices; 2763 2764 ExReleaseResourceLite(&pdode->child_lock); 2765 2766 // free dev 2767 2768 if (dev->trim && !dev->readonly && !Vcb->options.no_trim) 2769 trim_whole_device(dev); 2770 } 2771 2772 while (!IsListEmpty(&dev->space)) { 2773 LIST_ENTRY* le2 = RemoveHeadList(&dev->space); 2774 space* s = CONTAINING_RECORD(le2, space, list_entry); 2775 2776 ExFreePool(s); 2777 } 2778 2779 ExFreePool(dev); 2780 2781 if (Vcb->trim) { 2782 Vcb->trim = false; 2783 2784 le = Vcb->devices.Flink; 2785 while (le != &Vcb->devices) { 2786 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 2787 2788 if (dev2->trim) { 2789 Vcb->trim = true; 2790 break; 2791 } 2792 2793 le = le->Flink; 2794 } 2795 } 2796 2797 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 2798 2799 return STATUS_SUCCESS; 2800 } 2801 2802 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2803 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa; 2804 DEVICE_DATA_SET_RANGE* ranges; 2805 ULONG datalen, i; 2806 KEY searchkey; 2807 traverse_ptr tp; 2808 NTSTATUS Status; 2809 bool b; 2810 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there 2811 LIST_ENTRY* le; 2812 2813 dev->num_trim_entries = 0; 2814 2815 searchkey.obj_id = dev->devitem.dev_id; 2816 searchkey.obj_type = TYPE_DEV_EXTENT; 2817 searchkey.offset = 0; 2818 2819 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2820 if (!NT_SUCCESS(Status)) { 2821 ERR("find_item returned %08lx\n", Status); 2822 return; 2823 } 2824 2825 do { 2826 traverse_ptr next_tp; 2827 2828 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) { 2829 if (tp.item->size >= sizeof(DEV_EXTENT)) { 2830 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data; 2831 2832 if (tp.item->key.offset > lastoff) 2833 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff); 2834 2835 lastoff = tp.item->key.offset + de->length; 2836 } else { 2837 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT)); 2838 return; 2839 } 2840 } 2841 2842 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 2843 2844 if (b) { 2845 tp = next_tp; 2846 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type)) 2847 break; 2848 } 2849 } while (b); 2850 2851 if (lastoff < dev->devitem.num_bytes) 2852 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff); 2853 2854 if (dev->num_trim_entries == 0) 2855 return; 2856 2857 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE)); 2858 2859 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG); 2860 if (!dmdsa) { 2861 ERR("out of memory\n"); 2862 goto end; 2863 } 2864 2865 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES); 2866 dmdsa->Action = DeviceDsmAction_Trim; 2867 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED; 2868 dmdsa->ParameterBlockOffset = 0; 2869 dmdsa->ParameterBlockLength = 0; 2870 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)); 2871 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE); 2872 2873 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset); 2874 2875 i = 0; 2876 le = dev->trim_list.Flink; 2877 while (le != &dev->trim_list) { 2878 space* s = CONTAINING_RECORD(le, space, list_entry); 2879 2880 ranges[i].StartingOffset = s->address; 2881 ranges[i].LengthInBytes = s->size; 2882 i++; 2883 2884 le = le->Flink; 2885 } 2886 2887 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL); 2888 if (!NT_SUCCESS(Status)) 2889 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status); 2890 2891 ExFreePool(dmdsa); 2892 2893 end: 2894 while (!IsListEmpty(&dev->trim_list)) { 2895 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry); 2896 ExFreePool(s); 2897 } 2898 2899 dev->num_trim_entries = 0; 2900 } 2901 2902 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) { 2903 NTSTATUS Status; 2904 bool changed; 2905 LIST_ENTRY* le; 2906 chunk* rc; 2907 2908 // FIXME - allow with metadata chunks? 2909 2910 while (true) { 2911 rc = NULL; 2912 2913 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 2914 2915 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 2916 2917 // choose the least-used chunk we haven't looked at yet 2918 le = Vcb->chunks.Flink; 2919 while (le != &Vcb->chunks) { 2920 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 2921 2922 // FIXME - skip full-size chunks over e.g. 90% full? 2923 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used)) 2924 rc = c; 2925 2926 le = le->Flink; 2927 } 2928 2929 ExReleaseResourceLite(&Vcb->chunk_lock); 2930 2931 if (!rc) { 2932 ExReleaseResourceLite(&Vcb->tree_lock); 2933 break; 2934 } 2935 2936 if (rc->list_entry_balance.Flink) { 2937 RemoveEntryList(&rc->list_entry_balance); 2938 Vcb->balance.chunks_left--; 2939 } 2940 2941 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped 2942 rc->reloc = true; 2943 2944 ExReleaseResourceLite(&Vcb->tree_lock); 2945 2946 do { 2947 changed = false; 2948 2949 Status = balance_data_chunk(Vcb, rc, &changed); 2950 if (!NT_SUCCESS(Status)) { 2951 ERR("balance_data_chunk returned %08lx\n", Status); 2952 Vcb->balance.status = Status; 2953 rc->list_entry_balance.Flink = NULL; 2954 rc->reloc = false; 2955 return Status; 2956 } 2957 2958 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 2959 2960 if (Vcb->readonly) 2961 Vcb->balance.stopping = true; 2962 2963 if (Vcb->balance.stopping) 2964 return STATUS_SUCCESS; 2965 } while (changed); 2966 2967 rc->list_entry_balance.Flink = NULL; 2968 2969 rc->changed = true; 2970 rc->space_changed = true; 2971 rc->balance_num = Vcb->balance.balance_num; 2972 2973 Status = do_write(Vcb, NULL); 2974 if (!NT_SUCCESS(Status)) { 2975 ERR("do_write returned %08lx\n", Status); 2976 return Status; 2977 } 2978 2979 free_trees(Vcb); 2980 } 2981 2982 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 2983 2984 Status = alloc_chunk(Vcb, flags, &rc, true); 2985 2986 ExReleaseResourceLite(&Vcb->chunk_lock); 2987 2988 if (NT_SUCCESS(Status)) { 2989 *newchunk = rc; 2990 return Status; 2991 } else { 2992 ERR("alloc_chunk returned %08lx\n", Status); 2993 return Status; 2994 } 2995 } 2996 2997 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) { 2998 LIST_ENTRY* le; 2999 3000 while (!IsListEmpty(&dev->space)) { 3001 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry); 3002 3003 ExFreePool(s); 3004 } 3005 3006 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device. 3007 3008 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL); 3009 3010 le = Vcb->chunks.Flink; 3011 while (le != &Vcb->chunks) { 3012 uint16_t n; 3013 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3014 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 3015 3016 for (n = 0; n < c->chunk_item->num_stripes; n++) { 3017 uint64_t stripe_size = 0; 3018 3019 if (cis[n].dev_id == dev->devitem.dev_id) { 3020 if (stripe_size == 0) { 3021 uint16_t factor; 3022 3023 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 3024 factor = c->chunk_item->num_stripes; 3025 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 3026 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 3027 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 3028 factor = c->chunk_item->num_stripes - 1; 3029 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 3030 factor = c->chunk_item->num_stripes - 2; 3031 else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4 3032 factor = 1; 3033 3034 stripe_size = c->chunk_item->size / factor; 3035 } 3036 3037 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL); 3038 } 3039 } 3040 3041 le = le->Flink; 3042 } 3043 3044 return STATUS_SUCCESS; 3045 } 3046 3047 _Function_class_(KSTART_ROUTINE) 3048 void __stdcall balance_thread(void* context) { 3049 device_extension* Vcb = (device_extension*)context; 3050 LIST_ENTRY chunks; 3051 LIST_ENTRY* le; 3052 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0; 3053 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0; 3054 NTSTATUS Status; 3055 3056 Vcb->balance.balance_num++; 3057 3058 Vcb->balance.stopping = false; 3059 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false); 3060 3061 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3062 old_data_flags = Vcb->data_flags; 3063 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert); 3064 3065 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3066 } 3067 3068 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3069 old_metadata_flags = Vcb->metadata_flags; 3070 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert); 3071 } 3072 3073 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3074 old_system_flags = Vcb->system_flags; 3075 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert); 3076 } 3077 3078 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) { 3079 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3080 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3081 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3082 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3083 } 3084 3085 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0; 3086 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0; 3087 3088 InitializeListHead(&chunks); 3089 3090 // FIXME - what are we supposed to do with limit_start? 3091 3092 if (!Vcb->readonly) { 3093 if (!Vcb->balance.removing && !Vcb->balance.shrinking) { 3094 Status = add_balance_item(Vcb); 3095 if (!NT_SUCCESS(Status)) { 3096 ERR("add_balance_item returned %08lx\n", Status); 3097 Vcb->balance.status = Status; 3098 goto end; 3099 } 3100 } else { 3101 if (Vcb->need_write) { 3102 Status = do_write(Vcb, NULL); 3103 3104 free_trees(Vcb); 3105 3106 if (!NT_SUCCESS(Status)) { 3107 ERR("do_write returned %08lx\n", Status); 3108 Vcb->balance.status = Status; 3109 goto end; 3110 } 3111 } 3112 } 3113 } 3114 3115 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3116 3117 if (Vcb->balance.stopping) 3118 goto end; 3119 3120 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3121 3122 le = Vcb->chunks.Flink; 3123 while (le != &Vcb->chunks) { 3124 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3125 uint8_t sort; 3126 3127 acquire_chunk_lock(c, Vcb); 3128 3129 if (c->chunk_item->type & BLOCK_FLAG_DATA) 3130 sort = BALANCE_OPTS_DATA; 3131 else if (c->chunk_item->type & BLOCK_FLAG_METADATA) 3132 sort = BALANCE_OPTS_METADATA; 3133 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM) 3134 sort = BALANCE_OPTS_SYSTEM; 3135 else { 3136 ERR("unexpected chunk type %I64x\n", c->chunk_item->type); 3137 release_chunk_lock(c, Vcb); 3138 break; 3139 } 3140 3141 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) && 3142 should_balance_chunk(Vcb, sort, c)) { 3143 InsertTailList(&chunks, &c->list_entry_balance); 3144 3145 num_chunks[sort]++; 3146 Vcb->balance.total_chunks++; 3147 Vcb->balance.chunks_left++; 3148 } else if (sort == BALANCE_OPTS_METADATA) 3149 okay_metadata_chunks++; 3150 else if (sort == BALANCE_OPTS_DATA) 3151 okay_data_chunks++; 3152 else if (sort == BALANCE_OPTS_SYSTEM) 3153 okay_system_chunks++; 3154 3155 if (!c->cache_loaded) { 3156 Status = load_cache_chunk(Vcb, c, NULL); 3157 3158 if (!NT_SUCCESS(Status)) { 3159 ERR("load_cache_chunk returned %08lx\n", Status); 3160 Vcb->balance.status = Status; 3161 release_chunk_lock(c, Vcb); 3162 ExReleaseResourceLite(&Vcb->chunk_lock); 3163 goto end; 3164 } 3165 } 3166 3167 release_chunk_lock(c, Vcb); 3168 3169 le = le->Flink; 3170 } 3171 3172 ExReleaseResourceLite(&Vcb->chunk_lock); 3173 3174 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up 3175 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) { 3176 bool consolidated = false; 3177 chunk* c; 3178 3179 if (okay_metadata_chunks == 0) { 3180 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3181 3182 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true); 3183 if (NT_SUCCESS(Status)) 3184 c->balance_num = Vcb->balance.balance_num; 3185 else if (Status != STATUS_DISK_FULL || consolidated) { 3186 ERR("alloc_chunk returned %08lx\n", Status); 3187 ExReleaseResourceLite(&Vcb->chunk_lock); 3188 Vcb->balance.status = Status; 3189 goto end; 3190 } 3191 3192 ExReleaseResourceLite(&Vcb->chunk_lock); 3193 3194 if (Status == STATUS_DISK_FULL) { 3195 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c); 3196 if (!NT_SUCCESS(Status)) { 3197 ERR("try_consolidation returned %08lx\n", Status); 3198 Vcb->balance.status = Status; 3199 goto end; 3200 } else 3201 c->balance_num = Vcb->balance.balance_num; 3202 3203 consolidated = true; 3204 3205 if (Vcb->balance.stopping) 3206 goto end; 3207 } 3208 } 3209 3210 if (okay_data_chunks == 0) { 3211 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3212 3213 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true); 3214 if (NT_SUCCESS(Status)) 3215 c->balance_num = Vcb->balance.balance_num; 3216 else if (Status != STATUS_DISK_FULL || consolidated) { 3217 ERR("alloc_chunk returned %08lx\n", Status); 3218 ExReleaseResourceLite(&Vcb->chunk_lock); 3219 Vcb->balance.status = Status; 3220 goto end; 3221 } 3222 3223 ExReleaseResourceLite(&Vcb->chunk_lock); 3224 3225 if (Status == STATUS_DISK_FULL) { 3226 Status = try_consolidation(Vcb, Vcb->data_flags, &c); 3227 if (!NT_SUCCESS(Status)) { 3228 ERR("try_consolidation returned %08lx\n", Status); 3229 Vcb->balance.status = Status; 3230 goto end; 3231 } else 3232 c->balance_num = Vcb->balance.balance_num; 3233 3234 consolidated = true; 3235 3236 if (Vcb->balance.stopping) 3237 goto end; 3238 } 3239 } 3240 3241 if (okay_system_chunks == 0) { 3242 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3243 3244 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true); 3245 if (NT_SUCCESS(Status)) 3246 c->balance_num = Vcb->balance.balance_num; 3247 else if (Status != STATUS_DISK_FULL || consolidated) { 3248 ERR("alloc_chunk returned %08lx\n", Status); 3249 ExReleaseResourceLite(&Vcb->chunk_lock); 3250 Vcb->balance.status = Status; 3251 goto end; 3252 } 3253 3254 ExReleaseResourceLite(&Vcb->chunk_lock); 3255 3256 if (Status == STATUS_DISK_FULL) { 3257 Status = try_consolidation(Vcb, Vcb->system_flags, &c); 3258 if (!NT_SUCCESS(Status)) { 3259 ERR("try_consolidation returned %08lx\n", Status); 3260 Vcb->balance.status = Status; 3261 goto end; 3262 } else 3263 c->balance_num = Vcb->balance.balance_num; 3264 3265 consolidated = true; 3266 3267 if (Vcb->balance.stopping) 3268 goto end; 3269 } 3270 } 3271 } 3272 3273 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3274 3275 le = chunks.Flink; 3276 while (le != &chunks) { 3277 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3278 3279 c->reloc = true; 3280 3281 le = le->Flink; 3282 } 3283 3284 ExReleaseResourceLite(&Vcb->chunk_lock); 3285 3286 // do data chunks before metadata 3287 le = chunks.Flink; 3288 while (le != &chunks) { 3289 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3290 LIST_ENTRY* le2 = le->Flink; 3291 3292 if (c->chunk_item->type & BLOCK_FLAG_DATA) { 3293 bool changed; 3294 3295 do { 3296 changed = false; 3297 3298 Status = balance_data_chunk(Vcb, c, &changed); 3299 if (!NT_SUCCESS(Status)) { 3300 ERR("balance_data_chunk returned %08lx\n", Status); 3301 Vcb->balance.status = Status; 3302 goto end; 3303 } 3304 3305 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3306 3307 if (Vcb->readonly) 3308 Vcb->balance.stopping = true; 3309 3310 if (Vcb->balance.stopping) 3311 break; 3312 } while (changed); 3313 3314 c->changed = true; 3315 c->space_changed = true; 3316 } 3317 3318 if (Vcb->balance.stopping) 3319 goto end; 3320 3321 if (c->chunk_item->type & BLOCK_FLAG_DATA && 3322 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) { 3323 RemoveEntryList(&c->list_entry_balance); 3324 c->list_entry_balance.Flink = NULL; 3325 3326 Vcb->balance.chunks_left--; 3327 } 3328 3329 le = le2; 3330 } 3331 3332 // do metadata chunks 3333 while (!IsListEmpty(&chunks)) { 3334 chunk* c; 3335 bool changed; 3336 3337 le = RemoveHeadList(&chunks); 3338 c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3339 3340 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) { 3341 do { 3342 Status = balance_metadata_chunk(Vcb, c, &changed); 3343 if (!NT_SUCCESS(Status)) { 3344 ERR("balance_metadata_chunk returned %08lx\n", Status); 3345 Vcb->balance.status = Status; 3346 goto end; 3347 } 3348 3349 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3350 3351 if (Vcb->readonly) 3352 Vcb->balance.stopping = true; 3353 3354 if (Vcb->balance.stopping) 3355 break; 3356 } while (changed); 3357 3358 c->changed = true; 3359 c->space_changed = true; 3360 } 3361 3362 if (Vcb->balance.stopping) 3363 break; 3364 3365 c->list_entry_balance.Flink = NULL; 3366 3367 Vcb->balance.chunks_left--; 3368 } 3369 3370 end: 3371 if (!Vcb->readonly) { 3372 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3373 le = chunks.Flink; 3374 while (le != &chunks) { 3375 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3376 c->reloc = false; 3377 3378 le = le->Flink; 3379 c->list_entry_balance.Flink = NULL; 3380 } 3381 3382 if (old_data_flags != 0) 3383 Vcb->data_flags = old_data_flags; 3384 3385 if (old_metadata_flags != 0) 3386 Vcb->metadata_flags = old_metadata_flags; 3387 3388 if (old_system_flags != 0) 3389 Vcb->system_flags = old_system_flags; 3390 } 3391 3392 if (Vcb->balance.removing) { 3393 device* dev = NULL; 3394 3395 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3396 3397 le = Vcb->devices.Flink; 3398 while (le != &Vcb->devices) { 3399 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3400 3401 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3402 dev = dev2; 3403 break; 3404 } 3405 3406 le = le->Flink; 3407 } 3408 3409 if (dev) { 3410 if (Vcb->balance.chunks_left == 0) { 3411 Status = finish_removing_device(Vcb, dev); 3412 3413 if (!NT_SUCCESS(Status)) { 3414 ERR("finish_removing_device returned %08lx\n", Status); 3415 dev->reloc = false; 3416 } 3417 } else 3418 dev->reloc = false; 3419 } 3420 3421 ExReleaseResourceLite(&Vcb->tree_lock); 3422 } else if (Vcb->balance.shrinking) { 3423 device* dev = NULL; 3424 3425 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3426 3427 le = Vcb->devices.Flink; 3428 while (le != &Vcb->devices) { 3429 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3430 3431 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3432 dev = dev2; 3433 break; 3434 } 3435 3436 le = le->Flink; 3437 } 3438 3439 if (!dev) { 3440 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid); 3441 Vcb->balance.status = STATUS_INTERNAL_ERROR; 3442 } 3443 3444 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3445 if (dev) { 3446 Status = regenerate_space_list(Vcb, dev); 3447 if (!NT_SUCCESS(Status)) 3448 WARN("regenerate_space_list returned %08lx\n", Status); 3449 } 3450 } else { 3451 uint64_t old_size; 3452 3453 old_size = dev->devitem.num_bytes; 3454 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start; 3455 3456 Status = update_dev_item(Vcb, dev, NULL); 3457 if (!NT_SUCCESS(Status)) { 3458 ERR("update_dev_item returned %08lx\n", Status); 3459 dev->devitem.num_bytes = old_size; 3460 Vcb->balance.status = Status; 3461 3462 Status = regenerate_space_list(Vcb, dev); 3463 if (!NT_SUCCESS(Status)) 3464 WARN("regenerate_space_list returned %08lx\n", Status); 3465 } else { 3466 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes; 3467 3468 Status = do_write(Vcb, NULL); 3469 if (!NT_SUCCESS(Status)) 3470 ERR("do_write returned %08lx\n", Status); 3471 3472 free_trees(Vcb); 3473 } 3474 } 3475 3476 ExReleaseResourceLite(&Vcb->tree_lock); 3477 3478 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status)) 3479 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3480 } else { 3481 Status = remove_balance_item(Vcb); 3482 if (!NT_SUCCESS(Status)) { 3483 ERR("remove_balance_item returned %08lx\n", Status); 3484 goto end; 3485 } 3486 } 3487 3488 if (Vcb->trim && !Vcb->options.no_trim) { 3489 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3490 3491 le = Vcb->devices.Flink; 3492 while (le != &Vcb->devices) { 3493 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3494 3495 if (dev2->devobj && !dev2->readonly && dev2->trim) 3496 trim_unalloc_space(Vcb, dev2); 3497 3498 le = le->Flink; 3499 } 3500 3501 ExReleaseResourceLite(&Vcb->tree_lock); 3502 } 3503 } 3504 3505 ZwClose(Vcb->balance.thread); 3506 Vcb->balance.thread = NULL; 3507 3508 KeSetEvent(&Vcb->balance.finished, 0, false); 3509 } 3510 3511 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3512 NTSTATUS Status; 3513 btrfs_start_balance* bsb = (btrfs_start_balance*)data; 3514 OBJECT_ATTRIBUTES oa; 3515 uint8_t i; 3516 3517 if (length < sizeof(btrfs_start_balance) || !data) 3518 return STATUS_INVALID_PARAMETER; 3519 3520 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3521 return STATUS_PRIVILEGE_NOT_HELD; 3522 3523 if (Vcb->locked) { 3524 WARN("cannot start balance while locked\n"); 3525 return STATUS_DEVICE_NOT_READY; 3526 } 3527 3528 if (Vcb->scrub.thread) { 3529 WARN("cannot start balance while scrub running\n"); 3530 return STATUS_DEVICE_NOT_READY; 3531 } 3532 3533 if (Vcb->balance.thread) { 3534 WARN("balance already running\n"); 3535 return STATUS_DEVICE_NOT_READY; 3536 } 3537 3538 if (Vcb->readonly) 3539 return STATUS_MEDIA_WRITE_PROTECTED; 3540 3541 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3542 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3543 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED)) 3544 return STATUS_SUCCESS; 3545 3546 for (i = 0; i < 3; i++) { 3547 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3548 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) { 3549 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 | 3550 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 | 3551 BLOCK_FLAG_RAID1C4; 3552 3553 if (bsb->opts[i].profiles == 0) 3554 return STATUS_INVALID_PARAMETER; 3555 } 3556 3557 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) { 3558 if (bsb->opts[i].devid == 0) 3559 return STATUS_INVALID_PARAMETER; 3560 } 3561 3562 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) { 3563 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end) 3564 return STATUS_INVALID_PARAMETER; 3565 } 3566 3567 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) { 3568 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end) 3569 return STATUS_INVALID_PARAMETER; 3570 } 3571 3572 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) { 3573 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start); 3574 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end); 3575 3576 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end) 3577 return STATUS_INVALID_PARAMETER; 3578 } 3579 3580 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) { 3581 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start); 3582 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end); 3583 3584 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3585 return STATUS_INVALID_PARAMETER; 3586 } 3587 3588 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) { 3589 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start); 3590 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end); 3591 3592 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3593 return STATUS_INVALID_PARAMETER; 3594 } 3595 3596 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3597 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 && 3598 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 && 3599 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 && 3600 bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 && 3601 bsb->opts[i].convert != BLOCK_FLAG_RAID1C4) 3602 return STATUS_INVALID_PARAMETER; 3603 } 3604 } 3605 } 3606 3607 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3608 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3609 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3610 3611 Vcb->balance.paused = false; 3612 Vcb->balance.removing = false; 3613 Vcb->balance.shrinking = false; 3614 Vcb->balance.status = STATUS_SUCCESS; 3615 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3616 3617 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3618 3619 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3620 if (!NT_SUCCESS(Status)) { 3621 ERR("PsCreateSystemThread returned %08lx\n", Status); 3622 return Status; 3623 } 3624 3625 return STATUS_SUCCESS; 3626 } 3627 3628 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) { 3629 KEY searchkey; 3630 traverse_ptr tp; 3631 NTSTATUS Status; 3632 BALANCE_ITEM* bi; 3633 OBJECT_ATTRIBUTES oa; 3634 int i; 3635 3636 searchkey.obj_id = BALANCE_ITEM_ID; 3637 searchkey.obj_type = TYPE_TEMP_ITEM; 3638 searchkey.offset = 0; 3639 3640 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 3641 if (!NT_SUCCESS(Status)) { 3642 ERR("find_item returned %08lx\n", Status); 3643 return Status; 3644 } 3645 3646 if (keycmp(tp.item->key, searchkey)) { 3647 TRACE("no balance item found\n"); 3648 return STATUS_NOT_FOUND; 3649 } 3650 3651 if (tp.item->size < sizeof(BALANCE_ITEM)) { 3652 WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, 3653 tp.item->size, sizeof(BALANCE_ITEM)); 3654 return STATUS_INTERNAL_ERROR; 3655 } 3656 3657 bi = (BALANCE_ITEM*)tp.item->data; 3658 3659 if (bi->flags & BALANCE_FLAGS_DATA) 3660 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 3661 3662 if (bi->flags & BALANCE_FLAGS_METADATA) 3663 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 3664 3665 if (bi->flags & BALANCE_FLAGS_SYSTEM) 3666 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 3667 3668 // do the heuristics that Linux driver does 3669 3670 for (i = 0; i < 3; i++) { 3671 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3672 // if converting, don't redo chunks already done 3673 3674 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3675 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT; 3676 3677 // don't balance chunks more than 90% filled - presumably these 3678 // have already been done 3679 3680 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) && 3681 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3682 ) { 3683 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE; 3684 Vcb->balance.opts[i].usage_start = 0; 3685 Vcb->balance.opts[i].usage_end = 90; 3686 } 3687 } 3688 } 3689 3690 if (Vcb->readonly || Vcb->options.skip_balance) 3691 Vcb->balance.paused = true; 3692 else 3693 Vcb->balance.paused = false; 3694 3695 Vcb->balance.removing = false; 3696 Vcb->balance.shrinking = false; 3697 Vcb->balance.status = STATUS_SUCCESS; 3698 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3699 3700 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3701 3702 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3703 if (!NT_SUCCESS(Status)) { 3704 ERR("PsCreateSystemThread returned %08lx\n", Status); 3705 return Status; 3706 } 3707 3708 return STATUS_SUCCESS; 3709 } 3710 3711 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) { 3712 btrfs_query_balance* bqb = (btrfs_query_balance*)data; 3713 3714 if (length < sizeof(btrfs_query_balance) || !data) 3715 return STATUS_INVALID_PARAMETER; 3716 3717 if (!Vcb->balance.thread) { 3718 bqb->status = BTRFS_BALANCE_STOPPED; 3719 3720 if (!NT_SUCCESS(Vcb->balance.status)) { 3721 bqb->status |= BTRFS_BALANCE_ERROR; 3722 bqb->error = Vcb->balance.status; 3723 } 3724 3725 return STATUS_SUCCESS; 3726 } 3727 3728 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING; 3729 3730 if (Vcb->balance.removing) 3731 bqb->status |= BTRFS_BALANCE_REMOVAL; 3732 3733 if (Vcb->balance.shrinking) 3734 bqb->status |= BTRFS_BALANCE_SHRINKING; 3735 3736 if (!NT_SUCCESS(Vcb->balance.status)) 3737 bqb->status |= BTRFS_BALANCE_ERROR; 3738 3739 bqb->chunks_left = Vcb->balance.chunks_left; 3740 bqb->total_chunks = Vcb->balance.total_chunks; 3741 bqb->error = Vcb->balance.status; 3742 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3743 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3744 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3745 3746 return STATUS_SUCCESS; 3747 } 3748 3749 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3750 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3751 return STATUS_PRIVILEGE_NOT_HELD; 3752 3753 if (!Vcb->balance.thread) 3754 return STATUS_DEVICE_NOT_READY; 3755 3756 if (Vcb->balance.paused) 3757 return STATUS_DEVICE_NOT_READY; 3758 3759 Vcb->balance.paused = true; 3760 KeClearEvent(&Vcb->balance.event); 3761 3762 return STATUS_SUCCESS; 3763 } 3764 3765 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3766 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3767 return STATUS_PRIVILEGE_NOT_HELD; 3768 3769 if (!Vcb->balance.thread) 3770 return STATUS_DEVICE_NOT_READY; 3771 3772 if (!Vcb->balance.paused) 3773 return STATUS_DEVICE_NOT_READY; 3774 3775 if (Vcb->readonly) 3776 return STATUS_MEDIA_WRITE_PROTECTED; 3777 3778 Vcb->balance.paused = false; 3779 KeSetEvent(&Vcb->balance.event, 0, false); 3780 3781 return STATUS_SUCCESS; 3782 } 3783 3784 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3785 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3786 return STATUS_PRIVILEGE_NOT_HELD; 3787 3788 if (!Vcb->balance.thread) 3789 return STATUS_DEVICE_NOT_READY; 3790 3791 Vcb->balance.paused = false; 3792 Vcb->balance.stopping = true; 3793 Vcb->balance.status = STATUS_SUCCESS; 3794 KeSetEvent(&Vcb->balance.event, 0, false); 3795 3796 return STATUS_SUCCESS; 3797 } 3798 3799 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3800 uint64_t devid; 3801 LIST_ENTRY* le; 3802 device* dev = NULL; 3803 NTSTATUS Status; 3804 int i; 3805 uint64_t num_rw_devices; 3806 OBJECT_ATTRIBUTES oa; 3807 3808 TRACE("(%p, %p, %lx)\n", Vcb, data, length); 3809 3810 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3811 return STATUS_PRIVILEGE_NOT_HELD; 3812 3813 if (length < sizeof(uint64_t)) 3814 return STATUS_INVALID_PARAMETER; 3815 3816 devid = *(uint64_t*)data; 3817 3818 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 3819 3820 if (Vcb->readonly) { 3821 ExReleaseResourceLite(&Vcb->tree_lock); 3822 return STATUS_MEDIA_WRITE_PROTECTED; 3823 } 3824 3825 num_rw_devices = 0; 3826 3827 le = Vcb->devices.Flink; 3828 while (le != &Vcb->devices) { 3829 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3830 3831 if (dev2->devitem.dev_id == devid) 3832 dev = dev2; 3833 3834 if (!dev2->readonly) 3835 num_rw_devices++; 3836 3837 le = le->Flink; 3838 } 3839 3840 if (!dev) { 3841 ExReleaseResourceLite(&Vcb->tree_lock); 3842 WARN("device %I64x not found\n", devid); 3843 return STATUS_NOT_FOUND; 3844 } 3845 3846 if (!dev->readonly) { 3847 if (num_rw_devices == 1) { 3848 ExReleaseResourceLite(&Vcb->tree_lock); 3849 WARN("not removing last non-readonly device\n"); 3850 return STATUS_INVALID_PARAMETER; 3851 } 3852 3853 if (num_rw_devices == 4 && 3854 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) || 3855 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) || 3856 (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4) 3857 ) 3858 ) { 3859 ExReleaseResourceLite(&Vcb->tree_lock); 3860 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n"); 3861 return STATUS_CANNOT_DELETE; 3862 } 3863 3864 if (num_rw_devices == 3 && 3865 ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) || 3866 (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3)) 3867 ) { 3868 ExReleaseResourceLite(&Vcb->tree_lock); 3869 ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n"); 3870 return STATUS_CANNOT_DELETE; 3871 } 3872 3873 if (num_rw_devices == 2 && 3874 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) || 3875 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1)) 3876 ) { 3877 ExReleaseResourceLite(&Vcb->tree_lock); 3878 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n"); 3879 return STATUS_CANNOT_DELETE; 3880 } 3881 } 3882 3883 ExReleaseResourceLite(&Vcb->tree_lock); 3884 3885 if (Vcb->balance.thread) { 3886 WARN("balance already running\n"); 3887 return STATUS_DEVICE_NOT_READY; 3888 } 3889 3890 dev->reloc = true; 3891 3892 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3); 3893 3894 for (i = 0; i < 3; i++) { 3895 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID; 3896 Vcb->balance.opts[i].devid = devid; 3897 } 3898 3899 Vcb->balance.paused = false; 3900 Vcb->balance.removing = true; 3901 Vcb->balance.shrinking = false; 3902 Vcb->balance.status = STATUS_SUCCESS; 3903 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3904 3905 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3906 3907 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3908 if (!NT_SUCCESS(Status)) { 3909 ERR("PsCreateSystemThread returned %08lx\n", Status); 3910 dev->reloc = false; 3911 return Status; 3912 } 3913 3914 return STATUS_SUCCESS; 3915 } 3916