1 /* Copyright (c) Mark Harmstone 2016-17 2 * 3 * This file is part of WinBtrfs. 4 * 5 * WinBtrfs is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public Licence as published by 7 * the Free Software Foundation, either version 3 of the Licence, or 8 * (at your option) any later version. 9 * 10 * WinBtrfs is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public Licence for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public Licence 16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include "btrfs_drv.h" 19 #include "btrfsioctl.h" 20 #include <ntddstor.h> 21 22 typedef struct { 23 uint64_t address; 24 uint64_t new_address; 25 tree_header* data; 26 EXTENT_ITEM* ei; 27 tree* t; 28 bool system; 29 LIST_ENTRY refs; 30 LIST_ENTRY list_entry; 31 } metadata_reloc; 32 33 typedef struct { 34 uint8_t type; 35 uint64_t hash; 36 37 union { 38 TREE_BLOCK_REF tbr; 39 SHARED_BLOCK_REF sbr; 40 }; 41 42 metadata_reloc* parent; 43 bool top; 44 LIST_ENTRY list_entry; 45 } metadata_reloc_ref; 46 47 typedef struct { 48 uint64_t address; 49 uint64_t size; 50 uint64_t new_address; 51 chunk* newchunk; 52 EXTENT_ITEM* ei; 53 LIST_ENTRY refs; 54 LIST_ENTRY list_entry; 55 } data_reloc; 56 57 typedef struct { 58 uint8_t type; 59 uint64_t hash; 60 61 union { 62 EXTENT_DATA_REF edr; 63 SHARED_DATA_REF sdr; 64 }; 65 66 metadata_reloc* parent; 67 LIST_ENTRY list_entry; 68 } data_reloc_ref; 69 70 #ifndef _MSC_VER // not in mingw yet 71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000 72 #endif 73 74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time 75 76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp, 77 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) { 78 NTSTATUS Status; 79 metadata_reloc* mr; 80 EXTENT_ITEM* ei; 81 uint16_t len; 82 uint64_t inline_rc; 83 uint8_t* ptr; 84 85 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG); 86 if (!mr) { 87 ERR("out of memory\n"); 88 return STATUS_INSUFFICIENT_RESOURCES; 89 } 90 91 mr->address = tp->item->key.obj_id; 92 mr->data = NULL; 93 mr->ei = (EXTENT_ITEM*)tp->item->data; 94 mr->system = false; 95 InitializeListHead(&mr->refs); 96 97 Status = delete_tree_item(Vcb, tp); 98 if (!NT_SUCCESS(Status)) { 99 ERR("delete_tree_item returned %08x\n", Status); 100 ExFreePool(mr); 101 return Status; 102 } 103 104 if (!c) 105 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 106 107 if (c) { 108 acquire_chunk_lock(c, Vcb); 109 110 c->used -= Vcb->superblock.node_size; 111 112 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback); 113 114 release_chunk_lock(c, Vcb); 115 } 116 117 ei = (EXTENT_ITEM*)tp->item->data; 118 inline_rc = 0; 119 120 len = tp->item->size - sizeof(EXTENT_ITEM); 121 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 122 if (!skinny) { 123 len -= sizeof(EXTENT_ITEM2); 124 ptr += sizeof(EXTENT_ITEM2); 125 } 126 127 while (len > 0) { 128 uint8_t secttype = *ptr; 129 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0); 130 metadata_reloc_ref* ref; 131 132 len--; 133 134 if (sectlen > len) { 135 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 136 return STATUS_INTERNAL_ERROR; 137 } 138 139 if (sectlen == 0) { 140 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 141 return STATUS_INTERNAL_ERROR; 142 } 143 144 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 145 if (!ref) { 146 ERR("out of memory\n"); 147 return STATUS_INSUFFICIENT_RESOURCES; 148 } 149 150 if (secttype == TYPE_TREE_BLOCK_REF) { 151 ref->type = TYPE_TREE_BLOCK_REF; 152 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF)); 153 inline_rc++; 154 } else if (secttype == TYPE_SHARED_BLOCK_REF) { 155 ref->type = TYPE_SHARED_BLOCK_REF; 156 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF)); 157 inline_rc++; 158 } else { 159 ERR("unexpected tree type %x\n", secttype); 160 ExFreePool(ref); 161 return STATUS_INTERNAL_ERROR; 162 } 163 164 ref->parent = NULL; 165 ref->top = false; 166 InsertTailList(&mr->refs, &ref->list_entry); 167 168 len -= sectlen; 169 ptr += sizeof(uint8_t) + sectlen; 170 } 171 172 if (inline_rc < ei->refcount) { // look for non-inline entries 173 traverse_ptr tp2 = *tp, next_tp; 174 175 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 176 tp2 = next_tp; 177 178 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 179 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) { 180 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 181 if (!ref) { 182 ERR("out of memory\n"); 183 return STATUS_INSUFFICIENT_RESOURCES; 184 } 185 186 ref->type = TYPE_TREE_BLOCK_REF; 187 ref->tbr.offset = tp2.item->key.offset; 188 ref->parent = NULL; 189 ref->top = false; 190 InsertTailList(&mr->refs, &ref->list_entry); 191 192 Status = delete_tree_item(Vcb, &tp2); 193 if (!NT_SUCCESS(Status)) { 194 ERR("delete_tree_item returned %08x\n", Status); 195 return Status; 196 } 197 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) { 198 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 199 if (!ref) { 200 ERR("out of memory\n"); 201 return STATUS_INSUFFICIENT_RESOURCES; 202 } 203 204 ref->type = TYPE_SHARED_BLOCK_REF; 205 ref->sbr.offset = tp2.item->key.offset; 206 ref->parent = NULL; 207 ref->top = false; 208 InsertTailList(&mr->refs, &ref->list_entry); 209 210 Status = delete_tree_item(Vcb, &tp2); 211 if (!NT_SUCCESS(Status)) { 212 ERR("delete_tree_item returned %08x\n", Status); 213 return Status; 214 } 215 } 216 } else 217 break; 218 } 219 } 220 221 InsertTailList(items, &mr->list_entry); 222 223 if (mr2) 224 *mr2 = mr; 225 226 return STATUS_SUCCESS; 227 } 228 229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 230 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) { 231 LIST_ENTRY* le; 232 KEY searchkey; 233 traverse_ptr tp; 234 bool skinny = false; 235 NTSTATUS Status; 236 237 le = items->Flink; 238 while (le != items) { 239 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 240 241 if (mr->address == address) { 242 *mr2 = mr; 243 return STATUS_SUCCESS; 244 } 245 246 le = le->Flink; 247 } 248 249 searchkey.obj_id = address; 250 searchkey.obj_type = TYPE_METADATA_ITEM; 251 searchkey.offset = 0xffffffffffffffff; 252 253 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 254 if (!NT_SUCCESS(Status)) { 255 ERR("find_item returned %08x\n", Status); 256 return Status; 257 } 258 259 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) 260 skinny = true; 261 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 262 tp.item->size >= sizeof(EXTENT_ITEM)) { 263 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 264 265 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) { 266 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address); 267 return STATUS_INTERNAL_ERROR; 268 } 269 } else { 270 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address); 271 return STATUS_INTERNAL_ERROR; 272 } 273 274 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback); 275 if (!NT_SUCCESS(Status)) { 276 ERR("add_metadata_reloc returned %08x\n", Status); 277 return Status; 278 } 279 280 return STATUS_SUCCESS; 281 } 282 283 static void sort_metadata_reloc_refs(metadata_reloc* mr) { 284 LIST_ENTRY newlist, *le; 285 286 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items 287 return; 288 289 // insertion sort 290 291 InitializeListHead(&newlist); 292 293 while (!IsListEmpty(&mr->refs)) { 294 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 295 bool inserted = false; 296 297 if (ref->type == TYPE_TREE_BLOCK_REF) 298 ref->hash = ref->tbr.offset; 299 else if (ref->type == TYPE_SHARED_BLOCK_REF) 300 ref->hash = ref->parent->new_address; 301 302 le = newlist.Flink; 303 while (le != &newlist) { 304 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 305 306 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 307 InsertHeadList(le->Blink, &ref->list_entry); 308 inserted = true; 309 break; 310 } 311 312 le = le->Flink; 313 } 314 315 if (!inserted) 316 InsertTailList(&newlist, &ref->list_entry); 317 } 318 319 newlist.Flink->Blink = &mr->refs; 320 newlist.Blink->Flink = &mr->refs; 321 mr->refs.Flink = newlist.Flink; 322 mr->refs.Blink = newlist.Blink; 323 } 324 325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) { 326 NTSTATUS Status; 327 LIST_ENTRY* le; 328 uint64_t rc = 0; 329 uint16_t inline_len; 330 bool all_inline = true; 331 metadata_reloc_ref* first_noninline = NULL; 332 EXTENT_ITEM* ei; 333 uint8_t* ptr; 334 335 inline_len = sizeof(EXTENT_ITEM); 336 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) 337 inline_len += sizeof(EXTENT_ITEM2); 338 339 sort_metadata_reloc_refs(mr); 340 341 le = mr->refs.Flink; 342 while (le != &mr->refs) { 343 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 344 uint16_t extlen = 0; 345 346 rc++; 347 348 if (ref->type == TYPE_TREE_BLOCK_REF) 349 extlen += sizeof(TREE_BLOCK_REF); 350 else if (ref->type == TYPE_SHARED_BLOCK_REF) 351 extlen += sizeof(SHARED_BLOCK_REF); 352 353 if (all_inline) { 354 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 355 all_inline = false; 356 first_noninline = ref; 357 } else 358 inline_len += extlen + 1; 359 } 360 361 le = le->Flink; 362 } 363 364 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 365 if (!ei) { 366 ERR("out of memory\n"); 367 return STATUS_INSUFFICIENT_RESOURCES; 368 } 369 370 ei->refcount = rc; 371 ei->generation = mr->ei->generation; 372 ei->flags = mr->ei->flags; 373 ptr = (uint8_t*)&ei[1]; 374 375 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) { 376 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr; 377 378 ei2->firstitem = *(KEY*)&mr->data[1]; 379 ei2->level = mr->data->level; 380 381 ptr += sizeof(EXTENT_ITEM2); 382 } 383 384 le = mr->refs.Flink; 385 while (le != &mr->refs) { 386 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 387 388 if (ref == first_noninline) 389 break; 390 391 *ptr = ref->type; 392 ptr++; 393 394 if (ref->type == TYPE_TREE_BLOCK_REF) { 395 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr; 396 397 tbr->offset = ref->tbr.offset; 398 399 ptr += sizeof(TREE_BLOCK_REF); 400 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 401 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr; 402 403 sbr->offset = ref->parent->new_address; 404 405 ptr += sizeof(SHARED_BLOCK_REF); 406 } 407 408 le = le->Flink; 409 } 410 411 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA) 412 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL); 413 else 414 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL); 415 416 if (!NT_SUCCESS(Status)) { 417 ERR("insert_tree_item returned %08x\n", Status); 418 ExFreePool(ei); 419 return Status; 420 } 421 422 if (!all_inline) { 423 le = &first_noninline->list_entry; 424 425 while (le != &mr->refs) { 426 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 427 428 if (ref->type == TYPE_TREE_BLOCK_REF) { 429 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL); 430 if (!NT_SUCCESS(Status)) { 431 ERR("insert_tree_item returned %08x\n", Status); 432 return Status; 433 } 434 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 435 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL); 436 if (!NT_SUCCESS(Status)) { 437 ERR("insert_tree_item returned %08x\n", Status); 438 return Status; 439 } 440 } 441 442 le = le->Flink; 443 } 444 } 445 446 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) { 447 if (mr->data->level > 0) { 448 uint16_t i; 449 internal_node* in = (internal_node*)&mr->data[1]; 450 451 for (i = 0; i < mr->data->num_items; i++) { 452 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL); 453 454 if (sbrrc > 0) { 455 SHARED_BLOCK_REF sbr; 456 457 sbr.offset = mr->new_address; 458 459 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL); 460 if (!NT_SUCCESS(Status)) { 461 ERR("increase_extent_refcount returned %08x\n", Status); 462 return Status; 463 } 464 465 sbr.offset = mr->address; 466 467 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, 468 sbr.offset, false, NULL); 469 if (!NT_SUCCESS(Status)) { 470 ERR("decrease_extent_refcount returned %08x\n", Status); 471 return Status; 472 } 473 } 474 } 475 } else { 476 uint16_t i; 477 leaf_node* ln = (leaf_node*)&mr->data[1]; 478 479 for (i = 0; i < mr->data->num_items; i++) { 480 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 481 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 482 483 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 484 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 485 486 if (ed2->size > 0) { // not sparse 487 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL); 488 489 if (sdrrc > 0) { 490 SHARED_DATA_REF sdr; 491 chunk* c; 492 493 sdr.offset = mr->new_address; 494 sdr.count = sdrrc; 495 496 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL); 497 if (!NT_SUCCESS(Status)) { 498 ERR("increase_extent_refcount returned %08x\n", Status); 499 return Status; 500 } 501 502 sdr.offset = mr->address; 503 504 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, 505 sdr.offset, false, NULL); 506 if (!NT_SUCCESS(Status)) { 507 ERR("decrease_extent_refcount returned %08x\n", Status); 508 return Status; 509 } 510 511 c = get_chunk_from_address(Vcb, ed2->address); 512 513 if (c) { 514 // check changed_extents 515 516 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true); 517 518 le = c->changed_extents.Flink; 519 520 while (le != &c->changed_extents) { 521 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 522 523 if (ce->address == ed2->address) { 524 LIST_ENTRY* le2; 525 526 le2 = ce->refs.Flink; 527 while (le2 != &ce->refs) { 528 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 529 530 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 531 cer->sdr.offset = mr->new_address; 532 break; 533 } 534 535 le2 = le2->Flink; 536 } 537 538 le2 = ce->old_refs.Flink; 539 while (le2 != &ce->old_refs) { 540 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 541 542 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 543 cer->sdr.offset = mr->new_address; 544 break; 545 } 546 547 le2 = le2->Flink; 548 } 549 550 break; 551 } 552 553 le = le->Flink; 554 } 555 556 ExReleaseResourceLite(&c->changed_extents_lock); 557 } 558 } 559 } 560 } 561 } 562 } 563 } 564 } 565 566 return STATUS_SUCCESS; 567 } 568 569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 570 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) { 571 LIST_ENTRY tree_writes, *le; 572 NTSTATUS Status; 573 traverse_ptr tp; 574 uint8_t level, max_level = 0; 575 chunk* newchunk = NULL; 576 577 InitializeListHead(&tree_writes); 578 579 le = items->Flink; 580 while (le != items) { 581 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 582 LIST_ENTRY* le2; 583 chunk* pc; 584 585 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG); 586 if (!mr->data) { 587 ERR("out of memory\n"); 588 return STATUS_INSUFFICIENT_RESOURCES; 589 } 590 591 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data, 592 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority); 593 if (!NT_SUCCESS(Status)) { 594 ERR("read_data returned %08x\n", Status); 595 return Status; 596 } 597 598 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM) 599 mr->system = true; 600 601 if (data_items && mr->data->level == 0) { 602 le2 = data_items->Flink; 603 while (le2 != data_items) { 604 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 605 leaf_node* ln = (leaf_node*)&mr->data[1]; 606 uint16_t i; 607 608 for (i = 0; i < mr->data->num_items; i++) { 609 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 610 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 611 612 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 613 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 614 615 if (ed2->address == dr->address) 616 ed2->address = dr->new_address; 617 } 618 } 619 } 620 621 le2 = le2->Flink; 622 } 623 } 624 625 if (mr->data->level > max_level) 626 max_level = mr->data->level; 627 628 le2 = mr->refs.Flink; 629 while (le2 != &mr->refs) { 630 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 631 632 if (ref->type == TYPE_TREE_BLOCK_REF) { 633 KEY* firstitem; 634 root* r = NULL; 635 LIST_ENTRY* le3; 636 tree* t; 637 638 firstitem = (KEY*)&mr->data[1]; 639 640 le3 = Vcb->roots.Flink; 641 while (le3 != &Vcb->roots) { 642 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 643 644 if (r2->id == ref->tbr.offset) { 645 r = r2; 646 break; 647 } 648 649 le3 = le3->Flink; 650 } 651 652 if (!r) { 653 ERR("could not find subvol with id %I64x\n", ref->tbr.offset); 654 return STATUS_INTERNAL_ERROR; 655 } 656 657 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL); 658 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 659 ERR("find_item_to_level returned %08x\n", Status); 660 return Status; 661 } 662 663 t = tp.tree; 664 while (t && t->header.level < mr->data->level + 1) { 665 t = t->parent; 666 } 667 668 if (!t) 669 ref->top = true; 670 else { 671 metadata_reloc* mr2; 672 673 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback); 674 if (!NT_SUCCESS(Status)) { 675 ERR("add_metadata_reloc_parent returned %08x\n", Status); 676 return Status; 677 } 678 679 ref->parent = mr2; 680 } 681 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 682 metadata_reloc* mr2; 683 684 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback); 685 if (!NT_SUCCESS(Status)) { 686 ERR("add_metadata_reloc_parent returned %08x\n", Status); 687 return Status; 688 } 689 690 ref->parent = mr2; 691 } 692 693 le2 = le2->Flink; 694 } 695 696 le = le->Flink; 697 } 698 699 le = items->Flink; 700 while (le != items) { 701 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 702 LIST_ENTRY* le2; 703 uint32_t hash; 704 705 mr->t = NULL; 706 707 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t)); 708 709 le2 = Vcb->trees_ptrs[hash >> 24]; 710 711 if (le2) { 712 while (le2 != &Vcb->trees_hash) { 713 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash); 714 715 if (t->header.address == mr->address) { 716 mr->t = t; 717 break; 718 } else if (t->hash > hash) 719 break; 720 721 le2 = le2->Flink; 722 } 723 } 724 725 le = le->Flink; 726 } 727 728 for (level = 0; level <= max_level; level++) { 729 le = items->Flink; 730 while (le != items) { 731 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 732 733 if (mr->data->level == level) { 734 bool done = false; 735 LIST_ENTRY* le2; 736 tree_write* tw; 737 uint64_t flags; 738 tree* t3; 739 740 if (mr->system) 741 flags = Vcb->system_flags; 742 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) 743 flags = Vcb->data_flags; 744 else 745 flags = Vcb->metadata_flags; 746 747 if (newchunk) { 748 acquire_chunk_lock(newchunk, Vcb); 749 750 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 751 newchunk->used += Vcb->superblock.node_size; 752 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 753 done = true; 754 } 755 756 release_chunk_lock(newchunk, Vcb); 757 } 758 759 if (!done) { 760 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 761 762 le2 = Vcb->chunks.Flink; 763 while (le2 != &Vcb->chunks) { 764 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 765 766 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) { 767 acquire_chunk_lock(c2, Vcb); 768 769 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) { 770 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) { 771 c2->used += Vcb->superblock.node_size; 772 space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback); 773 release_chunk_lock(c2, Vcb); 774 newchunk = c2; 775 done = true; 776 break; 777 } 778 } 779 780 release_chunk_lock(c2, Vcb); 781 } 782 783 le2 = le2->Flink; 784 } 785 786 // allocate new chunk if necessary 787 if (!done) { 788 Status = alloc_chunk(Vcb, flags, &newchunk, false); 789 790 if (!NT_SUCCESS(Status)) { 791 ERR("alloc_chunk returned %08x\n", Status); 792 ExReleaseResourceLite(&Vcb->chunk_lock); 793 goto end; 794 } 795 796 acquire_chunk_lock(newchunk, Vcb); 797 798 newchunk->balance_num = Vcb->balance.balance_num; 799 800 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 801 release_chunk_lock(newchunk, Vcb); 802 ExReleaseResourceLite(&Vcb->chunk_lock); 803 ERR("could not find address in new chunk\n"); 804 Status = STATUS_DISK_FULL; 805 goto end; 806 } else { 807 newchunk->used += Vcb->superblock.node_size; 808 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 809 } 810 811 release_chunk_lock(newchunk, Vcb); 812 } 813 814 ExReleaseResourceLite(&Vcb->chunk_lock); 815 } 816 817 // update parents 818 le2 = mr->refs.Flink; 819 while (le2 != &mr->refs) { 820 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 821 822 if (ref->parent) { 823 uint16_t i; 824 internal_node* in = (internal_node*)&ref->parent->data[1]; 825 826 for (i = 0; i < ref->parent->data->num_items; i++) { 827 if (in[i].address == mr->address) { 828 in[i].address = mr->new_address; 829 break; 830 } 831 } 832 833 if (ref->parent->t) { 834 LIST_ENTRY* le3; 835 836 le3 = ref->parent->t->itemlist.Flink; 837 while (le3 != &ref->parent->t->itemlist) { 838 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 839 840 if (!td->inserted && td->treeholder.address == mr->address) 841 td->treeholder.address = mr->new_address; 842 843 le3 = le3->Flink; 844 } 845 } 846 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) { 847 LIST_ENTRY* le3; 848 root* r = NULL; 849 850 // alter ROOT_ITEM 851 852 le3 = Vcb->roots.Flink; 853 while (le3 != &Vcb->roots) { 854 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 855 856 if (r2->id == ref->tbr.offset) { 857 r = r2; 858 break; 859 } 860 861 le3 = le3->Flink; 862 } 863 864 if (r) { 865 r->treeholder.address = mr->new_address; 866 867 if (r == Vcb->root_root) 868 Vcb->superblock.root_tree_addr = mr->new_address; 869 else if (r == Vcb->chunk_root) 870 Vcb->superblock.chunk_tree_addr = mr->new_address; 871 else if (r->root_item.block_number == mr->address) { 872 KEY searchkey; 873 ROOT_ITEM* ri; 874 875 r->root_item.block_number = mr->new_address; 876 877 searchkey.obj_id = r->id; 878 searchkey.obj_type = TYPE_ROOT_ITEM; 879 searchkey.offset = 0xffffffffffffffff; 880 881 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 882 if (!NT_SUCCESS(Status)) { 883 ERR("find_item returned %08x\n", Status); 884 goto end; 885 } 886 887 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) { 888 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id); 889 Status = STATUS_INTERNAL_ERROR; 890 goto end; 891 } 892 893 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG); 894 if (!ri) { 895 ERR("out of memory\n"); 896 Status = STATUS_INSUFFICIENT_RESOURCES; 897 goto end; 898 } 899 900 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM)); 901 902 Status = delete_tree_item(Vcb, &tp); 903 if (!NT_SUCCESS(Status)) { 904 ERR("delete_tree_item returned %08x\n", Status); 905 goto end; 906 } 907 908 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL); 909 if (!NT_SUCCESS(Status)) { 910 ERR("insert_tree_item returned %08x\n", Status); 911 goto end; 912 } 913 } 914 } 915 } 916 917 le2 = le2->Flink; 918 } 919 920 mr->data->address = mr->new_address; 921 922 t3 = mr->t; 923 924 while (t3) { 925 uint8_t h; 926 bool inserted; 927 tree* t4 = NULL; 928 929 // check if tree loaded more than once 930 if (t3->list_entry.Flink != &Vcb->trees_hash) { 931 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 932 933 if (nt->header.address == t3->header.address) 934 t4 = nt; 935 } 936 937 t3->header.address = mr->new_address; 938 939 h = t3->hash >> 24; 940 941 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) { 942 if (t3->list_entry_hash.Flink == &Vcb->trees_hash) 943 Vcb->trees_ptrs[h] = NULL; 944 else { 945 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 946 947 if (t2->hash >> 24 == h) 948 Vcb->trees_ptrs[h] = &t2->list_entry_hash; 949 else 950 Vcb->trees_ptrs[h] = NULL; 951 } 952 } 953 954 RemoveEntryList(&t3->list_entry_hash); 955 956 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t)); 957 h = t3->hash >> 24; 958 959 if (!Vcb->trees_ptrs[h]) { 960 uint8_t h2 = h; 961 962 le2 = Vcb->trees_hash.Flink; 963 964 if (h2 > 0) { 965 h2--; 966 do { 967 if (Vcb->trees_ptrs[h2]) { 968 le2 = Vcb->trees_ptrs[h2]; 969 break; 970 } 971 972 h2--; 973 } while (h2 > 0); 974 } 975 } else 976 le2 = Vcb->trees_ptrs[h]; 977 978 inserted = false; 979 while (le2 != &Vcb->trees_hash) { 980 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash); 981 982 if (t2->hash >= t3->hash) { 983 InsertHeadList(le2->Blink, &t3->list_entry_hash); 984 inserted = true; 985 break; 986 } 987 988 le2 = le2->Flink; 989 } 990 991 if (!inserted) 992 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash); 993 994 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h]) 995 Vcb->trees_ptrs[h] = &t3->list_entry_hash; 996 997 if (data_items && level == 0) { 998 le2 = data_items->Flink; 999 1000 while (le2 != data_items) { 1001 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 1002 LIST_ENTRY* le3 = t3->itemlist.Flink; 1003 1004 while (le3 != &t3->itemlist) { 1005 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 1006 1007 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 1008 EXTENT_DATA* ed = (EXTENT_DATA*)td->data; 1009 1010 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 1011 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1012 1013 if (ed2->address == dr->address) 1014 ed2->address = dr->new_address; 1015 } 1016 } 1017 1018 le3 = le3->Flink; 1019 } 1020 1021 le2 = le2->Flink; 1022 } 1023 } 1024 1025 t3 = t4; 1026 } 1027 1028 *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum)); 1029 1030 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG); 1031 if (!tw) { 1032 ERR("out of memory\n"); 1033 Status = STATUS_INSUFFICIENT_RESOURCES; 1034 goto end; 1035 } 1036 1037 tw->address = mr->new_address; 1038 tw->length = Vcb->superblock.node_size; 1039 tw->data = (uint8_t*)mr->data; 1040 tw->allocated = false; 1041 1042 if (IsListEmpty(&tree_writes)) 1043 InsertTailList(&tree_writes, &tw->list_entry); 1044 else { 1045 bool inserted = false; 1046 1047 le2 = tree_writes.Flink; 1048 while (le2 != &tree_writes) { 1049 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry); 1050 1051 if (tw2->address > tw->address) { 1052 InsertHeadList(le2->Blink, &tw->list_entry); 1053 inserted = true; 1054 break; 1055 } 1056 1057 le2 = le2->Flink; 1058 } 1059 1060 if (!inserted) 1061 InsertTailList(&tree_writes, &tw->list_entry); 1062 } 1063 } 1064 1065 le = le->Flink; 1066 } 1067 } 1068 1069 Status = do_tree_writes(Vcb, &tree_writes, true); 1070 if (!NT_SUCCESS(Status)) { 1071 ERR("do_tree_writes returned %08x\n", Status); 1072 goto end; 1073 } 1074 1075 le = items->Flink; 1076 while (le != items) { 1077 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 1078 1079 Status = add_metadata_reloc_extent_item(Vcb, mr); 1080 if (!NT_SUCCESS(Status)) { 1081 ERR("add_metadata_reloc_extent_item returned %08x\n", Status); 1082 goto end; 1083 } 1084 1085 le = le->Flink; 1086 } 1087 1088 Status = STATUS_SUCCESS; 1089 1090 end: 1091 while (!IsListEmpty(&tree_writes)) { 1092 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry); 1093 1094 if (tw->allocated) 1095 ExFreePool(tw->data); 1096 1097 ExFreePool(tw); 1098 } 1099 1100 return Status; 1101 } 1102 1103 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1104 KEY searchkey; 1105 traverse_ptr tp; 1106 NTSTATUS Status; 1107 bool b; 1108 LIST_ENTRY items, rollback; 1109 uint32_t loaded = 0; 1110 1111 TRACE("chunk %I64x\n", c->offset); 1112 1113 InitializeListHead(&rollback); 1114 InitializeListHead(&items); 1115 1116 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1117 1118 searchkey.obj_id = c->offset; 1119 searchkey.obj_type = TYPE_METADATA_ITEM; 1120 searchkey.offset = 0xffffffffffffffff; 1121 1122 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1123 if (!NT_SUCCESS(Status)) { 1124 ERR("find_item returned %08x\n", Status); 1125 goto end; 1126 } 1127 1128 do { 1129 traverse_ptr next_tp; 1130 1131 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1132 break; 1133 1134 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { 1135 bool tree = false, skinny = false; 1136 1137 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1138 tree = true; 1139 skinny = true; 1140 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 1141 tp.item->size >= sizeof(EXTENT_ITEM)) { 1142 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1143 1144 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1145 tree = true; 1146 } 1147 1148 if (tree) { 1149 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback); 1150 1151 if (!NT_SUCCESS(Status)) { 1152 ERR("add_metadata_reloc returned %08x\n", Status); 1153 goto end; 1154 } 1155 1156 loaded++; 1157 1158 if (loaded >= 64) // only do 64 at a time 1159 break; 1160 } 1161 } 1162 1163 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1164 1165 if (b) 1166 tp = next_tp; 1167 } while (b); 1168 1169 if (IsListEmpty(&items)) { 1170 *changed = false; 1171 Status = STATUS_SUCCESS; 1172 goto end; 1173 } else 1174 *changed = true; 1175 1176 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback); 1177 if (!NT_SUCCESS(Status)) { 1178 ERR("write_metadata_items returned %08x\n", Status); 1179 goto end; 1180 } 1181 1182 Status = STATUS_SUCCESS; 1183 1184 Vcb->need_write = true; 1185 1186 end: 1187 if (NT_SUCCESS(Status)) { 1188 Status = do_write(Vcb, NULL); 1189 if (!NT_SUCCESS(Status)) 1190 ERR("do_write returned %08x\n", Status); 1191 } 1192 1193 if (NT_SUCCESS(Status)) 1194 clear_rollback(&rollback); 1195 else 1196 do_rollback(Vcb, &rollback); 1197 1198 free_trees(Vcb); 1199 1200 ExReleaseResourceLite(&Vcb->tree_lock); 1201 1202 while (!IsListEmpty(&items)) { 1203 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry); 1204 1205 while (!IsListEmpty(&mr->refs)) { 1206 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 1207 1208 ExFreePool(ref); 1209 } 1210 1211 if (mr->data) 1212 ExFreePool(mr->data); 1213 1214 ExFreePool(mr); 1215 } 1216 1217 return Status; 1218 } 1219 1220 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items, 1221 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) { 1222 NTSTATUS Status; 1223 LIST_ENTRY* le; 1224 KEY searchkey; 1225 traverse_ptr tp; 1226 root* r = NULL; 1227 metadata_reloc* mr; 1228 uint64_t last_tree = 0; 1229 data_reloc_ref* ref; 1230 1231 le = Vcb->roots.Flink; 1232 while (le != &Vcb->roots) { 1233 root* r2 = CONTAINING_RECORD(le, root, list_entry); 1234 1235 if (r2->id == edr->root) { 1236 r = r2; 1237 break; 1238 } 1239 1240 le = le->Flink; 1241 } 1242 1243 if (!r) { 1244 ERR("could not find subvol %I64x\n", edr->count); 1245 return STATUS_INTERNAL_ERROR; 1246 } 1247 1248 searchkey.obj_id = edr->objid; 1249 searchkey.obj_type = TYPE_EXTENT_DATA; 1250 searchkey.offset = 0; 1251 1252 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL); 1253 if (!NT_SUCCESS(Status)) { 1254 ERR("find_item returned %08x\n", Status); 1255 return Status; 1256 } 1257 1258 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) { 1259 traverse_ptr tp2; 1260 1261 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1262 tp = tp2; 1263 else { 1264 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id); 1265 return STATUS_INTERNAL_ERROR; 1266 } 1267 } 1268 1269 ref = NULL; 1270 1271 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) { 1272 traverse_ptr tp2; 1273 1274 if (tp.item->size >= sizeof(EXTENT_DATA)) { 1275 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data; 1276 1277 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) { 1278 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1279 1280 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) { 1281 if (ref && last_tree == tp.tree->header.address) 1282 ref->edr.count++; 1283 else { 1284 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1285 if (!ref) { 1286 ERR("out of memory\n"); 1287 return STATUS_INSUFFICIENT_RESOURCES; 1288 } 1289 1290 ref->type = TYPE_EXTENT_DATA_REF; 1291 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF)); 1292 ref->edr.count = 1; 1293 1294 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback); 1295 if (!NT_SUCCESS(Status)) { 1296 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1297 ExFreePool(ref); 1298 return Status; 1299 } 1300 1301 last_tree = tp.tree->header.address; 1302 ref->parent = mr; 1303 1304 InsertTailList(&dr->refs, &ref->list_entry); 1305 } 1306 } 1307 } 1308 } 1309 1310 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1311 tp = tp2; 1312 else 1313 break; 1314 } 1315 1316 return STATUS_SUCCESS; 1317 } 1318 1319 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items, 1320 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) { 1321 NTSTATUS Status; 1322 data_reloc* dr; 1323 EXTENT_ITEM* ei; 1324 uint16_t len; 1325 uint64_t inline_rc; 1326 uint8_t* ptr; 1327 1328 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG); 1329 if (!dr) { 1330 ERR("out of memory\n"); 1331 return STATUS_INSUFFICIENT_RESOURCES; 1332 } 1333 1334 dr->address = tp->item->key.obj_id; 1335 dr->size = tp->item->key.offset; 1336 dr->ei = (EXTENT_ITEM*)tp->item->data; 1337 InitializeListHead(&dr->refs); 1338 1339 Status = delete_tree_item(Vcb, tp); 1340 if (!NT_SUCCESS(Status)) { 1341 ERR("delete_tree_item returned %08x\n", Status); 1342 return Status; 1343 } 1344 1345 if (!c) 1346 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 1347 1348 if (c) { 1349 acquire_chunk_lock(c, Vcb); 1350 1351 c->used -= tp->item->key.offset; 1352 1353 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback); 1354 1355 release_chunk_lock(c, Vcb); 1356 } 1357 1358 ei = (EXTENT_ITEM*)tp->item->data; 1359 inline_rc = 0; 1360 1361 len = tp->item->size - sizeof(EXTENT_ITEM); 1362 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 1363 1364 while (len > 0) { 1365 uint8_t secttype = *ptr; 1366 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0); 1367 1368 len--; 1369 1370 if (sectlen > len) { 1371 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 1372 return STATUS_INTERNAL_ERROR; 1373 } 1374 1375 if (sectlen == 0) { 1376 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 1377 return STATUS_INTERNAL_ERROR; 1378 } 1379 1380 if (secttype == TYPE_EXTENT_DATA_REF) { 1381 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t)); 1382 1383 inline_rc += edr->count; 1384 1385 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback); 1386 if (!NT_SUCCESS(Status)) { 1387 ERR("data_reloc_add_tree_edr returned %08x\n", Status); 1388 return Status; 1389 } 1390 } else if (secttype == TYPE_SHARED_DATA_REF) { 1391 metadata_reloc* mr; 1392 data_reloc_ref* ref; 1393 1394 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1395 if (!ref) { 1396 ERR("out of memory\n"); 1397 return STATUS_INSUFFICIENT_RESOURCES; 1398 } 1399 1400 ref->type = TYPE_SHARED_DATA_REF; 1401 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF)); 1402 inline_rc += ref->sdr.count; 1403 1404 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1405 if (!NT_SUCCESS(Status)) { 1406 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1407 ExFreePool(ref); 1408 return Status; 1409 } 1410 1411 ref->parent = mr; 1412 1413 InsertTailList(&dr->refs, &ref->list_entry); 1414 } else { 1415 ERR("unexpected tree type %x\n", secttype); 1416 return STATUS_INTERNAL_ERROR; 1417 } 1418 1419 1420 len -= sectlen; 1421 ptr += sizeof(uint8_t) + sectlen; 1422 } 1423 1424 if (inline_rc < ei->refcount) { // look for non-inline entries 1425 traverse_ptr tp2 = *tp, next_tp; 1426 1427 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 1428 tp2 = next_tp; 1429 1430 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 1431 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) { 1432 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback); 1433 if (!NT_SUCCESS(Status)) { 1434 ERR("data_reloc_add_tree_edr returned %08x\n", Status); 1435 return Status; 1436 } 1437 1438 Status = delete_tree_item(Vcb, &tp2); 1439 if (!NT_SUCCESS(Status)) { 1440 ERR("delete_tree_item returned %08x\n", Status); 1441 return Status; 1442 } 1443 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) { 1444 metadata_reloc* mr; 1445 data_reloc_ref* ref; 1446 1447 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1448 if (!ref) { 1449 ERR("out of memory\n"); 1450 return STATUS_INSUFFICIENT_RESOURCES; 1451 } 1452 1453 ref->type = TYPE_SHARED_DATA_REF; 1454 ref->sdr.offset = tp2.item->key.offset; 1455 ref->sdr.count = *((uint32_t*)tp2.item->data); 1456 1457 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1458 if (!NT_SUCCESS(Status)) { 1459 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1460 ExFreePool(ref); 1461 return Status; 1462 } 1463 1464 ref->parent = mr; 1465 InsertTailList(&dr->refs, &ref->list_entry); 1466 1467 Status = delete_tree_item(Vcb, &tp2); 1468 if (!NT_SUCCESS(Status)) { 1469 ERR("delete_tree_item returned %08x\n", Status); 1470 return Status; 1471 } 1472 } 1473 } else 1474 break; 1475 } 1476 } 1477 1478 InsertTailList(items, &dr->list_entry); 1479 1480 return STATUS_SUCCESS; 1481 } 1482 1483 static void sort_data_reloc_refs(data_reloc* dr) { 1484 LIST_ENTRY newlist, *le; 1485 1486 if (IsListEmpty(&dr->refs)) 1487 return; 1488 1489 // insertion sort 1490 1491 InitializeListHead(&newlist); 1492 1493 while (!IsListEmpty(&dr->refs)) { 1494 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 1495 bool inserted = false; 1496 1497 if (ref->type == TYPE_EXTENT_DATA_REF) 1498 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset); 1499 else if (ref->type == TYPE_SHARED_DATA_REF) 1500 ref->hash = ref->parent->new_address; 1501 1502 le = newlist.Flink; 1503 while (le != &newlist) { 1504 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1505 1506 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 1507 InsertHeadList(le->Blink, &ref->list_entry); 1508 inserted = true; 1509 break; 1510 } 1511 1512 le = le->Flink; 1513 } 1514 1515 if (!inserted) 1516 InsertTailList(&newlist, &ref->list_entry); 1517 } 1518 1519 le = newlist.Flink; 1520 while (le != &newlist) { 1521 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1522 1523 if (le->Flink != &newlist) { 1524 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry); 1525 1526 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root && 1527 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) { 1528 RemoveEntryList(&ref2->list_entry); 1529 ref->edr.count += ref2->edr.count; 1530 ExFreePool(ref2); 1531 continue; 1532 } 1533 } 1534 1535 le = le->Flink; 1536 } 1537 1538 newlist.Flink->Blink = &dr->refs; 1539 newlist.Blink->Flink = &dr->refs; 1540 dr->refs.Flink = newlist.Flink; 1541 dr->refs.Blink = newlist.Blink; 1542 } 1543 1544 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) { 1545 NTSTATUS Status; 1546 LIST_ENTRY* le; 1547 uint64_t rc = 0; 1548 uint16_t inline_len; 1549 bool all_inline = true; 1550 data_reloc_ref* first_noninline = NULL; 1551 EXTENT_ITEM* ei; 1552 uint8_t* ptr; 1553 1554 inline_len = sizeof(EXTENT_ITEM); 1555 1556 sort_data_reloc_refs(dr); 1557 1558 le = dr->refs.Flink; 1559 while (le != &dr->refs) { 1560 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1561 uint16_t extlen = 0; 1562 1563 if (ref->type == TYPE_EXTENT_DATA_REF) { 1564 extlen += sizeof(EXTENT_DATA_REF); 1565 rc += ref->edr.count; 1566 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1567 extlen += sizeof(SHARED_DATA_REF); 1568 rc++; 1569 } 1570 1571 if (all_inline) { 1572 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 1573 all_inline = false; 1574 first_noninline = ref; 1575 } else 1576 inline_len += extlen + 1; 1577 } 1578 1579 le = le->Flink; 1580 } 1581 1582 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 1583 if (!ei) { 1584 ERR("out of memory\n"); 1585 return STATUS_INSUFFICIENT_RESOURCES; 1586 } 1587 1588 ei->refcount = rc; 1589 ei->generation = dr->ei->generation; 1590 ei->flags = dr->ei->flags; 1591 ptr = (uint8_t*)&ei[1]; 1592 1593 le = dr->refs.Flink; 1594 while (le != &dr->refs) { 1595 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1596 1597 if (ref == first_noninline) 1598 break; 1599 1600 *ptr = ref->type; 1601 ptr++; 1602 1603 if (ref->type == TYPE_EXTENT_DATA_REF) { 1604 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr; 1605 1606 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1607 1608 ptr += sizeof(EXTENT_DATA_REF); 1609 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1610 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr; 1611 1612 sdr->offset = ref->parent->new_address; 1613 sdr->count = ref->sdr.count; 1614 1615 ptr += sizeof(SHARED_DATA_REF); 1616 } 1617 1618 le = le->Flink; 1619 } 1620 1621 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL); 1622 if (!NT_SUCCESS(Status)) { 1623 ERR("insert_tree_item returned %08x\n", Status); 1624 return Status; 1625 } 1626 1627 if (!all_inline) { 1628 le = &first_noninline->list_entry; 1629 1630 while (le != &dr->refs) { 1631 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1632 1633 if (ref->type == TYPE_EXTENT_DATA_REF) { 1634 EXTENT_DATA_REF* edr; 1635 1636 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG); 1637 if (!edr) { 1638 ERR("out of memory\n"); 1639 return STATUS_INSUFFICIENT_RESOURCES; 1640 } 1641 1642 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1643 1644 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL); 1645 if (!NT_SUCCESS(Status)) { 1646 ERR("insert_tree_item returned %08x\n", Status); 1647 return Status; 1648 } 1649 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1650 uint32_t* sdr; 1651 1652 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG); 1653 if (!sdr) { 1654 ERR("out of memory\n"); 1655 return STATUS_INSUFFICIENT_RESOURCES; 1656 } 1657 1658 *sdr = ref->sdr.count; 1659 1660 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL); 1661 if (!NT_SUCCESS(Status)) { 1662 ERR("insert_tree_item returned %08x\n", Status); 1663 return Status; 1664 } 1665 } 1666 1667 le = le->Flink; 1668 } 1669 } 1670 1671 return STATUS_SUCCESS; 1672 } 1673 1674 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1675 KEY searchkey; 1676 traverse_ptr tp; 1677 NTSTATUS Status; 1678 bool b; 1679 LIST_ENTRY items, metadata_items, rollback, *le; 1680 uint64_t loaded = 0, num_loaded = 0; 1681 chunk* newchunk = NULL; 1682 uint8_t* data = NULL; 1683 1684 TRACE("chunk %I64x\n", c->offset); 1685 1686 InitializeListHead(&rollback); 1687 InitializeListHead(&items); 1688 InitializeListHead(&metadata_items); 1689 1690 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1691 1692 searchkey.obj_id = c->offset; 1693 searchkey.obj_type = TYPE_EXTENT_ITEM; 1694 searchkey.offset = 0xffffffffffffffff; 1695 1696 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1697 if (!NT_SUCCESS(Status)) { 1698 ERR("find_item returned %08x\n", Status); 1699 goto end; 1700 } 1701 1702 do { 1703 traverse_ptr next_tp; 1704 1705 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1706 break; 1707 1708 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) { 1709 bool tree = false; 1710 1711 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1712 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1713 1714 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1715 tree = true; 1716 } 1717 1718 if (!tree) { 1719 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback); 1720 1721 if (!NT_SUCCESS(Status)) { 1722 ERR("add_data_reloc returned %08x\n", Status); 1723 goto end; 1724 } 1725 1726 loaded += tp.item->key.offset; 1727 num_loaded++; 1728 1729 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously 1730 break; 1731 } 1732 } 1733 1734 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1735 1736 if (b) 1737 tp = next_tp; 1738 } while (b); 1739 1740 if (IsListEmpty(&items)) { 1741 *changed = false; 1742 Status = STATUS_SUCCESS; 1743 goto end; 1744 } else 1745 *changed = true; 1746 1747 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG); 1748 if (!data) { 1749 ERR("out of memory\n"); 1750 Status = STATUS_INSUFFICIENT_RESOURCES; 1751 goto end; 1752 } 1753 1754 le = items.Flink; 1755 while (le != &items) { 1756 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 1757 bool done = false; 1758 LIST_ENTRY* le2; 1759 uint32_t* csum; 1760 RTL_BITMAP bmp; 1761 ULONG* bmparr; 1762 ULONG bmplen, runlength, index, lastoff; 1763 1764 if (newchunk) { 1765 acquire_chunk_lock(newchunk, Vcb); 1766 1767 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1768 newchunk->used += dr->size; 1769 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1770 done = true; 1771 } 1772 1773 release_chunk_lock(newchunk, Vcb); 1774 } 1775 1776 if (!done) { 1777 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 1778 1779 le2 = Vcb->chunks.Flink; 1780 while (le2 != &Vcb->chunks) { 1781 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 1782 1783 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) { 1784 acquire_chunk_lock(c2, Vcb); 1785 1786 if ((c2->chunk_item->size - c2->used) >= dr->size) { 1787 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) { 1788 c2->used += dr->size; 1789 space_list_subtract(c2, false, dr->new_address, dr->size, &rollback); 1790 release_chunk_lock(c2, Vcb); 1791 newchunk = c2; 1792 done = true; 1793 break; 1794 } 1795 } 1796 1797 release_chunk_lock(c2, Vcb); 1798 } 1799 1800 le2 = le2->Flink; 1801 } 1802 1803 // allocate new chunk if necessary 1804 if (!done) { 1805 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false); 1806 1807 if (!NT_SUCCESS(Status)) { 1808 ERR("alloc_chunk returned %08x\n", Status); 1809 ExReleaseResourceLite(&Vcb->chunk_lock); 1810 goto end; 1811 } 1812 1813 acquire_chunk_lock(newchunk, Vcb); 1814 1815 newchunk->balance_num = Vcb->balance.balance_num; 1816 1817 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1818 release_chunk_lock(newchunk, Vcb); 1819 ExReleaseResourceLite(&Vcb->chunk_lock); 1820 ERR("could not find address in new chunk\n"); 1821 Status = STATUS_DISK_FULL; 1822 goto end; 1823 } else { 1824 newchunk->used += dr->size; 1825 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1826 } 1827 1828 release_chunk_lock(newchunk, Vcb); 1829 } 1830 1831 ExReleaseResourceLite(&Vcb->chunk_lock); 1832 } 1833 1834 dr->newchunk = newchunk; 1835 1836 bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size); 1837 1838 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG); 1839 if (!bmparr) { 1840 ERR("out of memory\n"); 1841 Status = STATUS_INSUFFICIENT_RESOURCES; 1842 goto end; 1843 } 1844 1845 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG); 1846 if (!csum) { 1847 ERR("out of memory\n"); 1848 ExFreePool(bmparr); 1849 Status = STATUS_INSUFFICIENT_RESOURCES; 1850 goto end; 1851 } 1852 1853 RtlInitializeBitMap(&bmp, bmparr, bmplen); 1854 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum 1855 1856 searchkey.obj_id = EXTENT_CSUM_ID; 1857 searchkey.obj_type = TYPE_EXTENT_CSUM; 1858 searchkey.offset = dr->address; 1859 1860 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL); 1861 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 1862 ERR("find_item returned %08x\n", Status); 1863 ExFreePool(csum); 1864 ExFreePool(bmparr); 1865 goto end; 1866 } 1867 1868 if (Status != STATUS_NOT_FOUND) { 1869 do { 1870 traverse_ptr next_tp; 1871 1872 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) { 1873 if (tp.item->key.offset >= dr->address + dr->size) 1874 break; 1875 else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) { 1876 uint64_t cs = max(dr->address, tp.item->key.offset); 1877 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t))); 1878 1879 RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size), 1880 tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size), 1881 (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size)); 1882 1883 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size)); 1884 1885 if (ce == dr->address + dr->size) 1886 break; 1887 } 1888 } 1889 1890 if (find_next_item(Vcb, &tp, &next_tp, false, NULL)) 1891 tp = next_tp; 1892 else 1893 break; 1894 } while (true); 1895 } 1896 1897 lastoff = 0; 1898 runlength = RtlFindFirstRunClear(&bmp, &index); 1899 1900 while (runlength != 0) { 1901 if (index >= bmplen) 1902 break; 1903 1904 if (index + runlength >= bmplen) { 1905 runlength = bmplen - index; 1906 1907 if (runlength == 0) 1908 break; 1909 } 1910 1911 if (index > lastoff) { 1912 ULONG off = lastoff; 1913 ULONG size = index - lastoff; 1914 1915 // handle no csum run 1916 do { 1917 ULONG rl; 1918 1919 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1920 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1921 else 1922 rl = size; 1923 1924 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 1925 c, NULL, NULL, 0, false, NormalPagePriority); 1926 if (!NT_SUCCESS(Status)) { 1927 ERR("read_data returned %08x\n", Status); 1928 ExFreePool(csum); 1929 ExFreePool(bmparr); 1930 goto end; 1931 } 1932 1933 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1934 NULL, newchunk, false, 0, NormalPagePriority); 1935 if (!NT_SUCCESS(Status)) { 1936 ERR("write_data_complete returned %08x\n", Status); 1937 ExFreePool(csum); 1938 ExFreePool(bmparr); 1939 goto end; 1940 } 1941 1942 size -= rl; 1943 off += rl; 1944 } while (size > 0); 1945 } 1946 1947 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL); 1948 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL); 1949 1950 // handle csum run 1951 do { 1952 ULONG rl; 1953 1954 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT) 1955 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1956 else 1957 rl = runlength; 1958 1959 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data, 1960 c, NULL, NULL, 0, false, NormalPagePriority); 1961 if (!NT_SUCCESS(Status)) { 1962 ERR("read_data returned %08x\n", Status); 1963 ExFreePool(csum); 1964 ExFreePool(bmparr); 1965 goto end; 1966 } 1967 1968 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1969 NULL, newchunk, false, 0, NormalPagePriority); 1970 if (!NT_SUCCESS(Status)) { 1971 ERR("write_data_complete returned %08x\n", Status); 1972 ExFreePool(csum); 1973 ExFreePool(bmparr); 1974 goto end; 1975 } 1976 1977 runlength -= rl; 1978 index += rl; 1979 } while (runlength > 0); 1980 1981 lastoff = index; 1982 runlength = RtlFindNextForwardRunClear(&bmp, index, &index); 1983 } 1984 1985 ExFreePool(csum); 1986 ExFreePool(bmparr); 1987 1988 // handle final nocsum run 1989 if (lastoff < dr->size / Vcb->superblock.sector_size) { 1990 ULONG off = lastoff; 1991 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff); 1992 1993 do { 1994 ULONG rl; 1995 1996 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1997 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1998 else 1999 rl = size; 2000 2001 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 2002 c, NULL, NULL, 0, false, NormalPagePriority); 2003 if (!NT_SUCCESS(Status)) { 2004 ERR("read_data returned %08x\n", Status); 2005 goto end; 2006 } 2007 2008 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 2009 NULL, newchunk, false, 0, NormalPagePriority); 2010 if (!NT_SUCCESS(Status)) { 2011 ERR("write_data_complete returned %08x\n", Status); 2012 goto end; 2013 } 2014 2015 size -= rl; 2016 off += rl; 2017 } while (size > 0); 2018 } 2019 2020 le = le->Flink; 2021 } 2022 2023 ExFreePool(data); 2024 data = NULL; 2025 2026 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback); 2027 if (!NT_SUCCESS(Status)) { 2028 ERR("write_metadata_items returned %08x\n", Status); 2029 goto end; 2030 } 2031 2032 le = items.Flink; 2033 while (le != &items) { 2034 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 2035 2036 Status = add_data_reloc_extent_item(Vcb, dr); 2037 if (!NT_SUCCESS(Status)) { 2038 ERR("add_data_reloc_extent_item returned %08x\n", Status); 2039 goto end; 2040 } 2041 2042 le = le->Flink; 2043 } 2044 2045 le = c->changed_extents.Flink; 2046 while (le != &c->changed_extents) { 2047 LIST_ENTRY *le2, *le3; 2048 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 2049 2050 le3 = le->Flink; 2051 2052 le2 = items.Flink; 2053 while (le2 != &items) { 2054 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 2055 2056 if (ce->address == dr->address) { 2057 ce->address = dr->new_address; 2058 RemoveEntryList(&ce->list_entry); 2059 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry); 2060 break; 2061 } 2062 2063 le2 = le2->Flink; 2064 } 2065 2066 le = le3; 2067 } 2068 2069 Status = STATUS_SUCCESS; 2070 2071 Vcb->need_write = true; 2072 2073 end: 2074 if (NT_SUCCESS(Status)) { 2075 // update extents in cache inodes before we flush 2076 le = Vcb->chunks.Flink; 2077 while (le != &Vcb->chunks) { 2078 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry); 2079 2080 if (c2->cache) { 2081 LIST_ENTRY* le2; 2082 2083 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true); 2084 2085 le2 = c2->cache->extents.Flink; 2086 while (le2 != &c2->cache->extents) { 2087 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2088 2089 if (!ext->ignore) { 2090 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2091 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2092 2093 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2094 LIST_ENTRY* le3 = items.Flink; 2095 while (le3 != &items) { 2096 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2097 2098 if (ed2->address == dr->address) { 2099 ed2->address = dr->new_address; 2100 break; 2101 } 2102 2103 le3 = le3->Flink; 2104 } 2105 } 2106 } 2107 } 2108 2109 le2 = le2->Flink; 2110 } 2111 2112 ExReleaseResourceLite(c2->cache->Header.Resource); 2113 } 2114 2115 le = le->Flink; 2116 } 2117 2118 Status = do_write(Vcb, NULL); 2119 if (!NT_SUCCESS(Status)) 2120 ERR("do_write returned %08x\n", Status); 2121 } 2122 2123 if (NT_SUCCESS(Status)) { 2124 clear_rollback(&rollback); 2125 2126 // update open FCBs 2127 // FIXME - speed this up(?) 2128 2129 le = Vcb->all_fcbs.Flink; 2130 while (le != &Vcb->all_fcbs) { 2131 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all); 2132 LIST_ENTRY* le2; 2133 2134 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true); 2135 2136 le2 = fcb->extents.Flink; 2137 while (le2 != &fcb->extents) { 2138 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2139 2140 if (!ext->ignore) { 2141 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2142 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2143 2144 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2145 LIST_ENTRY* le3 = items.Flink; 2146 while (le3 != &items) { 2147 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2148 2149 if (ed2->address == dr->address) { 2150 ed2->address = dr->new_address; 2151 break; 2152 } 2153 2154 le3 = le3->Flink; 2155 } 2156 } 2157 } 2158 } 2159 2160 le2 = le2->Flink; 2161 } 2162 2163 ExReleaseResourceLite(fcb->Header.Resource); 2164 2165 le = le->Flink; 2166 } 2167 } else 2168 do_rollback(Vcb, &rollback); 2169 2170 free_trees(Vcb); 2171 2172 ExReleaseResourceLite(&Vcb->tree_lock); 2173 2174 if (data) 2175 ExFreePool(data); 2176 2177 while (!IsListEmpty(&items)) { 2178 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry); 2179 2180 while (!IsListEmpty(&dr->refs)) { 2181 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 2182 2183 ExFreePool(ref); 2184 } 2185 2186 ExFreePool(dr); 2187 } 2188 2189 while (!IsListEmpty(&metadata_items)) { 2190 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry); 2191 2192 while (!IsListEmpty(&mr->refs)) { 2193 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 2194 2195 ExFreePool(ref); 2196 } 2197 2198 if (mr->data) 2199 ExFreePool(mr->data); 2200 2201 ExFreePool(mr); 2202 } 2203 2204 return Status; 2205 } 2206 2207 static __inline uint64_t get_chunk_dup_type(chunk* c) { 2208 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2209 return BLOCK_FLAG_RAID0; 2210 else if (c->chunk_item->type & BLOCK_FLAG_RAID1) 2211 return BLOCK_FLAG_RAID1; 2212 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE) 2213 return BLOCK_FLAG_DUPLICATE; 2214 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2215 return BLOCK_FLAG_RAID10; 2216 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2217 return BLOCK_FLAG_RAID5; 2218 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2219 return BLOCK_FLAG_RAID6; 2220 else 2221 return BLOCK_FLAG_SINGLE; 2222 } 2223 2224 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) { 2225 btrfs_balance_opts* opts; 2226 2227 opts = &Vcb->balance.opts[sort]; 2228 2229 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED)) 2230 return false; 2231 2232 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2233 uint64_t type = get_chunk_dup_type(c); 2234 2235 if (!(type & opts->profiles)) 2236 return false; 2237 } 2238 2239 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2240 uint16_t i; 2241 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2242 bool b = false; 2243 2244 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2245 if (cis[i].dev_id == opts->devid) { 2246 b = true; 2247 break; 2248 } 2249 } 2250 2251 if (!b) 2252 return false; 2253 } 2254 2255 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2256 uint16_t i, factor; 2257 uint64_t physsize; 2258 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2259 bool b = false; 2260 2261 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2262 factor = c->chunk_item->num_stripes; 2263 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2264 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 2265 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2266 factor = c->chunk_item->num_stripes - 1; 2267 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2268 factor = c->chunk_item->num_stripes - 2; 2269 else // SINGLE, DUPLICATE, RAID1 2270 factor = 1; 2271 2272 physsize = c->chunk_item->size / factor; 2273 2274 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2275 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start && 2276 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) { 2277 b = true; 2278 break; 2279 } 2280 } 2281 2282 if (!b) 2283 return false; 2284 } 2285 2286 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2287 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end) 2288 return false; 2289 } 2290 2291 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2292 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end) 2293 return false; 2294 } 2295 2296 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2297 uint64_t usage = c->used * 100 / c->chunk_item->size; 2298 2299 // usage == 0 should mean completely empty, not just that usage rounds to 0% 2300 if (c->used > 0 && usage == 0) 2301 usage = 1; 2302 2303 if (usage < opts->usage_start || usage > opts->usage_end) 2304 return false; 2305 } 2306 2307 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) { 2308 uint64_t type = get_chunk_dup_type(c); 2309 2310 if (type == opts->convert) 2311 return false; 2312 } 2313 2314 return true; 2315 } 2316 2317 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2318 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2319 args->profiles = opts->profiles; 2320 args->flags |= BALANCE_ARGS_FLAGS_PROFILES; 2321 } 2322 2323 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2324 if (args->usage_start == 0) { 2325 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE; 2326 args->usage_start = opts->usage_start; 2327 args->usage_end = opts->usage_end; 2328 } else { 2329 args->flags |= BALANCE_ARGS_FLAGS_USAGE; 2330 args->usage = opts->usage_end; 2331 } 2332 } 2333 2334 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2335 args->devid = opts->devid; 2336 args->flags |= BALANCE_ARGS_FLAGS_DEVID; 2337 } 2338 2339 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2340 args->drange_start = opts->drange_start; 2341 args->drange_end = opts->drange_end; 2342 args->flags |= BALANCE_ARGS_FLAGS_DRANGE; 2343 } 2344 2345 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2346 args->vrange_start = opts->vrange_start; 2347 args->vrange_end = opts->vrange_end; 2348 args->flags |= BALANCE_ARGS_FLAGS_VRANGE; 2349 } 2350 2351 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) { 2352 args->convert = opts->convert; 2353 args->flags |= BALANCE_ARGS_FLAGS_CONVERT; 2354 2355 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT) 2356 args->flags |= BALANCE_ARGS_FLAGS_SOFT; 2357 } 2358 2359 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) { 2360 if (args->limit_start == 0) { 2361 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE; 2362 args->limit_start = (uint32_t)opts->limit_start; 2363 args->limit_end = (uint32_t)opts->limit_end; 2364 } else { 2365 args->flags |= BALANCE_ARGS_FLAGS_LIMIT; 2366 args->limit = opts->limit_end; 2367 } 2368 } 2369 2370 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2371 args->stripes_start = opts->stripes_start; 2372 args->stripes_end = opts->stripes_end; 2373 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE; 2374 } 2375 } 2376 2377 static NTSTATUS add_balance_item(device_extension* Vcb) { 2378 KEY searchkey; 2379 traverse_ptr tp; 2380 NTSTATUS Status; 2381 BALANCE_ITEM* bi; 2382 2383 searchkey.obj_id = BALANCE_ITEM_ID; 2384 searchkey.obj_type = TYPE_TEMP_ITEM; 2385 searchkey.offset = 0; 2386 2387 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2388 2389 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2390 if (!NT_SUCCESS(Status)) { 2391 ERR("find_item returned %08x\n", Status); 2392 goto end; 2393 } 2394 2395 if (!keycmp(tp.item->key, searchkey)) { 2396 Status = delete_tree_item(Vcb, &tp); 2397 if (!NT_SUCCESS(Status)) { 2398 ERR("delete_tree_item returned %08x\n", Status); 2399 goto end; 2400 } 2401 } 2402 2403 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG); 2404 if (!bi) { 2405 ERR("out of memory\n"); 2406 Status = STATUS_INSUFFICIENT_RESOURCES; 2407 goto end; 2408 } 2409 2410 RtlZeroMemory(bi, sizeof(BALANCE_ITEM)); 2411 2412 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2413 bi->flags |= BALANCE_FLAGS_DATA; 2414 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 2415 } 2416 2417 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2418 bi->flags |= BALANCE_FLAGS_METADATA; 2419 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 2420 } 2421 2422 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2423 bi->flags |= BALANCE_FLAGS_SYSTEM; 2424 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 2425 } 2426 2427 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL); 2428 if (!NT_SUCCESS(Status)) { 2429 ERR("insert_tree_item returned %08x\n", Status); 2430 ExFreePool(bi); 2431 goto end; 2432 } 2433 2434 Status = STATUS_SUCCESS; 2435 2436 end: 2437 if (NT_SUCCESS(Status)) { 2438 Status = do_write(Vcb, NULL); 2439 if (!NT_SUCCESS(Status)) 2440 ERR("do_write returned %08x\n", Status); 2441 } 2442 2443 free_trees(Vcb); 2444 2445 ExReleaseResourceLite(&Vcb->tree_lock); 2446 2447 return Status; 2448 } 2449 2450 static NTSTATUS remove_balance_item(device_extension* Vcb) { 2451 KEY searchkey; 2452 traverse_ptr tp; 2453 NTSTATUS Status; 2454 2455 searchkey.obj_id = BALANCE_ITEM_ID; 2456 searchkey.obj_type = TYPE_TEMP_ITEM; 2457 searchkey.offset = 0; 2458 2459 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2460 2461 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2462 if (!NT_SUCCESS(Status)) { 2463 ERR("find_item returned %08x\n", Status); 2464 goto end; 2465 } 2466 2467 if (!keycmp(tp.item->key, searchkey)) { 2468 Status = delete_tree_item(Vcb, &tp); 2469 if (!NT_SUCCESS(Status)) { 2470 ERR("delete_tree_item returned %08x\n", Status); 2471 goto end; 2472 } 2473 2474 Status = do_write(Vcb, NULL); 2475 if (!NT_SUCCESS(Status)) { 2476 ERR("do_write returned %08x\n", Status); 2477 goto end; 2478 } 2479 2480 free_trees(Vcb); 2481 } 2482 2483 Status = STATUS_SUCCESS; 2484 2485 end: 2486 ExReleaseResourceLite(&Vcb->tree_lock); 2487 2488 return Status; 2489 } 2490 2491 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2492 opts->flags = BTRFS_BALANCE_OPTS_ENABLED; 2493 2494 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) { 2495 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES; 2496 opts->profiles = args->profiles; 2497 } 2498 2499 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) { 2500 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2501 2502 opts->usage_start = 0; 2503 opts->usage_end = (uint8_t)args->usage; 2504 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) { 2505 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2506 2507 opts->usage_start = (uint8_t)args->usage_start; 2508 opts->usage_end = (uint8_t)args->usage_end; 2509 } 2510 2511 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) { 2512 opts->flags |= BTRFS_BALANCE_OPTS_DEVID; 2513 opts->devid = args->devid; 2514 } 2515 2516 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) { 2517 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE; 2518 opts->drange_start = args->drange_start; 2519 opts->drange_end = args->drange_end; 2520 } 2521 2522 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) { 2523 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE; 2524 opts->vrange_start = args->vrange_start; 2525 opts->vrange_end = args->vrange_end; 2526 } 2527 2528 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) { 2529 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2530 2531 opts->limit_start = 0; 2532 opts->limit_end = args->limit; 2533 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) { 2534 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2535 2536 opts->limit_start = args->limit_start; 2537 opts->limit_end = args->limit_end; 2538 } 2539 2540 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) { 2541 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES; 2542 2543 opts->stripes_start = (uint16_t)args->stripes_start; 2544 opts->stripes_end = (uint16_t)args->stripes_end; 2545 } 2546 2547 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) { 2548 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT; 2549 opts->convert = args->convert; 2550 2551 if (args->flags & BALANCE_ARGS_FLAGS_SOFT) 2552 opts->flags |= BTRFS_BALANCE_OPTS_SOFT; 2553 } 2554 } 2555 2556 static NTSTATUS remove_superblocks(device* dev) { 2557 NTSTATUS Status; 2558 superblock* sb; 2559 int i = 0; 2560 2561 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG); 2562 if (!sb) { 2563 ERR("out of memory\n"); 2564 return STATUS_INSUFFICIENT_RESOURCES; 2565 } 2566 2567 RtlZeroMemory(sb, sizeof(superblock)); 2568 2569 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) { 2570 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock)); 2571 2572 if (!NT_SUCCESS(Status)) { 2573 ExFreePool(sb); 2574 return Status; 2575 } 2576 2577 i++; 2578 } 2579 2580 ExFreePool(sb); 2581 2582 return STATUS_SUCCESS; 2583 } 2584 2585 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2586 KEY searchkey; 2587 traverse_ptr tp; 2588 NTSTATUS Status; 2589 LIST_ENTRY* le; 2590 volume_device_extension* vde; 2591 2592 if (Vcb->need_write) { 2593 Status = do_write(Vcb, NULL); 2594 2595 if (!NT_SUCCESS(Status)) 2596 ERR("do_write returned %08x\n", Status); 2597 } else 2598 Status = STATUS_SUCCESS; 2599 2600 free_trees(Vcb); 2601 2602 if (!NT_SUCCESS(Status)) 2603 return Status; 2604 2605 // remove entry in chunk tree 2606 2607 searchkey.obj_id = 1; 2608 searchkey.obj_type = TYPE_DEV_ITEM; 2609 searchkey.offset = dev->devitem.dev_id; 2610 2611 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL); 2612 if (!NT_SUCCESS(Status)) { 2613 ERR("find_item returned %08x\n", Status); 2614 return Status; 2615 } 2616 2617 if (!keycmp(searchkey, tp.item->key)) { 2618 Status = delete_tree_item(Vcb, &tp); 2619 2620 if (!NT_SUCCESS(Status)) { 2621 ERR("delete_tree_item returned %08x\n", Status); 2622 return Status; 2623 } 2624 } 2625 2626 // remove stats entry in device tree 2627 2628 searchkey.obj_id = 0; 2629 searchkey.obj_type = TYPE_DEV_STATS; 2630 searchkey.offset = dev->devitem.dev_id; 2631 2632 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2633 if (!NT_SUCCESS(Status)) { 2634 ERR("find_item returned %08x\n", Status); 2635 return Status; 2636 } 2637 2638 if (!keycmp(searchkey, tp.item->key)) { 2639 Status = delete_tree_item(Vcb, &tp); 2640 2641 if (!NT_SUCCESS(Status)) { 2642 ERR("delete_tree_item returned %08x\n", Status); 2643 return Status; 2644 } 2645 } 2646 2647 // update superblock 2648 2649 Vcb->superblock.num_devices--; 2650 Vcb->superblock.total_bytes -= dev->devitem.num_bytes; 2651 Vcb->devices_loaded--; 2652 2653 RemoveEntryList(&dev->list_entry); 2654 2655 // flush 2656 2657 Status = do_write(Vcb, NULL); 2658 if (!NT_SUCCESS(Status)) 2659 ERR("do_write returned %08x\n", Status); 2660 2661 free_trees(Vcb); 2662 2663 if (!NT_SUCCESS(Status)) 2664 return Status; 2665 2666 if (!dev->readonly && dev->devobj) { 2667 Status = remove_superblocks(dev); 2668 if (!NT_SUCCESS(Status)) 2669 WARN("remove_superblocks returned %08x\n", Status); 2670 } 2671 2672 // remove entry in volume list 2673 2674 vde = Vcb->vde; 2675 2676 if (dev->devobj) { 2677 pdo_device_extension* pdode = vde->pdode; 2678 2679 ExAcquireResourceExclusiveLite(&pdode->child_lock, true); 2680 2681 le = pdode->children.Flink; 2682 while (le != &pdode->children) { 2683 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2684 2685 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) { 2686 PFILE_OBJECT FileObject; 2687 PDEVICE_OBJECT mountmgr; 2688 UNICODE_STRING mmdevpath; 2689 2690 pdode->children_loaded--; 2691 2692 if (vc->had_drive_letter) { // re-add entry to mountmgr 2693 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME); 2694 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr); 2695 if (!NT_SUCCESS(Status)) 2696 ERR("IoGetDeviceObjectPointer returned %08x\n", Status); 2697 else { 2698 MOUNTDEV_NAME mdn; 2699 2700 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL); 2701 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW) 2702 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status); 2703 else { 2704 MOUNTDEV_NAME* mdn2; 2705 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength; 2706 2707 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG); 2708 if (!mdn2) 2709 ERR("out of memory\n"); 2710 else { 2711 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL); 2712 if (!NT_SUCCESS(Status)) 2713 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status); 2714 else { 2715 UNICODE_STRING name; 2716 2717 name.Buffer = mdn2->Name; 2718 name.Length = name.MaximumLength = mdn2->NameLength; 2719 2720 Status = mountmgr_add_drive_letter(mountmgr, &name); 2721 if (!NT_SUCCESS(Status)) 2722 WARN("mountmgr_add_drive_letter returned %08x\n", Status); 2723 } 2724 2725 ExFreePool(mdn2); 2726 } 2727 } 2728 2729 ObDereferenceObject(FileObject); 2730 } 2731 } 2732 2733 ExFreePool(vc->pnp_name.Buffer); 2734 RemoveEntryList(&vc->list_entry); 2735 ExFreePool(vc); 2736 2737 ObDereferenceObject(vc->fileobj); 2738 2739 break; 2740 } 2741 2742 le = le->Flink; 2743 } 2744 2745 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) { 2746 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA; 2747 2748 le = pdode->children.Flink; 2749 while (le != &pdode->children) { 2750 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2751 2752 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) { 2753 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA; 2754 break; 2755 } 2756 2757 le = le->Flink; 2758 } 2759 } 2760 2761 pdode->num_children = Vcb->superblock.num_devices; 2762 2763 ExReleaseResourceLite(&pdode->child_lock); 2764 2765 // free dev 2766 2767 if (dev->trim && !dev->readonly && !Vcb->options.no_trim) 2768 trim_whole_device(dev); 2769 } 2770 2771 while (!IsListEmpty(&dev->space)) { 2772 LIST_ENTRY* le2 = RemoveHeadList(&dev->space); 2773 space* s = CONTAINING_RECORD(le2, space, list_entry); 2774 2775 ExFreePool(s); 2776 } 2777 2778 ExFreePool(dev); 2779 2780 if (Vcb->trim) { 2781 Vcb->trim = false; 2782 2783 le = Vcb->devices.Flink; 2784 while (le != &Vcb->devices) { 2785 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 2786 2787 if (dev2->trim) { 2788 Vcb->trim = true; 2789 break; 2790 } 2791 2792 le = le->Flink; 2793 } 2794 } 2795 2796 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 2797 2798 return STATUS_SUCCESS; 2799 } 2800 2801 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2802 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa; 2803 DEVICE_DATA_SET_RANGE* ranges; 2804 ULONG datalen, i; 2805 KEY searchkey; 2806 traverse_ptr tp; 2807 NTSTATUS Status; 2808 bool b; 2809 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there 2810 LIST_ENTRY* le; 2811 2812 dev->num_trim_entries = 0; 2813 2814 searchkey.obj_id = dev->devitem.dev_id; 2815 searchkey.obj_type = TYPE_DEV_EXTENT; 2816 searchkey.offset = 0; 2817 2818 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2819 if (!NT_SUCCESS(Status)) { 2820 ERR("find_item returned %08x\n", Status); 2821 return; 2822 } 2823 2824 do { 2825 traverse_ptr next_tp; 2826 2827 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) { 2828 if (tp.item->size >= sizeof(DEV_EXTENT)) { 2829 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data; 2830 2831 if (tp.item->key.offset > lastoff) 2832 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff); 2833 2834 lastoff = tp.item->key.offset + de->length; 2835 } else { 2836 ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT)); 2837 return; 2838 } 2839 } 2840 2841 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 2842 2843 if (b) { 2844 tp = next_tp; 2845 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type)) 2846 break; 2847 } 2848 } while (b); 2849 2850 if (lastoff < dev->devitem.num_bytes) 2851 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff); 2852 2853 if (dev->num_trim_entries == 0) 2854 return; 2855 2856 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE)); 2857 2858 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG); 2859 if (!dmdsa) { 2860 ERR("out of memory\n"); 2861 goto end; 2862 } 2863 2864 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES); 2865 dmdsa->Action = DeviceDsmAction_Trim; 2866 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED; 2867 dmdsa->ParameterBlockOffset = 0; 2868 dmdsa->ParameterBlockLength = 0; 2869 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)); 2870 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE); 2871 2872 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset); 2873 2874 i = 0; 2875 le = dev->trim_list.Flink; 2876 while (le != &dev->trim_list) { 2877 space* s = CONTAINING_RECORD(le, space, list_entry); 2878 2879 ranges[i].StartingOffset = s->address; 2880 ranges[i].LengthInBytes = s->size; 2881 i++; 2882 2883 le = le->Flink; 2884 } 2885 2886 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL); 2887 if (!NT_SUCCESS(Status)) 2888 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status); 2889 2890 ExFreePool(dmdsa); 2891 2892 end: 2893 while (!IsListEmpty(&dev->trim_list)) { 2894 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry); 2895 ExFreePool(s); 2896 } 2897 2898 dev->num_trim_entries = 0; 2899 } 2900 2901 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) { 2902 NTSTATUS Status; 2903 bool changed; 2904 LIST_ENTRY* le; 2905 chunk* rc; 2906 2907 // FIXME - allow with metadata chunks? 2908 2909 while (true) { 2910 rc = NULL; 2911 2912 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 2913 2914 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 2915 2916 // choose the least-used chunk we haven't looked at yet 2917 le = Vcb->chunks.Flink; 2918 while (le != &Vcb->chunks) { 2919 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 2920 2921 // FIXME - skip full-size chunks over e.g. 90% full? 2922 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used)) 2923 rc = c; 2924 2925 le = le->Flink; 2926 } 2927 2928 ExReleaseResourceLite(&Vcb->chunk_lock); 2929 2930 if (!rc) { 2931 ExReleaseResourceLite(&Vcb->tree_lock); 2932 break; 2933 } 2934 2935 if (rc->list_entry_balance.Flink) { 2936 RemoveEntryList(&rc->list_entry_balance); 2937 Vcb->balance.chunks_left--; 2938 } 2939 2940 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped 2941 rc->reloc = true; 2942 2943 ExReleaseResourceLite(&Vcb->tree_lock); 2944 2945 do { 2946 changed = false; 2947 2948 Status = balance_data_chunk(Vcb, rc, &changed); 2949 if (!NT_SUCCESS(Status)) { 2950 ERR("balance_data_chunk returned %08x\n", Status); 2951 Vcb->balance.status = Status; 2952 rc->list_entry_balance.Flink = NULL; 2953 rc->reloc = false; 2954 return Status; 2955 } 2956 2957 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 2958 2959 if (Vcb->readonly) 2960 Vcb->balance.stopping = true; 2961 2962 if (Vcb->balance.stopping) 2963 return STATUS_SUCCESS; 2964 } while (changed); 2965 2966 rc->list_entry_balance.Flink = NULL; 2967 2968 rc->changed = true; 2969 rc->space_changed = true; 2970 rc->balance_num = Vcb->balance.balance_num; 2971 2972 Status = do_write(Vcb, NULL); 2973 if (!NT_SUCCESS(Status)) { 2974 ERR("do_write returned %08x\n", Status); 2975 return Status; 2976 } 2977 2978 free_trees(Vcb); 2979 } 2980 2981 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 2982 2983 Status = alloc_chunk(Vcb, flags, &rc, true); 2984 2985 ExReleaseResourceLite(&Vcb->chunk_lock); 2986 2987 if (NT_SUCCESS(Status)) { 2988 *newchunk = rc; 2989 return Status; 2990 } else { 2991 ERR("alloc_chunk returned %08x\n", Status); 2992 return Status; 2993 } 2994 } 2995 2996 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) { 2997 LIST_ENTRY* le; 2998 2999 while (!IsListEmpty(&dev->space)) { 3000 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry); 3001 3002 ExFreePool(s); 3003 } 3004 3005 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device. 3006 3007 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL); 3008 3009 le = Vcb->chunks.Flink; 3010 while (le != &Vcb->chunks) { 3011 uint16_t n; 3012 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3013 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 3014 3015 for (n = 0; n < c->chunk_item->num_stripes; n++) { 3016 uint64_t stripe_size = 0; 3017 3018 if (cis[n].dev_id == dev->devitem.dev_id) { 3019 if (stripe_size == 0) { 3020 uint16_t factor; 3021 3022 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 3023 factor = c->chunk_item->num_stripes; 3024 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 3025 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 3026 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 3027 factor = c->chunk_item->num_stripes - 1; 3028 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 3029 factor = c->chunk_item->num_stripes - 2; 3030 else // SINGLE, DUP, RAID1 3031 factor = 1; 3032 3033 stripe_size = c->chunk_item->size / factor; 3034 } 3035 3036 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL); 3037 } 3038 } 3039 3040 le = le->Flink; 3041 } 3042 3043 return STATUS_SUCCESS; 3044 } 3045 3046 _Function_class_(KSTART_ROUTINE) 3047 void __stdcall balance_thread(void* context) { 3048 device_extension* Vcb = (device_extension*)context; 3049 LIST_ENTRY chunks; 3050 LIST_ENTRY* le; 3051 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0; 3052 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0; 3053 NTSTATUS Status; 3054 3055 Vcb->balance.balance_num++; 3056 3057 Vcb->balance.stopping = false; 3058 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false); 3059 3060 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3061 old_data_flags = Vcb->data_flags; 3062 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert); 3063 3064 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3065 } 3066 3067 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3068 old_metadata_flags = Vcb->metadata_flags; 3069 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert); 3070 } 3071 3072 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3073 old_system_flags = Vcb->system_flags; 3074 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert); 3075 } 3076 3077 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) { 3078 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3079 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3080 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3081 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3082 } 3083 3084 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0; 3085 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0; 3086 3087 InitializeListHead(&chunks); 3088 3089 // FIXME - what are we supposed to do with limit_start? 3090 3091 if (!Vcb->readonly) { 3092 if (!Vcb->balance.removing && !Vcb->balance.shrinking) { 3093 Status = add_balance_item(Vcb); 3094 if (!NT_SUCCESS(Status)) { 3095 ERR("add_balance_item returned %08x\n", Status); 3096 Vcb->balance.status = Status; 3097 goto end; 3098 } 3099 } else { 3100 if (Vcb->need_write) { 3101 Status = do_write(Vcb, NULL); 3102 3103 free_trees(Vcb); 3104 3105 if (!NT_SUCCESS(Status)) { 3106 ERR("do_write returned %08x\n", Status); 3107 Vcb->balance.status = Status; 3108 goto end; 3109 } 3110 } 3111 } 3112 } 3113 3114 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3115 3116 if (Vcb->balance.stopping) 3117 goto end; 3118 3119 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3120 3121 le = Vcb->chunks.Flink; 3122 while (le != &Vcb->chunks) { 3123 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3124 uint8_t sort; 3125 3126 acquire_chunk_lock(c, Vcb); 3127 3128 if (c->chunk_item->type & BLOCK_FLAG_DATA) 3129 sort = BALANCE_OPTS_DATA; 3130 else if (c->chunk_item->type & BLOCK_FLAG_METADATA) 3131 sort = BALANCE_OPTS_METADATA; 3132 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM) 3133 sort = BALANCE_OPTS_SYSTEM; 3134 else { 3135 ERR("unexpected chunk type %I64x\n", c->chunk_item->type); 3136 release_chunk_lock(c, Vcb); 3137 break; 3138 } 3139 3140 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) && 3141 should_balance_chunk(Vcb, sort, c)) { 3142 InsertTailList(&chunks, &c->list_entry_balance); 3143 3144 num_chunks[sort]++; 3145 Vcb->balance.total_chunks++; 3146 Vcb->balance.chunks_left++; 3147 } else if (sort == BALANCE_OPTS_METADATA) 3148 okay_metadata_chunks++; 3149 else if (sort == BALANCE_OPTS_DATA) 3150 okay_data_chunks++; 3151 else if (sort == BALANCE_OPTS_SYSTEM) 3152 okay_system_chunks++; 3153 3154 if (!c->cache_loaded) { 3155 Status = load_cache_chunk(Vcb, c, NULL); 3156 3157 if (!NT_SUCCESS(Status)) { 3158 ERR("load_cache_chunk returned %08x\n", Status); 3159 Vcb->balance.status = Status; 3160 release_chunk_lock(c, Vcb); 3161 ExReleaseResourceLite(&Vcb->chunk_lock); 3162 goto end; 3163 } 3164 } 3165 3166 release_chunk_lock(c, Vcb); 3167 3168 le = le->Flink; 3169 } 3170 3171 ExReleaseResourceLite(&Vcb->chunk_lock); 3172 3173 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up 3174 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) { 3175 bool consolidated = false; 3176 chunk* c; 3177 3178 if (okay_metadata_chunks == 0) { 3179 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3180 3181 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true); 3182 if (NT_SUCCESS(Status)) 3183 c->balance_num = Vcb->balance.balance_num; 3184 else if (Status != STATUS_DISK_FULL || consolidated) { 3185 ERR("alloc_chunk returned %08x\n", Status); 3186 ExReleaseResourceLite(&Vcb->chunk_lock); 3187 Vcb->balance.status = Status; 3188 goto end; 3189 } 3190 3191 ExReleaseResourceLite(&Vcb->chunk_lock); 3192 3193 if (Status == STATUS_DISK_FULL) { 3194 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c); 3195 if (!NT_SUCCESS(Status)) { 3196 ERR("try_consolidation returned %08x\n", Status); 3197 Vcb->balance.status = Status; 3198 goto end; 3199 } else 3200 c->balance_num = Vcb->balance.balance_num; 3201 3202 consolidated = true; 3203 3204 if (Vcb->balance.stopping) 3205 goto end; 3206 } 3207 } 3208 3209 if (okay_data_chunks == 0) { 3210 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3211 3212 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true); 3213 if (NT_SUCCESS(Status)) 3214 c->balance_num = Vcb->balance.balance_num; 3215 else if (Status != STATUS_DISK_FULL || consolidated) { 3216 ERR("alloc_chunk returned %08x\n", Status); 3217 ExReleaseResourceLite(&Vcb->chunk_lock); 3218 Vcb->balance.status = Status; 3219 goto end; 3220 } 3221 3222 ExReleaseResourceLite(&Vcb->chunk_lock); 3223 3224 if (Status == STATUS_DISK_FULL) { 3225 Status = try_consolidation(Vcb, Vcb->data_flags, &c); 3226 if (!NT_SUCCESS(Status)) { 3227 ERR("try_consolidation returned %08x\n", Status); 3228 Vcb->balance.status = Status; 3229 goto end; 3230 } else 3231 c->balance_num = Vcb->balance.balance_num; 3232 3233 consolidated = true; 3234 3235 if (Vcb->balance.stopping) 3236 goto end; 3237 } 3238 } 3239 3240 if (okay_system_chunks == 0) { 3241 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3242 3243 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true); 3244 if (NT_SUCCESS(Status)) 3245 c->balance_num = Vcb->balance.balance_num; 3246 else if (Status != STATUS_DISK_FULL || consolidated) { 3247 ERR("alloc_chunk returned %08x\n", Status); 3248 ExReleaseResourceLite(&Vcb->chunk_lock); 3249 Vcb->balance.status = Status; 3250 goto end; 3251 } 3252 3253 ExReleaseResourceLite(&Vcb->chunk_lock); 3254 3255 if (Status == STATUS_DISK_FULL) { 3256 Status = try_consolidation(Vcb, Vcb->system_flags, &c); 3257 if (!NT_SUCCESS(Status)) { 3258 ERR("try_consolidation returned %08x\n", Status); 3259 Vcb->balance.status = Status; 3260 goto end; 3261 } else 3262 c->balance_num = Vcb->balance.balance_num; 3263 3264 consolidated = true; 3265 3266 if (Vcb->balance.stopping) 3267 goto end; 3268 } 3269 } 3270 } 3271 3272 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3273 3274 le = chunks.Flink; 3275 while (le != &chunks) { 3276 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3277 3278 c->reloc = true; 3279 3280 le = le->Flink; 3281 } 3282 3283 ExReleaseResourceLite(&Vcb->chunk_lock); 3284 3285 // do data chunks before metadata 3286 le = chunks.Flink; 3287 while (le != &chunks) { 3288 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3289 LIST_ENTRY* le2 = le->Flink; 3290 3291 if (c->chunk_item->type & BLOCK_FLAG_DATA) { 3292 bool changed; 3293 3294 do { 3295 changed = false; 3296 3297 Status = balance_data_chunk(Vcb, c, &changed); 3298 if (!NT_SUCCESS(Status)) { 3299 ERR("balance_data_chunk returned %08x\n", Status); 3300 Vcb->balance.status = Status; 3301 goto end; 3302 } 3303 3304 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3305 3306 if (Vcb->readonly) 3307 Vcb->balance.stopping = true; 3308 3309 if (Vcb->balance.stopping) 3310 break; 3311 } while (changed); 3312 3313 c->changed = true; 3314 c->space_changed = true; 3315 } 3316 3317 if (Vcb->balance.stopping) 3318 goto end; 3319 3320 if (c->chunk_item->type & BLOCK_FLAG_DATA && 3321 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) { 3322 RemoveEntryList(&c->list_entry_balance); 3323 c->list_entry_balance.Flink = NULL; 3324 3325 Vcb->balance.chunks_left--; 3326 } 3327 3328 le = le2; 3329 } 3330 3331 // do metadata chunks 3332 while (!IsListEmpty(&chunks)) { 3333 chunk* c; 3334 bool changed; 3335 3336 le = RemoveHeadList(&chunks); 3337 c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3338 3339 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) { 3340 do { 3341 Status = balance_metadata_chunk(Vcb, c, &changed); 3342 if (!NT_SUCCESS(Status)) { 3343 ERR("balance_metadata_chunk returned %08x\n", Status); 3344 Vcb->balance.status = Status; 3345 goto end; 3346 } 3347 3348 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3349 3350 if (Vcb->readonly) 3351 Vcb->balance.stopping = true; 3352 3353 if (Vcb->balance.stopping) 3354 break; 3355 } while (changed); 3356 3357 c->changed = true; 3358 c->space_changed = true; 3359 } 3360 3361 if (Vcb->balance.stopping) 3362 break; 3363 3364 c->list_entry_balance.Flink = NULL; 3365 3366 Vcb->balance.chunks_left--; 3367 } 3368 3369 end: 3370 if (!Vcb->readonly) { 3371 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3372 le = chunks.Flink; 3373 while (le != &chunks) { 3374 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3375 c->reloc = false; 3376 3377 le = le->Flink; 3378 c->list_entry_balance.Flink = NULL; 3379 } 3380 3381 if (old_data_flags != 0) 3382 Vcb->data_flags = old_data_flags; 3383 3384 if (old_metadata_flags != 0) 3385 Vcb->metadata_flags = old_metadata_flags; 3386 3387 if (old_system_flags != 0) 3388 Vcb->system_flags = old_system_flags; 3389 } 3390 3391 if (Vcb->balance.removing) { 3392 device* dev = NULL; 3393 3394 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3395 3396 le = Vcb->devices.Flink; 3397 while (le != &Vcb->devices) { 3398 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3399 3400 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3401 dev = dev2; 3402 break; 3403 } 3404 3405 le = le->Flink; 3406 } 3407 3408 if (dev) { 3409 if (Vcb->balance.chunks_left == 0) { 3410 Status = finish_removing_device(Vcb, dev); 3411 3412 if (!NT_SUCCESS(Status)) { 3413 ERR("finish_removing_device returned %08x\n", Status); 3414 dev->reloc = false; 3415 } 3416 } else 3417 dev->reloc = false; 3418 } 3419 3420 ExReleaseResourceLite(&Vcb->tree_lock); 3421 } else if (Vcb->balance.shrinking) { 3422 device* dev = NULL; 3423 3424 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3425 3426 le = Vcb->devices.Flink; 3427 while (le != &Vcb->devices) { 3428 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3429 3430 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3431 dev = dev2; 3432 break; 3433 } 3434 3435 le = le->Flink; 3436 } 3437 3438 if (!dev) { 3439 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid); 3440 Vcb->balance.status = STATUS_INTERNAL_ERROR; 3441 } 3442 3443 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3444 if (dev) { 3445 Status = regenerate_space_list(Vcb, dev); 3446 if (!NT_SUCCESS(Status)) 3447 WARN("regenerate_space_list returned %08x\n", Status); 3448 } 3449 } else { 3450 uint64_t old_size; 3451 3452 old_size = dev->devitem.num_bytes; 3453 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start; 3454 3455 Status = update_dev_item(Vcb, dev, NULL); 3456 if (!NT_SUCCESS(Status)) { 3457 ERR("update_dev_item returned %08x\n", Status); 3458 dev->devitem.num_bytes = old_size; 3459 Vcb->balance.status = Status; 3460 3461 Status = regenerate_space_list(Vcb, dev); 3462 if (!NT_SUCCESS(Status)) 3463 WARN("regenerate_space_list returned %08x\n", Status); 3464 } else { 3465 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes; 3466 3467 Status = do_write(Vcb, NULL); 3468 if (!NT_SUCCESS(Status)) 3469 ERR("do_write returned %08x\n", Status); 3470 3471 free_trees(Vcb); 3472 } 3473 } 3474 3475 ExReleaseResourceLite(&Vcb->tree_lock); 3476 3477 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status)) 3478 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3479 } else { 3480 Status = remove_balance_item(Vcb); 3481 if (!NT_SUCCESS(Status)) { 3482 ERR("remove_balance_item returned %08x\n", Status); 3483 goto end; 3484 } 3485 } 3486 3487 if (Vcb->trim && !Vcb->options.no_trim) { 3488 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3489 3490 le = Vcb->devices.Flink; 3491 while (le != &Vcb->devices) { 3492 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3493 3494 if (dev2->devobj && !dev2->readonly && dev2->trim) 3495 trim_unalloc_space(Vcb, dev2); 3496 3497 le = le->Flink; 3498 } 3499 3500 ExReleaseResourceLite(&Vcb->tree_lock); 3501 } 3502 } 3503 3504 ZwClose(Vcb->balance.thread); 3505 Vcb->balance.thread = NULL; 3506 3507 KeSetEvent(&Vcb->balance.finished, 0, false); 3508 } 3509 3510 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3511 NTSTATUS Status; 3512 btrfs_start_balance* bsb = (btrfs_start_balance*)data; 3513 OBJECT_ATTRIBUTES oa; 3514 uint8_t i; 3515 3516 if (length < sizeof(btrfs_start_balance) || !data) 3517 return STATUS_INVALID_PARAMETER; 3518 3519 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3520 return STATUS_PRIVILEGE_NOT_HELD; 3521 3522 if (Vcb->locked) { 3523 WARN("cannot start balance while locked\n"); 3524 return STATUS_DEVICE_NOT_READY; 3525 } 3526 3527 if (Vcb->scrub.thread) { 3528 WARN("cannot start balance while scrub running\n"); 3529 return STATUS_DEVICE_NOT_READY; 3530 } 3531 3532 if (Vcb->balance.thread) { 3533 WARN("balance already running\n"); 3534 return STATUS_DEVICE_NOT_READY; 3535 } 3536 3537 if (Vcb->readonly) 3538 return STATUS_MEDIA_WRITE_PROTECTED; 3539 3540 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3541 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3542 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED)) 3543 return STATUS_SUCCESS; 3544 3545 for (i = 0; i < 3; i++) { 3546 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3547 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) { 3548 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 | 3549 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE; 3550 3551 if (bsb->opts[i].profiles == 0) 3552 return STATUS_INVALID_PARAMETER; 3553 } 3554 3555 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) { 3556 if (bsb->opts[i].devid == 0) 3557 return STATUS_INVALID_PARAMETER; 3558 } 3559 3560 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) { 3561 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end) 3562 return STATUS_INVALID_PARAMETER; 3563 } 3564 3565 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) { 3566 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end) 3567 return STATUS_INVALID_PARAMETER; 3568 } 3569 3570 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) { 3571 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start); 3572 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end); 3573 3574 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end) 3575 return STATUS_INVALID_PARAMETER; 3576 } 3577 3578 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) { 3579 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start); 3580 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end); 3581 3582 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3583 return STATUS_INVALID_PARAMETER; 3584 } 3585 3586 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) { 3587 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start); 3588 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end); 3589 3590 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3591 return STATUS_INVALID_PARAMETER; 3592 } 3593 3594 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3595 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 && 3596 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 && 3597 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 && 3598 bsb->opts[i].convert != BLOCK_FLAG_SINGLE) 3599 return STATUS_INVALID_PARAMETER; 3600 } 3601 } 3602 } 3603 3604 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3605 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3606 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3607 3608 Vcb->balance.paused = false; 3609 Vcb->balance.removing = false; 3610 Vcb->balance.shrinking = false; 3611 Vcb->balance.status = STATUS_SUCCESS; 3612 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3613 3614 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3615 3616 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3617 if (!NT_SUCCESS(Status)) { 3618 ERR("PsCreateSystemThread returned %08x\n", Status); 3619 return Status; 3620 } 3621 3622 return STATUS_SUCCESS; 3623 } 3624 3625 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) { 3626 KEY searchkey; 3627 traverse_ptr tp; 3628 NTSTATUS Status; 3629 BALANCE_ITEM* bi; 3630 OBJECT_ATTRIBUTES oa; 3631 int i; 3632 3633 searchkey.obj_id = BALANCE_ITEM_ID; 3634 searchkey.obj_type = TYPE_TEMP_ITEM; 3635 searchkey.offset = 0; 3636 3637 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 3638 if (!NT_SUCCESS(Status)) { 3639 ERR("find_item returned %08x\n", Status); 3640 return Status; 3641 } 3642 3643 if (keycmp(tp.item->key, searchkey)) { 3644 TRACE("no balance item found\n"); 3645 return STATUS_NOT_FOUND; 3646 } 3647 3648 if (tp.item->size < sizeof(BALANCE_ITEM)) { 3649 WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, 3650 tp.item->size, sizeof(BALANCE_ITEM)); 3651 return STATUS_INTERNAL_ERROR; 3652 } 3653 3654 bi = (BALANCE_ITEM*)tp.item->data; 3655 3656 if (bi->flags & BALANCE_FLAGS_DATA) 3657 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 3658 3659 if (bi->flags & BALANCE_FLAGS_METADATA) 3660 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 3661 3662 if (bi->flags & BALANCE_FLAGS_SYSTEM) 3663 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 3664 3665 // do the heuristics that Linux driver does 3666 3667 for (i = 0; i < 3; i++) { 3668 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3669 // if converting, don't redo chunks already done 3670 3671 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3672 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT; 3673 3674 // don't balance chunks more than 90% filled - presumably these 3675 // have already been done 3676 3677 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) && 3678 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3679 ) { 3680 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE; 3681 Vcb->balance.opts[i].usage_start = 0; 3682 Vcb->balance.opts[i].usage_end = 90; 3683 } 3684 } 3685 } 3686 3687 if (Vcb->readonly || Vcb->options.skip_balance) 3688 Vcb->balance.paused = true; 3689 else 3690 Vcb->balance.paused = false; 3691 3692 Vcb->balance.removing = false; 3693 Vcb->balance.shrinking = false; 3694 Vcb->balance.status = STATUS_SUCCESS; 3695 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3696 3697 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3698 3699 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3700 if (!NT_SUCCESS(Status)) { 3701 ERR("PsCreateSystemThread returned %08x\n", Status); 3702 return Status; 3703 } 3704 3705 return STATUS_SUCCESS; 3706 } 3707 3708 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) { 3709 btrfs_query_balance* bqb = (btrfs_query_balance*)data; 3710 3711 if (length < sizeof(btrfs_query_balance) || !data) 3712 return STATUS_INVALID_PARAMETER; 3713 3714 if (!Vcb->balance.thread) { 3715 bqb->status = BTRFS_BALANCE_STOPPED; 3716 3717 if (!NT_SUCCESS(Vcb->balance.status)) { 3718 bqb->status |= BTRFS_BALANCE_ERROR; 3719 bqb->error = Vcb->balance.status; 3720 } 3721 3722 return STATUS_SUCCESS; 3723 } 3724 3725 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING; 3726 3727 if (Vcb->balance.removing) 3728 bqb->status |= BTRFS_BALANCE_REMOVAL; 3729 3730 if (Vcb->balance.shrinking) 3731 bqb->status |= BTRFS_BALANCE_SHRINKING; 3732 3733 if (!NT_SUCCESS(Vcb->balance.status)) 3734 bqb->status |= BTRFS_BALANCE_ERROR; 3735 3736 bqb->chunks_left = Vcb->balance.chunks_left; 3737 bqb->total_chunks = Vcb->balance.total_chunks; 3738 bqb->error = Vcb->balance.status; 3739 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3740 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3741 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3742 3743 return STATUS_SUCCESS; 3744 } 3745 3746 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3747 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3748 return STATUS_PRIVILEGE_NOT_HELD; 3749 3750 if (!Vcb->balance.thread) 3751 return STATUS_DEVICE_NOT_READY; 3752 3753 if (Vcb->balance.paused) 3754 return STATUS_DEVICE_NOT_READY; 3755 3756 Vcb->balance.paused = true; 3757 KeClearEvent(&Vcb->balance.event); 3758 3759 return STATUS_SUCCESS; 3760 } 3761 3762 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3763 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3764 return STATUS_PRIVILEGE_NOT_HELD; 3765 3766 if (!Vcb->balance.thread) 3767 return STATUS_DEVICE_NOT_READY; 3768 3769 if (!Vcb->balance.paused) 3770 return STATUS_DEVICE_NOT_READY; 3771 3772 if (Vcb->readonly) 3773 return STATUS_MEDIA_WRITE_PROTECTED; 3774 3775 Vcb->balance.paused = false; 3776 KeSetEvent(&Vcb->balance.event, 0, false); 3777 3778 return STATUS_SUCCESS; 3779 } 3780 3781 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3782 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3783 return STATUS_PRIVILEGE_NOT_HELD; 3784 3785 if (!Vcb->balance.thread) 3786 return STATUS_DEVICE_NOT_READY; 3787 3788 Vcb->balance.paused = false; 3789 Vcb->balance.stopping = true; 3790 Vcb->balance.status = STATUS_SUCCESS; 3791 KeSetEvent(&Vcb->balance.event, 0, false); 3792 3793 return STATUS_SUCCESS; 3794 } 3795 3796 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3797 uint64_t devid; 3798 LIST_ENTRY* le; 3799 device* dev = NULL; 3800 NTSTATUS Status; 3801 int i; 3802 uint64_t num_rw_devices; 3803 OBJECT_ATTRIBUTES oa; 3804 3805 TRACE("(%p, %p, %x)\n", Vcb, data, length); 3806 3807 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3808 return STATUS_PRIVILEGE_NOT_HELD; 3809 3810 if (length < sizeof(uint64_t)) 3811 return STATUS_INVALID_PARAMETER; 3812 3813 devid = *(uint64_t*)data; 3814 3815 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 3816 3817 if (Vcb->readonly) { 3818 ExReleaseResourceLite(&Vcb->tree_lock); 3819 return STATUS_MEDIA_WRITE_PROTECTED; 3820 } 3821 3822 num_rw_devices = 0; 3823 3824 le = Vcb->devices.Flink; 3825 while (le != &Vcb->devices) { 3826 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3827 3828 if (dev2->devitem.dev_id == devid) 3829 dev = dev2; 3830 3831 if (!dev2->readonly) 3832 num_rw_devices++; 3833 3834 le = le->Flink; 3835 } 3836 3837 if (!dev) { 3838 ExReleaseResourceLite(&Vcb->tree_lock); 3839 WARN("device %I64x not found\n", devid); 3840 return STATUS_NOT_FOUND; 3841 } 3842 3843 if (!dev->readonly) { 3844 if (num_rw_devices == 1) { 3845 ExReleaseResourceLite(&Vcb->tree_lock); 3846 WARN("not removing last non-readonly device\n"); 3847 return STATUS_INVALID_PARAMETER; 3848 } 3849 3850 if (num_rw_devices == 4 && 3851 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) || 3852 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6)) 3853 ) { 3854 ExReleaseResourceLite(&Vcb->tree_lock); 3855 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n"); 3856 return STATUS_CANNOT_DELETE; 3857 } 3858 3859 if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) { 3860 ExReleaseResourceLite(&Vcb->tree_lock); 3861 ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n"); 3862 return STATUS_CANNOT_DELETE; 3863 } 3864 3865 if (num_rw_devices == 2 && 3866 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) || 3867 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1)) 3868 ) { 3869 ExReleaseResourceLite(&Vcb->tree_lock); 3870 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n"); 3871 return STATUS_CANNOT_DELETE; 3872 } 3873 } 3874 3875 ExReleaseResourceLite(&Vcb->tree_lock); 3876 3877 if (Vcb->balance.thread) { 3878 WARN("balance already running\n"); 3879 return STATUS_DEVICE_NOT_READY; 3880 } 3881 3882 dev->reloc = true; 3883 3884 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3); 3885 3886 for (i = 0; i < 3; i++) { 3887 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID; 3888 Vcb->balance.opts[i].devid = devid; 3889 } 3890 3891 Vcb->balance.paused = false; 3892 Vcb->balance.removing = true; 3893 Vcb->balance.shrinking = false; 3894 Vcb->balance.status = STATUS_SUCCESS; 3895 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3896 3897 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3898 3899 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3900 if (!NT_SUCCESS(Status)) { 3901 ERR("PsCreateSystemThread returned %08x\n", Status); 3902 dev->reloc = false; 3903 return Status; 3904 } 3905 3906 return STATUS_SUCCESS; 3907 } 3908