1 /* Copyright (c) Mark Harmstone 2016-17 2 * 3 * This file is part of WinBtrfs. 4 * 5 * WinBtrfs is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public Licence as published by 7 * the Free Software Foundation, either version 3 of the Licence, or 8 * (at your option) any later version. 9 * 10 * WinBtrfs is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public Licence for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public Licence 16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include "btrfs_drv.h" 19 #include "btrfsioctl.h" 20 #include "crc32c.h" 21 #include <ntddstor.h> 22 23 typedef struct { 24 uint64_t address; 25 uint64_t new_address; 26 tree_header* data; 27 EXTENT_ITEM* ei; 28 tree* t; 29 bool system; 30 LIST_ENTRY refs; 31 LIST_ENTRY list_entry; 32 } metadata_reloc; 33 34 typedef struct { 35 uint8_t type; 36 uint64_t hash; 37 38 union { 39 TREE_BLOCK_REF tbr; 40 SHARED_BLOCK_REF sbr; 41 }; 42 43 metadata_reloc* parent; 44 bool top; 45 LIST_ENTRY list_entry; 46 } metadata_reloc_ref; 47 48 typedef struct { 49 uint64_t address; 50 uint64_t size; 51 uint64_t new_address; 52 chunk* newchunk; 53 EXTENT_ITEM* ei; 54 LIST_ENTRY refs; 55 LIST_ENTRY list_entry; 56 } data_reloc; 57 58 typedef struct { 59 uint8_t type; 60 uint64_t hash; 61 62 union { 63 EXTENT_DATA_REF edr; 64 SHARED_DATA_REF sdr; 65 }; 66 67 metadata_reloc* parent; 68 LIST_ENTRY list_entry; 69 } data_reloc_ref; 70 71 #ifndef _MSC_VER // not in mingw yet 72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000 73 #endif 74 75 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time 76 77 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp, 78 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) { 79 NTSTATUS Status; 80 metadata_reloc* mr; 81 EXTENT_ITEM* ei; 82 uint16_t len; 83 uint64_t inline_rc; 84 uint8_t* ptr; 85 86 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG); 87 if (!mr) { 88 ERR("out of memory\n"); 89 return STATUS_INSUFFICIENT_RESOURCES; 90 } 91 92 mr->address = tp->item->key.obj_id; 93 mr->data = NULL; 94 mr->ei = (EXTENT_ITEM*)tp->item->data; 95 mr->system = false; 96 InitializeListHead(&mr->refs); 97 98 Status = delete_tree_item(Vcb, tp); 99 if (!NT_SUCCESS(Status)) { 100 ERR("delete_tree_item returned %08lx\n", Status); 101 ExFreePool(mr); 102 return Status; 103 } 104 105 if (!c) 106 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 107 108 if (c) { 109 acquire_chunk_lock(c, Vcb); 110 111 c->used -= Vcb->superblock.node_size; 112 113 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback); 114 115 release_chunk_lock(c, Vcb); 116 } 117 118 ei = (EXTENT_ITEM*)tp->item->data; 119 inline_rc = 0; 120 121 len = tp->item->size - sizeof(EXTENT_ITEM); 122 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 123 if (!skinny) { 124 len -= sizeof(EXTENT_ITEM2); 125 ptr += sizeof(EXTENT_ITEM2); 126 } 127 128 while (len > 0) { 129 uint8_t secttype = *ptr; 130 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0); 131 metadata_reloc_ref* ref; 132 133 len--; 134 135 if (sectlen > len) { 136 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 137 return STATUS_INTERNAL_ERROR; 138 } 139 140 if (sectlen == 0) { 141 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 142 return STATUS_INTERNAL_ERROR; 143 } 144 145 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 146 if (!ref) { 147 ERR("out of memory\n"); 148 return STATUS_INSUFFICIENT_RESOURCES; 149 } 150 151 if (secttype == TYPE_TREE_BLOCK_REF) { 152 ref->type = TYPE_TREE_BLOCK_REF; 153 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF)); 154 inline_rc++; 155 } else if (secttype == TYPE_SHARED_BLOCK_REF) { 156 ref->type = TYPE_SHARED_BLOCK_REF; 157 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF)); 158 inline_rc++; 159 } else { 160 ERR("unexpected tree type %x\n", secttype); 161 ExFreePool(ref); 162 return STATUS_INTERNAL_ERROR; 163 } 164 165 ref->parent = NULL; 166 ref->top = false; 167 InsertTailList(&mr->refs, &ref->list_entry); 168 169 len -= sectlen; 170 ptr += sizeof(uint8_t) + sectlen; 171 } 172 173 if (inline_rc < ei->refcount) { // look for non-inline entries 174 traverse_ptr tp2 = *tp, next_tp; 175 176 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 177 tp2 = next_tp; 178 179 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 180 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) { 181 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 182 if (!ref) { 183 ERR("out of memory\n"); 184 return STATUS_INSUFFICIENT_RESOURCES; 185 } 186 187 ref->type = TYPE_TREE_BLOCK_REF; 188 ref->tbr.offset = tp2.item->key.offset; 189 ref->parent = NULL; 190 ref->top = false; 191 InsertTailList(&mr->refs, &ref->list_entry); 192 193 Status = delete_tree_item(Vcb, &tp2); 194 if (!NT_SUCCESS(Status)) { 195 ERR("delete_tree_item returned %08lx\n", Status); 196 return Status; 197 } 198 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) { 199 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 200 if (!ref) { 201 ERR("out of memory\n"); 202 return STATUS_INSUFFICIENT_RESOURCES; 203 } 204 205 ref->type = TYPE_SHARED_BLOCK_REF; 206 ref->sbr.offset = tp2.item->key.offset; 207 ref->parent = NULL; 208 ref->top = false; 209 InsertTailList(&mr->refs, &ref->list_entry); 210 211 Status = delete_tree_item(Vcb, &tp2); 212 if (!NT_SUCCESS(Status)) { 213 ERR("delete_tree_item returned %08lx\n", Status); 214 return Status; 215 } 216 } 217 } else 218 break; 219 } 220 } 221 222 InsertTailList(items, &mr->list_entry); 223 224 if (mr2) 225 *mr2 = mr; 226 227 return STATUS_SUCCESS; 228 } 229 230 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 231 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) { 232 LIST_ENTRY* le; 233 KEY searchkey; 234 traverse_ptr tp; 235 bool skinny = false; 236 NTSTATUS Status; 237 238 le = items->Flink; 239 while (le != items) { 240 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 241 242 if (mr->address == address) { 243 *mr2 = mr; 244 return STATUS_SUCCESS; 245 } 246 247 le = le->Flink; 248 } 249 250 searchkey.obj_id = address; 251 searchkey.obj_type = TYPE_METADATA_ITEM; 252 searchkey.offset = 0xffffffffffffffff; 253 254 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 255 if (!NT_SUCCESS(Status)) { 256 ERR("find_item returned %08lx\n", Status); 257 return Status; 258 } 259 260 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) 261 skinny = true; 262 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 263 tp.item->size >= sizeof(EXTENT_ITEM)) { 264 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 265 266 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) { 267 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address); 268 return STATUS_INTERNAL_ERROR; 269 } 270 } else { 271 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address); 272 return STATUS_INTERNAL_ERROR; 273 } 274 275 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback); 276 if (!NT_SUCCESS(Status)) { 277 ERR("add_metadata_reloc returned %08lx\n", Status); 278 return Status; 279 } 280 281 return STATUS_SUCCESS; 282 } 283 284 static void sort_metadata_reloc_refs(metadata_reloc* mr) { 285 LIST_ENTRY newlist, *le; 286 287 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items 288 return; 289 290 // insertion sort 291 292 InitializeListHead(&newlist); 293 294 while (!IsListEmpty(&mr->refs)) { 295 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 296 bool inserted = false; 297 298 if (ref->type == TYPE_TREE_BLOCK_REF) 299 ref->hash = ref->tbr.offset; 300 else if (ref->type == TYPE_SHARED_BLOCK_REF) 301 ref->hash = ref->parent->new_address; 302 303 le = newlist.Flink; 304 while (le != &newlist) { 305 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 306 307 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 308 InsertHeadList(le->Blink, &ref->list_entry); 309 inserted = true; 310 break; 311 } 312 313 le = le->Flink; 314 } 315 316 if (!inserted) 317 InsertTailList(&newlist, &ref->list_entry); 318 } 319 320 newlist.Flink->Blink = &mr->refs; 321 newlist.Blink->Flink = &mr->refs; 322 mr->refs.Flink = newlist.Flink; 323 mr->refs.Blink = newlist.Blink; 324 } 325 326 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) { 327 NTSTATUS Status; 328 LIST_ENTRY* le; 329 uint64_t rc = 0; 330 uint16_t inline_len; 331 bool all_inline = true; 332 metadata_reloc_ref* first_noninline = NULL; 333 EXTENT_ITEM* ei; 334 uint8_t* ptr; 335 336 inline_len = sizeof(EXTENT_ITEM); 337 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) 338 inline_len += sizeof(EXTENT_ITEM2); 339 340 sort_metadata_reloc_refs(mr); 341 342 le = mr->refs.Flink; 343 while (le != &mr->refs) { 344 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 345 uint16_t extlen = 0; 346 347 rc++; 348 349 if (ref->type == TYPE_TREE_BLOCK_REF) 350 extlen += sizeof(TREE_BLOCK_REF); 351 else if (ref->type == TYPE_SHARED_BLOCK_REF) 352 extlen += sizeof(SHARED_BLOCK_REF); 353 354 if (all_inline) { 355 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 356 all_inline = false; 357 first_noninline = ref; 358 } else 359 inline_len += extlen + 1; 360 } 361 362 le = le->Flink; 363 } 364 365 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 366 if (!ei) { 367 ERR("out of memory\n"); 368 return STATUS_INSUFFICIENT_RESOURCES; 369 } 370 371 ei->refcount = rc; 372 ei->generation = mr->ei->generation; 373 ei->flags = mr->ei->flags; 374 ptr = (uint8_t*)&ei[1]; 375 376 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) { 377 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr; 378 379 ei2->firstitem = *(KEY*)&mr->data[1]; 380 ei2->level = mr->data->level; 381 382 ptr += sizeof(EXTENT_ITEM2); 383 } 384 385 le = mr->refs.Flink; 386 while (le != &mr->refs) { 387 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 388 389 if (ref == first_noninline) 390 break; 391 392 *ptr = ref->type; 393 ptr++; 394 395 if (ref->type == TYPE_TREE_BLOCK_REF) { 396 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr; 397 398 tbr->offset = ref->tbr.offset; 399 400 ptr += sizeof(TREE_BLOCK_REF); 401 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 402 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr; 403 404 sbr->offset = ref->parent->new_address; 405 406 ptr += sizeof(SHARED_BLOCK_REF); 407 } 408 409 le = le->Flink; 410 } 411 412 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA) 413 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL); 414 else 415 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL); 416 417 if (!NT_SUCCESS(Status)) { 418 ERR("insert_tree_item returned %08lx\n", Status); 419 ExFreePool(ei); 420 return Status; 421 } 422 423 if (!all_inline) { 424 le = &first_noninline->list_entry; 425 426 while (le != &mr->refs) { 427 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 428 429 if (ref->type == TYPE_TREE_BLOCK_REF) { 430 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL); 431 if (!NT_SUCCESS(Status)) { 432 ERR("insert_tree_item returned %08lx\n", Status); 433 return Status; 434 } 435 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 436 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL); 437 if (!NT_SUCCESS(Status)) { 438 ERR("insert_tree_item returned %08lx\n", Status); 439 return Status; 440 } 441 } 442 443 le = le->Flink; 444 } 445 } 446 447 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) { 448 if (mr->data->level > 0) { 449 uint16_t i; 450 internal_node* in = (internal_node*)&mr->data[1]; 451 452 for (i = 0; i < mr->data->num_items; i++) { 453 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL); 454 455 if (sbrrc > 0) { 456 SHARED_BLOCK_REF sbr; 457 458 sbr.offset = mr->new_address; 459 460 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL); 461 if (!NT_SUCCESS(Status)) { 462 ERR("increase_extent_refcount returned %08lx\n", Status); 463 return Status; 464 } 465 466 sbr.offset = mr->address; 467 468 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, 469 sbr.offset, false, NULL); 470 if (!NT_SUCCESS(Status)) { 471 ERR("decrease_extent_refcount returned %08lx\n", Status); 472 return Status; 473 } 474 } 475 } 476 } else { 477 uint16_t i; 478 leaf_node* ln = (leaf_node*)&mr->data[1]; 479 480 for (i = 0; i < mr->data->num_items; i++) { 481 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 482 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 483 484 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 485 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 486 487 if (ed2->size > 0) { // not sparse 488 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL); 489 490 if (sdrrc > 0) { 491 SHARED_DATA_REF sdr; 492 chunk* c; 493 494 sdr.offset = mr->new_address; 495 sdr.count = sdrrc; 496 497 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL); 498 if (!NT_SUCCESS(Status)) { 499 ERR("increase_extent_refcount returned %08lx\n", Status); 500 return Status; 501 } 502 503 sdr.offset = mr->address; 504 505 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, 506 sdr.offset, false, NULL); 507 if (!NT_SUCCESS(Status)) { 508 ERR("decrease_extent_refcount returned %08lx\n", Status); 509 return Status; 510 } 511 512 c = get_chunk_from_address(Vcb, ed2->address); 513 514 if (c) { 515 // check changed_extents 516 517 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true); 518 519 le = c->changed_extents.Flink; 520 521 while (le != &c->changed_extents) { 522 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 523 524 if (ce->address == ed2->address) { 525 LIST_ENTRY* le2; 526 527 le2 = ce->refs.Flink; 528 while (le2 != &ce->refs) { 529 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 530 531 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 532 cer->sdr.offset = mr->new_address; 533 break; 534 } 535 536 le2 = le2->Flink; 537 } 538 539 le2 = ce->old_refs.Flink; 540 while (le2 != &ce->old_refs) { 541 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 542 543 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 544 cer->sdr.offset = mr->new_address; 545 break; 546 } 547 548 le2 = le2->Flink; 549 } 550 551 break; 552 } 553 554 le = le->Flink; 555 } 556 557 ExReleaseResourceLite(&c->changed_extents_lock); 558 } 559 } 560 } 561 } 562 } 563 } 564 } 565 } 566 567 return STATUS_SUCCESS; 568 } 569 570 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 571 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) { 572 LIST_ENTRY tree_writes, *le; 573 NTSTATUS Status; 574 traverse_ptr tp; 575 uint8_t level, max_level = 0; 576 chunk* newchunk = NULL; 577 578 InitializeListHead(&tree_writes); 579 580 le = items->Flink; 581 while (le != items) { 582 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 583 LIST_ENTRY* le2; 584 chunk* pc; 585 586 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG); 587 if (!mr->data) { 588 ERR("out of memory\n"); 589 return STATUS_INSUFFICIENT_RESOURCES; 590 } 591 592 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data, 593 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority); 594 if (!NT_SUCCESS(Status)) { 595 ERR("read_data returned %08lx\n", Status); 596 return Status; 597 } 598 599 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM) 600 mr->system = true; 601 602 if (data_items && mr->data->level == 0) { 603 le2 = data_items->Flink; 604 while (le2 != data_items) { 605 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 606 leaf_node* ln = (leaf_node*)&mr->data[1]; 607 uint16_t i; 608 609 for (i = 0; i < mr->data->num_items; i++) { 610 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 611 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 612 613 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 614 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 615 616 if (ed2->address == dr->address) 617 ed2->address = dr->new_address; 618 } 619 } 620 } 621 622 le2 = le2->Flink; 623 } 624 } 625 626 if (mr->data->level > max_level) 627 max_level = mr->data->level; 628 629 le2 = mr->refs.Flink; 630 while (le2 != &mr->refs) { 631 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 632 633 if (ref->type == TYPE_TREE_BLOCK_REF) { 634 KEY* firstitem; 635 root* r = NULL; 636 LIST_ENTRY* le3; 637 tree* t; 638 639 firstitem = (KEY*)&mr->data[1]; 640 641 le3 = Vcb->roots.Flink; 642 while (le3 != &Vcb->roots) { 643 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 644 645 if (r2->id == ref->tbr.offset) { 646 r = r2; 647 break; 648 } 649 650 le3 = le3->Flink; 651 } 652 653 if (!r) { 654 ERR("could not find subvol with id %I64x\n", ref->tbr.offset); 655 return STATUS_INTERNAL_ERROR; 656 } 657 658 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL); 659 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 660 ERR("find_item_to_level returned %08lx\n", Status); 661 return Status; 662 } 663 664 t = tp.tree; 665 while (t && t->header.level < mr->data->level + 1) { 666 t = t->parent; 667 } 668 669 if (!t) 670 ref->top = true; 671 else { 672 metadata_reloc* mr2; 673 674 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback); 675 if (!NT_SUCCESS(Status)) { 676 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 677 return Status; 678 } 679 680 ref->parent = mr2; 681 } 682 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 683 metadata_reloc* mr2; 684 685 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback); 686 if (!NT_SUCCESS(Status)) { 687 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 688 return Status; 689 } 690 691 ref->parent = mr2; 692 } 693 694 le2 = le2->Flink; 695 } 696 697 le = le->Flink; 698 } 699 700 le = items->Flink; 701 while (le != items) { 702 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 703 LIST_ENTRY* le2; 704 uint32_t hash; 705 706 mr->t = NULL; 707 708 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t)); 709 710 le2 = Vcb->trees_ptrs[hash >> 24]; 711 712 if (le2) { 713 while (le2 != &Vcb->trees_hash) { 714 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash); 715 716 if (t->header.address == mr->address) { 717 mr->t = t; 718 break; 719 } else if (t->hash > hash) 720 break; 721 722 le2 = le2->Flink; 723 } 724 } 725 726 le = le->Flink; 727 } 728 729 for (level = 0; level <= max_level; level++) { 730 le = items->Flink; 731 while (le != items) { 732 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 733 734 if (mr->data->level == level) { 735 bool done = false; 736 LIST_ENTRY* le2; 737 tree_write* tw; 738 uint64_t flags; 739 tree* t3; 740 741 if (mr->system) 742 flags = Vcb->system_flags; 743 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) 744 flags = Vcb->data_flags; 745 else 746 flags = Vcb->metadata_flags; 747 748 if (newchunk) { 749 acquire_chunk_lock(newchunk, Vcb); 750 751 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 752 newchunk->used += Vcb->superblock.node_size; 753 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 754 done = true; 755 } 756 757 release_chunk_lock(newchunk, Vcb); 758 } 759 760 if (!done) { 761 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 762 763 le2 = Vcb->chunks.Flink; 764 while (le2 != &Vcb->chunks) { 765 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 766 767 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) { 768 acquire_chunk_lock(c2, Vcb); 769 770 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) { 771 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) { 772 c2->used += Vcb->superblock.node_size; 773 space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback); 774 release_chunk_lock(c2, Vcb); 775 newchunk = c2; 776 done = true; 777 break; 778 } 779 } 780 781 release_chunk_lock(c2, Vcb); 782 } 783 784 le2 = le2->Flink; 785 } 786 787 // allocate new chunk if necessary 788 if (!done) { 789 Status = alloc_chunk(Vcb, flags, &newchunk, false); 790 791 if (!NT_SUCCESS(Status)) { 792 ERR("alloc_chunk returned %08lx\n", Status); 793 ExReleaseResourceLite(&Vcb->chunk_lock); 794 goto end; 795 } 796 797 acquire_chunk_lock(newchunk, Vcb); 798 799 newchunk->balance_num = Vcb->balance.balance_num; 800 801 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 802 release_chunk_lock(newchunk, Vcb); 803 ExReleaseResourceLite(&Vcb->chunk_lock); 804 ERR("could not find address in new chunk\n"); 805 Status = STATUS_DISK_FULL; 806 goto end; 807 } else { 808 newchunk->used += Vcb->superblock.node_size; 809 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 810 } 811 812 release_chunk_lock(newchunk, Vcb); 813 } 814 815 ExReleaseResourceLite(&Vcb->chunk_lock); 816 } 817 818 // update parents 819 le2 = mr->refs.Flink; 820 while (le2 != &mr->refs) { 821 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 822 823 if (ref->parent) { 824 uint16_t i; 825 internal_node* in = (internal_node*)&ref->parent->data[1]; 826 827 for (i = 0; i < ref->parent->data->num_items; i++) { 828 if (in[i].address == mr->address) { 829 in[i].address = mr->new_address; 830 break; 831 } 832 } 833 834 if (ref->parent->t) { 835 LIST_ENTRY* le3; 836 837 le3 = ref->parent->t->itemlist.Flink; 838 while (le3 != &ref->parent->t->itemlist) { 839 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 840 841 if (!td->inserted && td->treeholder.address == mr->address) 842 td->treeholder.address = mr->new_address; 843 844 le3 = le3->Flink; 845 } 846 } 847 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) { 848 LIST_ENTRY* le3; 849 root* r = NULL; 850 851 // alter ROOT_ITEM 852 853 le3 = Vcb->roots.Flink; 854 while (le3 != &Vcb->roots) { 855 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 856 857 if (r2->id == ref->tbr.offset) { 858 r = r2; 859 break; 860 } 861 862 le3 = le3->Flink; 863 } 864 865 if (r) { 866 r->treeholder.address = mr->new_address; 867 868 if (r == Vcb->root_root) 869 Vcb->superblock.root_tree_addr = mr->new_address; 870 else if (r == Vcb->chunk_root) 871 Vcb->superblock.chunk_tree_addr = mr->new_address; 872 else if (r->root_item.block_number == mr->address) { 873 KEY searchkey; 874 ROOT_ITEM* ri; 875 876 r->root_item.block_number = mr->new_address; 877 878 searchkey.obj_id = r->id; 879 searchkey.obj_type = TYPE_ROOT_ITEM; 880 searchkey.offset = 0xffffffffffffffff; 881 882 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 883 if (!NT_SUCCESS(Status)) { 884 ERR("find_item returned %08lx\n", Status); 885 goto end; 886 } 887 888 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) { 889 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id); 890 Status = STATUS_INTERNAL_ERROR; 891 goto end; 892 } 893 894 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG); 895 if (!ri) { 896 ERR("out of memory\n"); 897 Status = STATUS_INSUFFICIENT_RESOURCES; 898 goto end; 899 } 900 901 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM)); 902 903 Status = delete_tree_item(Vcb, &tp); 904 if (!NT_SUCCESS(Status)) { 905 ERR("delete_tree_item returned %08lx\n", Status); 906 goto end; 907 } 908 909 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL); 910 if (!NT_SUCCESS(Status)) { 911 ERR("insert_tree_item returned %08lx\n", Status); 912 goto end; 913 } 914 } 915 } 916 } 917 918 le2 = le2->Flink; 919 } 920 921 mr->data->address = mr->new_address; 922 923 t3 = mr->t; 924 925 while (t3) { 926 uint8_t h; 927 bool inserted; 928 tree* t4 = NULL; 929 930 // check if tree loaded more than once 931 if (t3->list_entry.Flink != &Vcb->trees_hash) { 932 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 933 934 if (nt->header.address == t3->header.address) 935 t4 = nt; 936 } 937 938 t3->header.address = mr->new_address; 939 940 h = t3->hash >> 24; 941 942 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) { 943 if (t3->list_entry_hash.Flink == &Vcb->trees_hash) 944 Vcb->trees_ptrs[h] = NULL; 945 else { 946 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 947 948 if (t2->hash >> 24 == h) 949 Vcb->trees_ptrs[h] = &t2->list_entry_hash; 950 else 951 Vcb->trees_ptrs[h] = NULL; 952 } 953 } 954 955 RemoveEntryList(&t3->list_entry_hash); 956 957 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t)); 958 h = t3->hash >> 24; 959 960 if (!Vcb->trees_ptrs[h]) { 961 uint8_t h2 = h; 962 963 le2 = Vcb->trees_hash.Flink; 964 965 if (h2 > 0) { 966 h2--; 967 do { 968 if (Vcb->trees_ptrs[h2]) { 969 le2 = Vcb->trees_ptrs[h2]; 970 break; 971 } 972 973 h2--; 974 } while (h2 > 0); 975 } 976 } else 977 le2 = Vcb->trees_ptrs[h]; 978 979 inserted = false; 980 while (le2 != &Vcb->trees_hash) { 981 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash); 982 983 if (t2->hash >= t3->hash) { 984 InsertHeadList(le2->Blink, &t3->list_entry_hash); 985 inserted = true; 986 break; 987 } 988 989 le2 = le2->Flink; 990 } 991 992 if (!inserted) 993 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash); 994 995 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h]) 996 Vcb->trees_ptrs[h] = &t3->list_entry_hash; 997 998 if (data_items && level == 0) { 999 le2 = data_items->Flink; 1000 1001 while (le2 != data_items) { 1002 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 1003 LIST_ENTRY* le3 = t3->itemlist.Flink; 1004 1005 while (le3 != &t3->itemlist) { 1006 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 1007 1008 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 1009 EXTENT_DATA* ed = (EXTENT_DATA*)td->data; 1010 1011 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 1012 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1013 1014 if (ed2->address == dr->address) 1015 ed2->address = dr->new_address; 1016 } 1017 } 1018 1019 le3 = le3->Flink; 1020 } 1021 1022 le2 = le2->Flink; 1023 } 1024 } 1025 1026 t3 = t4; 1027 } 1028 1029 calc_tree_checksum(Vcb, mr->data); 1030 1031 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG); 1032 if (!tw) { 1033 ERR("out of memory\n"); 1034 Status = STATUS_INSUFFICIENT_RESOURCES; 1035 goto end; 1036 } 1037 1038 tw->address = mr->new_address; 1039 tw->length = Vcb->superblock.node_size; 1040 tw->data = (uint8_t*)mr->data; 1041 tw->allocated = false; 1042 1043 if (IsListEmpty(&tree_writes)) 1044 InsertTailList(&tree_writes, &tw->list_entry); 1045 else { 1046 bool inserted = false; 1047 1048 le2 = tree_writes.Flink; 1049 while (le2 != &tree_writes) { 1050 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry); 1051 1052 if (tw2->address > tw->address) { 1053 InsertHeadList(le2->Blink, &tw->list_entry); 1054 inserted = true; 1055 break; 1056 } 1057 1058 le2 = le2->Flink; 1059 } 1060 1061 if (!inserted) 1062 InsertTailList(&tree_writes, &tw->list_entry); 1063 } 1064 } 1065 1066 le = le->Flink; 1067 } 1068 } 1069 1070 Status = do_tree_writes(Vcb, &tree_writes, true); 1071 if (!NT_SUCCESS(Status)) { 1072 ERR("do_tree_writes returned %08lx\n", Status); 1073 goto end; 1074 } 1075 1076 le = items->Flink; 1077 while (le != items) { 1078 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 1079 1080 Status = add_metadata_reloc_extent_item(Vcb, mr); 1081 if (!NT_SUCCESS(Status)) { 1082 ERR("add_metadata_reloc_extent_item returned %08lx\n", Status); 1083 goto end; 1084 } 1085 1086 le = le->Flink; 1087 } 1088 1089 Status = STATUS_SUCCESS; 1090 1091 end: 1092 while (!IsListEmpty(&tree_writes)) { 1093 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry); 1094 1095 if (tw->allocated) 1096 ExFreePool(tw->data); 1097 1098 ExFreePool(tw); 1099 } 1100 1101 return Status; 1102 } 1103 1104 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1105 KEY searchkey; 1106 traverse_ptr tp; 1107 NTSTATUS Status; 1108 bool b; 1109 LIST_ENTRY items, rollback; 1110 uint32_t loaded = 0; 1111 1112 TRACE("chunk %I64x\n", c->offset); 1113 1114 InitializeListHead(&rollback); 1115 InitializeListHead(&items); 1116 1117 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1118 1119 searchkey.obj_id = c->offset; 1120 searchkey.obj_type = TYPE_METADATA_ITEM; 1121 searchkey.offset = 0xffffffffffffffff; 1122 1123 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1124 if (!NT_SUCCESS(Status)) { 1125 ERR("find_item returned %08lx\n", Status); 1126 goto end; 1127 } 1128 1129 do { 1130 traverse_ptr next_tp; 1131 1132 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1133 break; 1134 1135 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { 1136 bool tree = false, skinny = false; 1137 1138 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1139 tree = true; 1140 skinny = true; 1141 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 1142 tp.item->size >= sizeof(EXTENT_ITEM)) { 1143 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1144 1145 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1146 tree = true; 1147 } 1148 1149 if (tree) { 1150 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback); 1151 1152 if (!NT_SUCCESS(Status)) { 1153 ERR("add_metadata_reloc returned %08lx\n", Status); 1154 goto end; 1155 } 1156 1157 loaded++; 1158 1159 if (loaded >= 64) // only do 64 at a time 1160 break; 1161 } 1162 } 1163 1164 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1165 1166 if (b) 1167 tp = next_tp; 1168 } while (b); 1169 1170 if (IsListEmpty(&items)) { 1171 *changed = false; 1172 Status = STATUS_SUCCESS; 1173 goto end; 1174 } else 1175 *changed = true; 1176 1177 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback); 1178 if (!NT_SUCCESS(Status)) { 1179 ERR("write_metadata_items returned %08lx\n", Status); 1180 goto end; 1181 } 1182 1183 Status = STATUS_SUCCESS; 1184 1185 Vcb->need_write = true; 1186 1187 end: 1188 if (NT_SUCCESS(Status)) { 1189 Status = do_write(Vcb, NULL); 1190 if (!NT_SUCCESS(Status)) 1191 ERR("do_write returned %08lx\n", Status); 1192 } 1193 1194 if (NT_SUCCESS(Status)) 1195 clear_rollback(&rollback); 1196 else 1197 do_rollback(Vcb, &rollback); 1198 1199 free_trees(Vcb); 1200 1201 ExReleaseResourceLite(&Vcb->tree_lock); 1202 1203 while (!IsListEmpty(&items)) { 1204 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry); 1205 1206 while (!IsListEmpty(&mr->refs)) { 1207 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 1208 1209 ExFreePool(ref); 1210 } 1211 1212 if (mr->data) 1213 ExFreePool(mr->data); 1214 1215 ExFreePool(mr); 1216 } 1217 1218 return Status; 1219 } 1220 1221 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items, 1222 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) { 1223 NTSTATUS Status; 1224 LIST_ENTRY* le; 1225 KEY searchkey; 1226 traverse_ptr tp; 1227 root* r = NULL; 1228 metadata_reloc* mr; 1229 uint64_t last_tree = 0; 1230 data_reloc_ref* ref; 1231 1232 le = Vcb->roots.Flink; 1233 while (le != &Vcb->roots) { 1234 root* r2 = CONTAINING_RECORD(le, root, list_entry); 1235 1236 if (r2->id == edr->root) { 1237 r = r2; 1238 break; 1239 } 1240 1241 le = le->Flink; 1242 } 1243 1244 if (!r) { 1245 ERR("could not find subvol %I64x\n", edr->root); 1246 return STATUS_INTERNAL_ERROR; 1247 } 1248 1249 searchkey.obj_id = edr->objid; 1250 searchkey.obj_type = TYPE_EXTENT_DATA; 1251 searchkey.offset = 0; 1252 1253 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL); 1254 if (!NT_SUCCESS(Status)) { 1255 ERR("find_item returned %08lx\n", Status); 1256 return Status; 1257 } 1258 1259 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) { 1260 traverse_ptr tp2; 1261 1262 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1263 tp = tp2; 1264 else { 1265 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id); 1266 return STATUS_INTERNAL_ERROR; 1267 } 1268 } 1269 1270 ref = NULL; 1271 1272 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) { 1273 traverse_ptr tp2; 1274 1275 if (tp.item->size >= sizeof(EXTENT_DATA)) { 1276 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data; 1277 1278 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) { 1279 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1280 1281 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) { 1282 if (ref && last_tree == tp.tree->header.address) 1283 ref->edr.count++; 1284 else { 1285 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1286 if (!ref) { 1287 ERR("out of memory\n"); 1288 return STATUS_INSUFFICIENT_RESOURCES; 1289 } 1290 1291 ref->type = TYPE_EXTENT_DATA_REF; 1292 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF)); 1293 ref->edr.count = 1; 1294 1295 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback); 1296 if (!NT_SUCCESS(Status)) { 1297 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1298 ExFreePool(ref); 1299 return Status; 1300 } 1301 1302 last_tree = tp.tree->header.address; 1303 ref->parent = mr; 1304 1305 InsertTailList(&dr->refs, &ref->list_entry); 1306 } 1307 } 1308 } 1309 } 1310 1311 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1312 tp = tp2; 1313 else 1314 break; 1315 } 1316 1317 return STATUS_SUCCESS; 1318 } 1319 1320 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items, 1321 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) { 1322 NTSTATUS Status; 1323 data_reloc* dr; 1324 EXTENT_ITEM* ei; 1325 uint16_t len; 1326 uint64_t inline_rc; 1327 uint8_t* ptr; 1328 1329 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG); 1330 if (!dr) { 1331 ERR("out of memory\n"); 1332 return STATUS_INSUFFICIENT_RESOURCES; 1333 } 1334 1335 dr->address = tp->item->key.obj_id; 1336 dr->size = tp->item->key.offset; 1337 dr->ei = (EXTENT_ITEM*)tp->item->data; 1338 InitializeListHead(&dr->refs); 1339 1340 Status = delete_tree_item(Vcb, tp); 1341 if (!NT_SUCCESS(Status)) { 1342 ERR("delete_tree_item returned %08lx\n", Status); 1343 return Status; 1344 } 1345 1346 if (!c) 1347 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 1348 1349 if (c) { 1350 acquire_chunk_lock(c, Vcb); 1351 1352 c->used -= tp->item->key.offset; 1353 1354 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback); 1355 1356 release_chunk_lock(c, Vcb); 1357 } 1358 1359 ei = (EXTENT_ITEM*)tp->item->data; 1360 inline_rc = 0; 1361 1362 len = tp->item->size - sizeof(EXTENT_ITEM); 1363 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 1364 1365 while (len > 0) { 1366 uint8_t secttype = *ptr; 1367 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0); 1368 1369 len--; 1370 1371 if (sectlen > len) { 1372 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 1373 return STATUS_INTERNAL_ERROR; 1374 } 1375 1376 if (sectlen == 0) { 1377 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 1378 return STATUS_INTERNAL_ERROR; 1379 } 1380 1381 if (secttype == TYPE_EXTENT_DATA_REF) { 1382 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t)); 1383 1384 inline_rc += edr->count; 1385 1386 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback); 1387 if (!NT_SUCCESS(Status)) { 1388 ERR("data_reloc_add_tree_edr returned %08lx\n", Status); 1389 return Status; 1390 } 1391 } else if (secttype == TYPE_SHARED_DATA_REF) { 1392 metadata_reloc* mr; 1393 data_reloc_ref* ref; 1394 1395 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1396 if (!ref) { 1397 ERR("out of memory\n"); 1398 return STATUS_INSUFFICIENT_RESOURCES; 1399 } 1400 1401 ref->type = TYPE_SHARED_DATA_REF; 1402 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF)); 1403 inline_rc += ref->sdr.count; 1404 1405 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1406 if (!NT_SUCCESS(Status)) { 1407 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1408 ExFreePool(ref); 1409 return Status; 1410 } 1411 1412 ref->parent = mr; 1413 1414 InsertTailList(&dr->refs, &ref->list_entry); 1415 } else { 1416 ERR("unexpected tree type %x\n", secttype); 1417 return STATUS_INTERNAL_ERROR; 1418 } 1419 1420 1421 len -= sectlen; 1422 ptr += sizeof(uint8_t) + sectlen; 1423 } 1424 1425 if (inline_rc < ei->refcount) { // look for non-inline entries 1426 traverse_ptr tp2 = *tp, next_tp; 1427 1428 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 1429 tp2 = next_tp; 1430 1431 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 1432 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) { 1433 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback); 1434 if (!NT_SUCCESS(Status)) { 1435 ERR("data_reloc_add_tree_edr returned %08lx\n", Status); 1436 return Status; 1437 } 1438 1439 Status = delete_tree_item(Vcb, &tp2); 1440 if (!NT_SUCCESS(Status)) { 1441 ERR("delete_tree_item returned %08lx\n", Status); 1442 return Status; 1443 } 1444 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) { 1445 metadata_reloc* mr; 1446 data_reloc_ref* ref; 1447 1448 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1449 if (!ref) { 1450 ERR("out of memory\n"); 1451 return STATUS_INSUFFICIENT_RESOURCES; 1452 } 1453 1454 ref->type = TYPE_SHARED_DATA_REF; 1455 ref->sdr.offset = tp2.item->key.offset; 1456 ref->sdr.count = *((uint32_t*)tp2.item->data); 1457 1458 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1459 if (!NT_SUCCESS(Status)) { 1460 ERR("add_metadata_reloc_parent returned %08lx\n", Status); 1461 ExFreePool(ref); 1462 return Status; 1463 } 1464 1465 ref->parent = mr; 1466 InsertTailList(&dr->refs, &ref->list_entry); 1467 1468 Status = delete_tree_item(Vcb, &tp2); 1469 if (!NT_SUCCESS(Status)) { 1470 ERR("delete_tree_item returned %08lx\n", Status); 1471 return Status; 1472 } 1473 } 1474 } else 1475 break; 1476 } 1477 } 1478 1479 InsertTailList(items, &dr->list_entry); 1480 1481 return STATUS_SUCCESS; 1482 } 1483 1484 static void sort_data_reloc_refs(data_reloc* dr) { 1485 LIST_ENTRY newlist, *le; 1486 1487 if (IsListEmpty(&dr->refs)) 1488 return; 1489 1490 // insertion sort 1491 1492 InitializeListHead(&newlist); 1493 1494 while (!IsListEmpty(&dr->refs)) { 1495 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 1496 bool inserted = false; 1497 1498 if (ref->type == TYPE_EXTENT_DATA_REF) 1499 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset); 1500 else if (ref->type == TYPE_SHARED_DATA_REF) 1501 ref->hash = ref->parent->new_address; 1502 1503 le = newlist.Flink; 1504 while (le != &newlist) { 1505 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1506 1507 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 1508 InsertHeadList(le->Blink, &ref->list_entry); 1509 inserted = true; 1510 break; 1511 } 1512 1513 le = le->Flink; 1514 } 1515 1516 if (!inserted) 1517 InsertTailList(&newlist, &ref->list_entry); 1518 } 1519 1520 le = newlist.Flink; 1521 while (le != &newlist) { 1522 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1523 1524 if (le->Flink != &newlist) { 1525 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry); 1526 1527 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root && 1528 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) { 1529 RemoveEntryList(&ref2->list_entry); 1530 ref->edr.count += ref2->edr.count; 1531 ExFreePool(ref2); 1532 continue; 1533 } 1534 } 1535 1536 le = le->Flink; 1537 } 1538 1539 newlist.Flink->Blink = &dr->refs; 1540 newlist.Blink->Flink = &dr->refs; 1541 dr->refs.Flink = newlist.Flink; 1542 dr->refs.Blink = newlist.Blink; 1543 } 1544 1545 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) { 1546 NTSTATUS Status; 1547 LIST_ENTRY* le; 1548 uint64_t rc = 0; 1549 uint16_t inline_len; 1550 bool all_inline = true; 1551 data_reloc_ref* first_noninline = NULL; 1552 EXTENT_ITEM* ei; 1553 uint8_t* ptr; 1554 1555 inline_len = sizeof(EXTENT_ITEM); 1556 1557 sort_data_reloc_refs(dr); 1558 1559 le = dr->refs.Flink; 1560 while (le != &dr->refs) { 1561 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1562 uint16_t extlen = 0; 1563 1564 if (ref->type == TYPE_EXTENT_DATA_REF) { 1565 extlen += sizeof(EXTENT_DATA_REF); 1566 rc += ref->edr.count; 1567 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1568 extlen += sizeof(SHARED_DATA_REF); 1569 rc++; 1570 } 1571 1572 if (all_inline) { 1573 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 1574 all_inline = false; 1575 first_noninline = ref; 1576 } else 1577 inline_len += extlen + 1; 1578 } 1579 1580 le = le->Flink; 1581 } 1582 1583 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 1584 if (!ei) { 1585 ERR("out of memory\n"); 1586 return STATUS_INSUFFICIENT_RESOURCES; 1587 } 1588 1589 ei->refcount = rc; 1590 ei->generation = dr->ei->generation; 1591 ei->flags = dr->ei->flags; 1592 ptr = (uint8_t*)&ei[1]; 1593 1594 le = dr->refs.Flink; 1595 while (le != &dr->refs) { 1596 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1597 1598 if (ref == first_noninline) 1599 break; 1600 1601 *ptr = ref->type; 1602 ptr++; 1603 1604 if (ref->type == TYPE_EXTENT_DATA_REF) { 1605 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr; 1606 1607 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1608 1609 ptr += sizeof(EXTENT_DATA_REF); 1610 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1611 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr; 1612 1613 sdr->offset = ref->parent->new_address; 1614 sdr->count = ref->sdr.count; 1615 1616 ptr += sizeof(SHARED_DATA_REF); 1617 } 1618 1619 le = le->Flink; 1620 } 1621 1622 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL); 1623 if (!NT_SUCCESS(Status)) { 1624 ERR("insert_tree_item returned %08lx\n", Status); 1625 return Status; 1626 } 1627 1628 if (!all_inline) { 1629 le = &first_noninline->list_entry; 1630 1631 while (le != &dr->refs) { 1632 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1633 1634 if (ref->type == TYPE_EXTENT_DATA_REF) { 1635 EXTENT_DATA_REF* edr; 1636 1637 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG); 1638 if (!edr) { 1639 ERR("out of memory\n"); 1640 return STATUS_INSUFFICIENT_RESOURCES; 1641 } 1642 1643 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1644 1645 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL); 1646 if (!NT_SUCCESS(Status)) { 1647 ERR("insert_tree_item returned %08lx\n", Status); 1648 return Status; 1649 } 1650 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1651 uint32_t* sdr; 1652 1653 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG); 1654 if (!sdr) { 1655 ERR("out of memory\n"); 1656 return STATUS_INSUFFICIENT_RESOURCES; 1657 } 1658 1659 *sdr = ref->sdr.count; 1660 1661 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL); 1662 if (!NT_SUCCESS(Status)) { 1663 ERR("insert_tree_item returned %08lx\n", Status); 1664 return Status; 1665 } 1666 } 1667 1668 le = le->Flink; 1669 } 1670 } 1671 1672 return STATUS_SUCCESS; 1673 } 1674 1675 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1676 KEY searchkey; 1677 traverse_ptr tp; 1678 NTSTATUS Status; 1679 bool b; 1680 LIST_ENTRY items, metadata_items, rollback, *le; 1681 uint64_t loaded = 0, num_loaded = 0; 1682 chunk* newchunk = NULL; 1683 uint8_t* data = NULL; 1684 1685 TRACE("chunk %I64x\n", c->offset); 1686 1687 InitializeListHead(&rollback); 1688 InitializeListHead(&items); 1689 InitializeListHead(&metadata_items); 1690 1691 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1692 1693 searchkey.obj_id = c->offset; 1694 searchkey.obj_type = TYPE_EXTENT_ITEM; 1695 searchkey.offset = 0xffffffffffffffff; 1696 1697 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1698 if (!NT_SUCCESS(Status)) { 1699 ERR("find_item returned %08lx\n", Status); 1700 goto end; 1701 } 1702 1703 do { 1704 traverse_ptr next_tp; 1705 1706 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1707 break; 1708 1709 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) { 1710 bool tree = false; 1711 1712 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1713 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1714 1715 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1716 tree = true; 1717 } 1718 1719 if (!tree) { 1720 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback); 1721 1722 if (!NT_SUCCESS(Status)) { 1723 ERR("add_data_reloc returned %08lx\n", Status); 1724 goto end; 1725 } 1726 1727 loaded += tp.item->key.offset; 1728 num_loaded++; 1729 1730 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously 1731 break; 1732 } 1733 } 1734 1735 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1736 1737 if (b) 1738 tp = next_tp; 1739 } while (b); 1740 1741 if (IsListEmpty(&items)) { 1742 *changed = false; 1743 Status = STATUS_SUCCESS; 1744 goto end; 1745 } else 1746 *changed = true; 1747 1748 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG); 1749 if (!data) { 1750 ERR("out of memory\n"); 1751 Status = STATUS_INSUFFICIENT_RESOURCES; 1752 goto end; 1753 } 1754 1755 le = items.Flink; 1756 while (le != &items) { 1757 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 1758 bool done = false; 1759 LIST_ENTRY* le2; 1760 void* csum; 1761 RTL_BITMAP bmp; 1762 ULONG* bmparr; 1763 ULONG bmplen, runlength, index, lastoff; 1764 1765 if (newchunk) { 1766 acquire_chunk_lock(newchunk, Vcb); 1767 1768 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1769 newchunk->used += dr->size; 1770 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1771 done = true; 1772 } 1773 1774 release_chunk_lock(newchunk, Vcb); 1775 } 1776 1777 if (!done) { 1778 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 1779 1780 le2 = Vcb->chunks.Flink; 1781 while (le2 != &Vcb->chunks) { 1782 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 1783 1784 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) { 1785 acquire_chunk_lock(c2, Vcb); 1786 1787 if ((c2->chunk_item->size - c2->used) >= dr->size) { 1788 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) { 1789 c2->used += dr->size; 1790 space_list_subtract(c2, false, dr->new_address, dr->size, &rollback); 1791 release_chunk_lock(c2, Vcb); 1792 newchunk = c2; 1793 done = true; 1794 break; 1795 } 1796 } 1797 1798 release_chunk_lock(c2, Vcb); 1799 } 1800 1801 le2 = le2->Flink; 1802 } 1803 1804 // allocate new chunk if necessary 1805 if (!done) { 1806 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false); 1807 1808 if (!NT_SUCCESS(Status)) { 1809 ERR("alloc_chunk returned %08lx\n", Status); 1810 ExReleaseResourceLite(&Vcb->chunk_lock); 1811 goto end; 1812 } 1813 1814 acquire_chunk_lock(newchunk, Vcb); 1815 1816 newchunk->balance_num = Vcb->balance.balance_num; 1817 1818 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1819 release_chunk_lock(newchunk, Vcb); 1820 ExReleaseResourceLite(&Vcb->chunk_lock); 1821 ERR("could not find address in new chunk\n"); 1822 Status = STATUS_DISK_FULL; 1823 goto end; 1824 } else { 1825 newchunk->used += dr->size; 1826 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1827 } 1828 1829 release_chunk_lock(newchunk, Vcb); 1830 } 1831 1832 ExReleaseResourceLite(&Vcb->chunk_lock); 1833 } 1834 1835 dr->newchunk = newchunk; 1836 1837 bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size); 1838 1839 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG); 1840 if (!bmparr) { 1841 ERR("out of memory\n"); 1842 Status = STATUS_INSUFFICIENT_RESOURCES; 1843 goto end; 1844 } 1845 1846 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * Vcb->csum_size / Vcb->superblock.sector_size), ALLOC_TAG); 1847 if (!csum) { 1848 ERR("out of memory\n"); 1849 ExFreePool(bmparr); 1850 Status = STATUS_INSUFFICIENT_RESOURCES; 1851 goto end; 1852 } 1853 1854 RtlInitializeBitMap(&bmp, bmparr, bmplen); 1855 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum 1856 1857 searchkey.obj_id = EXTENT_CSUM_ID; 1858 searchkey.obj_type = TYPE_EXTENT_CSUM; 1859 searchkey.offset = dr->address; 1860 1861 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL); 1862 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 1863 ERR("find_item returned %08lx\n", Status); 1864 ExFreePool(csum); 1865 ExFreePool(bmparr); 1866 goto end; 1867 } 1868 1869 if (Status != STATUS_NOT_FOUND) { 1870 do { 1871 traverse_ptr next_tp; 1872 1873 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) { 1874 if (tp.item->key.offset >= dr->address + dr->size) 1875 break; 1876 else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size) >= dr->address) { 1877 uint64_t cs = max(dr->address, tp.item->key.offset); 1878 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size)); 1879 1880 RtlCopyMemory((uint8_t*)csum + ((cs - dr->address) * Vcb->csum_size / Vcb->superblock.sector_size), 1881 tp.item->data + ((cs - tp.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size), 1882 (ULONG)((ce - cs) * Vcb->csum_size / Vcb->superblock.sector_size)); 1883 1884 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size)); 1885 1886 if (ce == dr->address + dr->size) 1887 break; 1888 } 1889 } 1890 1891 if (find_next_item(Vcb, &tp, &next_tp, false, NULL)) 1892 tp = next_tp; 1893 else 1894 break; 1895 } while (true); 1896 } 1897 1898 lastoff = 0; 1899 runlength = RtlFindFirstRunClear(&bmp, &index); 1900 1901 while (runlength != 0) { 1902 if (index >= bmplen) 1903 break; 1904 1905 if (index + runlength >= bmplen) { 1906 runlength = bmplen - index; 1907 1908 if (runlength == 0) 1909 break; 1910 } 1911 1912 if (index > lastoff) { 1913 ULONG off = lastoff; 1914 ULONG size = index - lastoff; 1915 1916 // handle no csum run 1917 do { 1918 ULONG rl; 1919 1920 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1921 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1922 else 1923 rl = size; 1924 1925 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 1926 c, NULL, NULL, 0, false, NormalPagePriority); 1927 if (!NT_SUCCESS(Status)) { 1928 ERR("read_data returned %08lx\n", Status); 1929 ExFreePool(csum); 1930 ExFreePool(bmparr); 1931 goto end; 1932 } 1933 1934 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1935 NULL, newchunk, false, 0, NormalPagePriority); 1936 if (!NT_SUCCESS(Status)) { 1937 ERR("write_data_complete returned %08lx\n", Status); 1938 ExFreePool(csum); 1939 ExFreePool(bmparr); 1940 goto end; 1941 } 1942 1943 size -= rl; 1944 off += rl; 1945 } while (size > 0); 1946 } 1947 1948 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL); 1949 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL); 1950 1951 // handle csum run 1952 do { 1953 ULONG rl; 1954 1955 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT) 1956 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1957 else 1958 rl = runlength; 1959 1960 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, 1961 (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority); 1962 if (!NT_SUCCESS(Status)) { 1963 ERR("read_data returned %08lx\n", Status); 1964 ExFreePool(csum); 1965 ExFreePool(bmparr); 1966 goto end; 1967 } 1968 1969 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1970 NULL, newchunk, false, 0, NormalPagePriority); 1971 if (!NT_SUCCESS(Status)) { 1972 ERR("write_data_complete returned %08lx\n", Status); 1973 ExFreePool(csum); 1974 ExFreePool(bmparr); 1975 goto end; 1976 } 1977 1978 runlength -= rl; 1979 index += rl; 1980 } while (runlength > 0); 1981 1982 lastoff = index; 1983 runlength = RtlFindNextForwardRunClear(&bmp, index, &index); 1984 } 1985 1986 ExFreePool(csum); 1987 ExFreePool(bmparr); 1988 1989 // handle final nocsum run 1990 if (lastoff < dr->size / Vcb->superblock.sector_size) { 1991 ULONG off = lastoff; 1992 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff); 1993 1994 do { 1995 ULONG rl; 1996 1997 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1998 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1999 else 2000 rl = size; 2001 2002 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 2003 c, NULL, NULL, 0, false, NormalPagePriority); 2004 if (!NT_SUCCESS(Status)) { 2005 ERR("read_data returned %08lx\n", Status); 2006 goto end; 2007 } 2008 2009 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 2010 NULL, newchunk, false, 0, NormalPagePriority); 2011 if (!NT_SUCCESS(Status)) { 2012 ERR("write_data_complete returned %08lx\n", Status); 2013 goto end; 2014 } 2015 2016 size -= rl; 2017 off += rl; 2018 } while (size > 0); 2019 } 2020 2021 le = le->Flink; 2022 } 2023 2024 ExFreePool(data); 2025 data = NULL; 2026 2027 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback); 2028 if (!NT_SUCCESS(Status)) { 2029 ERR("write_metadata_items returned %08lx\n", Status); 2030 goto end; 2031 } 2032 2033 le = items.Flink; 2034 while (le != &items) { 2035 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 2036 2037 Status = add_data_reloc_extent_item(Vcb, dr); 2038 if (!NT_SUCCESS(Status)) { 2039 ERR("add_data_reloc_extent_item returned %08lx\n", Status); 2040 goto end; 2041 } 2042 2043 le = le->Flink; 2044 } 2045 2046 le = c->changed_extents.Flink; 2047 while (le != &c->changed_extents) { 2048 LIST_ENTRY *le2, *le3; 2049 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 2050 2051 le3 = le->Flink; 2052 2053 le2 = items.Flink; 2054 while (le2 != &items) { 2055 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 2056 2057 if (ce->address == dr->address) { 2058 ce->address = dr->new_address; 2059 RemoveEntryList(&ce->list_entry); 2060 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry); 2061 break; 2062 } 2063 2064 le2 = le2->Flink; 2065 } 2066 2067 le = le3; 2068 } 2069 2070 Status = STATUS_SUCCESS; 2071 2072 Vcb->need_write = true; 2073 2074 end: 2075 if (NT_SUCCESS(Status)) { 2076 // update extents in cache inodes before we flush 2077 le = Vcb->chunks.Flink; 2078 while (le != &Vcb->chunks) { 2079 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry); 2080 2081 if (c2->cache) { 2082 LIST_ENTRY* le2; 2083 2084 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true); 2085 2086 le2 = c2->cache->extents.Flink; 2087 while (le2 != &c2->cache->extents) { 2088 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2089 2090 if (!ext->ignore) { 2091 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2092 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2093 2094 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2095 LIST_ENTRY* le3 = items.Flink; 2096 while (le3 != &items) { 2097 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2098 2099 if (ed2->address == dr->address) { 2100 ed2->address = dr->new_address; 2101 break; 2102 } 2103 2104 le3 = le3->Flink; 2105 } 2106 } 2107 } 2108 } 2109 2110 le2 = le2->Flink; 2111 } 2112 2113 ExReleaseResourceLite(c2->cache->Header.Resource); 2114 } 2115 2116 le = le->Flink; 2117 } 2118 2119 Status = do_write(Vcb, NULL); 2120 if (!NT_SUCCESS(Status)) 2121 ERR("do_write returned %08lx\n", Status); 2122 } 2123 2124 if (NT_SUCCESS(Status)) { 2125 clear_rollback(&rollback); 2126 2127 // update open FCBs 2128 // FIXME - speed this up(?) 2129 2130 le = Vcb->all_fcbs.Flink; 2131 while (le != &Vcb->all_fcbs) { 2132 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all); 2133 LIST_ENTRY* le2; 2134 2135 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true); 2136 2137 le2 = fcb->extents.Flink; 2138 while (le2 != &fcb->extents) { 2139 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2140 2141 if (!ext->ignore) { 2142 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2143 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2144 2145 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2146 LIST_ENTRY* le3 = items.Flink; 2147 while (le3 != &items) { 2148 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2149 2150 if (ed2->address == dr->address) { 2151 ed2->address = dr->new_address; 2152 break; 2153 } 2154 2155 le3 = le3->Flink; 2156 } 2157 } 2158 } 2159 } 2160 2161 le2 = le2->Flink; 2162 } 2163 2164 ExReleaseResourceLite(fcb->Header.Resource); 2165 2166 le = le->Flink; 2167 } 2168 } else 2169 do_rollback(Vcb, &rollback); 2170 2171 free_trees(Vcb); 2172 2173 ExReleaseResourceLite(&Vcb->tree_lock); 2174 2175 if (data) 2176 ExFreePool(data); 2177 2178 while (!IsListEmpty(&items)) { 2179 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry); 2180 2181 while (!IsListEmpty(&dr->refs)) { 2182 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 2183 2184 ExFreePool(ref); 2185 } 2186 2187 ExFreePool(dr); 2188 } 2189 2190 while (!IsListEmpty(&metadata_items)) { 2191 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry); 2192 2193 while (!IsListEmpty(&mr->refs)) { 2194 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 2195 2196 ExFreePool(ref); 2197 } 2198 2199 if (mr->data) 2200 ExFreePool(mr->data); 2201 2202 ExFreePool(mr); 2203 } 2204 2205 return Status; 2206 } 2207 2208 static __inline uint64_t get_chunk_dup_type(chunk* c) { 2209 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2210 return BLOCK_FLAG_RAID0; 2211 else if (c->chunk_item->type & BLOCK_FLAG_RAID1) 2212 return BLOCK_FLAG_RAID1; 2213 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE) 2214 return BLOCK_FLAG_DUPLICATE; 2215 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2216 return BLOCK_FLAG_RAID10; 2217 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2218 return BLOCK_FLAG_RAID5; 2219 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2220 return BLOCK_FLAG_RAID6; 2221 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3) 2222 return BLOCK_FLAG_RAID1C3; 2223 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4) 2224 return BLOCK_FLAG_RAID1C4; 2225 else 2226 return BLOCK_FLAG_SINGLE; 2227 } 2228 2229 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) { 2230 btrfs_balance_opts* opts; 2231 2232 opts = &Vcb->balance.opts[sort]; 2233 2234 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED)) 2235 return false; 2236 2237 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2238 uint64_t type = get_chunk_dup_type(c); 2239 2240 if (!(type & opts->profiles)) 2241 return false; 2242 } 2243 2244 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2245 uint16_t i; 2246 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2247 bool b = false; 2248 2249 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2250 if (cis[i].dev_id == opts->devid) { 2251 b = true; 2252 break; 2253 } 2254 } 2255 2256 if (!b) 2257 return false; 2258 } 2259 2260 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2261 uint16_t i, factor; 2262 uint64_t physsize; 2263 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2264 bool b = false; 2265 2266 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2267 factor = c->chunk_item->num_stripes; 2268 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2269 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 2270 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2271 factor = c->chunk_item->num_stripes - 1; 2272 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2273 factor = c->chunk_item->num_stripes - 2; 2274 else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4 2275 factor = 1; 2276 2277 physsize = c->chunk_item->size / factor; 2278 2279 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2280 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start && 2281 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) { 2282 b = true; 2283 break; 2284 } 2285 } 2286 2287 if (!b) 2288 return false; 2289 } 2290 2291 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2292 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end) 2293 return false; 2294 } 2295 2296 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2297 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end) 2298 return false; 2299 } 2300 2301 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2302 uint64_t usage = c->used * 100 / c->chunk_item->size; 2303 2304 // usage == 0 should mean completely empty, not just that usage rounds to 0% 2305 if (c->used > 0 && usage == 0) 2306 usage = 1; 2307 2308 if (usage < opts->usage_start || usage > opts->usage_end) 2309 return false; 2310 } 2311 2312 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) { 2313 uint64_t type = get_chunk_dup_type(c); 2314 2315 if (type == opts->convert) 2316 return false; 2317 } 2318 2319 return true; 2320 } 2321 2322 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2323 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2324 args->profiles = opts->profiles; 2325 args->flags |= BALANCE_ARGS_FLAGS_PROFILES; 2326 } 2327 2328 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2329 if (args->usage_start == 0) { 2330 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE; 2331 args->usage_start = opts->usage_start; 2332 args->usage_end = opts->usage_end; 2333 } else { 2334 args->flags |= BALANCE_ARGS_FLAGS_USAGE; 2335 args->usage = opts->usage_end; 2336 } 2337 } 2338 2339 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2340 args->devid = opts->devid; 2341 args->flags |= BALANCE_ARGS_FLAGS_DEVID; 2342 } 2343 2344 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2345 args->drange_start = opts->drange_start; 2346 args->drange_end = opts->drange_end; 2347 args->flags |= BALANCE_ARGS_FLAGS_DRANGE; 2348 } 2349 2350 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2351 args->vrange_start = opts->vrange_start; 2352 args->vrange_end = opts->vrange_end; 2353 args->flags |= BALANCE_ARGS_FLAGS_VRANGE; 2354 } 2355 2356 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) { 2357 args->convert = opts->convert; 2358 args->flags |= BALANCE_ARGS_FLAGS_CONVERT; 2359 2360 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT) 2361 args->flags |= BALANCE_ARGS_FLAGS_SOFT; 2362 } 2363 2364 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) { 2365 if (args->limit_start == 0) { 2366 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE; 2367 args->limit_start = (uint32_t)opts->limit_start; 2368 args->limit_end = (uint32_t)opts->limit_end; 2369 } else { 2370 args->flags |= BALANCE_ARGS_FLAGS_LIMIT; 2371 args->limit = opts->limit_end; 2372 } 2373 } 2374 2375 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2376 args->stripes_start = opts->stripes_start; 2377 args->stripes_end = opts->stripes_end; 2378 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE; 2379 } 2380 } 2381 2382 static NTSTATUS add_balance_item(device_extension* Vcb) { 2383 KEY searchkey; 2384 traverse_ptr tp; 2385 NTSTATUS Status; 2386 BALANCE_ITEM* bi; 2387 2388 searchkey.obj_id = BALANCE_ITEM_ID; 2389 searchkey.obj_type = TYPE_TEMP_ITEM; 2390 searchkey.offset = 0; 2391 2392 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2393 2394 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2395 if (!NT_SUCCESS(Status)) { 2396 ERR("find_item returned %08lx\n", Status); 2397 goto end; 2398 } 2399 2400 if (!keycmp(tp.item->key, searchkey)) { 2401 Status = delete_tree_item(Vcb, &tp); 2402 if (!NT_SUCCESS(Status)) { 2403 ERR("delete_tree_item returned %08lx\n", Status); 2404 goto end; 2405 } 2406 } 2407 2408 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG); 2409 if (!bi) { 2410 ERR("out of memory\n"); 2411 Status = STATUS_INSUFFICIENT_RESOURCES; 2412 goto end; 2413 } 2414 2415 RtlZeroMemory(bi, sizeof(BALANCE_ITEM)); 2416 2417 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2418 bi->flags |= BALANCE_FLAGS_DATA; 2419 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 2420 } 2421 2422 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2423 bi->flags |= BALANCE_FLAGS_METADATA; 2424 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 2425 } 2426 2427 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2428 bi->flags |= BALANCE_FLAGS_SYSTEM; 2429 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 2430 } 2431 2432 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL); 2433 if (!NT_SUCCESS(Status)) { 2434 ERR("insert_tree_item returned %08lx\n", Status); 2435 ExFreePool(bi); 2436 goto end; 2437 } 2438 2439 Status = STATUS_SUCCESS; 2440 2441 end: 2442 if (NT_SUCCESS(Status)) { 2443 Status = do_write(Vcb, NULL); 2444 if (!NT_SUCCESS(Status)) 2445 ERR("do_write returned %08lx\n", Status); 2446 } 2447 2448 free_trees(Vcb); 2449 2450 ExReleaseResourceLite(&Vcb->tree_lock); 2451 2452 return Status; 2453 } 2454 2455 static NTSTATUS remove_balance_item(device_extension* Vcb) { 2456 KEY searchkey; 2457 traverse_ptr tp; 2458 NTSTATUS Status; 2459 2460 searchkey.obj_id = BALANCE_ITEM_ID; 2461 searchkey.obj_type = TYPE_TEMP_ITEM; 2462 searchkey.offset = 0; 2463 2464 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2465 2466 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2467 if (!NT_SUCCESS(Status)) { 2468 ERR("find_item returned %08lx\n", Status); 2469 goto end; 2470 } 2471 2472 if (!keycmp(tp.item->key, searchkey)) { 2473 Status = delete_tree_item(Vcb, &tp); 2474 if (!NT_SUCCESS(Status)) { 2475 ERR("delete_tree_item returned %08lx\n", Status); 2476 goto end; 2477 } 2478 2479 Status = do_write(Vcb, NULL); 2480 if (!NT_SUCCESS(Status)) { 2481 ERR("do_write returned %08lx\n", Status); 2482 goto end; 2483 } 2484 2485 free_trees(Vcb); 2486 } 2487 2488 Status = STATUS_SUCCESS; 2489 2490 end: 2491 ExReleaseResourceLite(&Vcb->tree_lock); 2492 2493 return Status; 2494 } 2495 2496 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2497 opts->flags = BTRFS_BALANCE_OPTS_ENABLED; 2498 2499 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) { 2500 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES; 2501 opts->profiles = args->profiles; 2502 } 2503 2504 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) { 2505 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2506 2507 opts->usage_start = 0; 2508 opts->usage_end = (uint8_t)args->usage; 2509 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) { 2510 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2511 2512 opts->usage_start = (uint8_t)args->usage_start; 2513 opts->usage_end = (uint8_t)args->usage_end; 2514 } 2515 2516 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) { 2517 opts->flags |= BTRFS_BALANCE_OPTS_DEVID; 2518 opts->devid = args->devid; 2519 } 2520 2521 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) { 2522 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE; 2523 opts->drange_start = args->drange_start; 2524 opts->drange_end = args->drange_end; 2525 } 2526 2527 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) { 2528 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE; 2529 opts->vrange_start = args->vrange_start; 2530 opts->vrange_end = args->vrange_end; 2531 } 2532 2533 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) { 2534 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2535 2536 opts->limit_start = 0; 2537 opts->limit_end = args->limit; 2538 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) { 2539 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2540 2541 opts->limit_start = args->limit_start; 2542 opts->limit_end = args->limit_end; 2543 } 2544 2545 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) { 2546 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES; 2547 2548 opts->stripes_start = (uint16_t)args->stripes_start; 2549 opts->stripes_end = (uint16_t)args->stripes_end; 2550 } 2551 2552 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) { 2553 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT; 2554 opts->convert = args->convert; 2555 2556 if (args->flags & BALANCE_ARGS_FLAGS_SOFT) 2557 opts->flags |= BTRFS_BALANCE_OPTS_SOFT; 2558 } 2559 } 2560 2561 static NTSTATUS remove_superblocks(device* dev) { 2562 NTSTATUS Status; 2563 superblock* sb; 2564 int i = 0; 2565 2566 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG); 2567 if (!sb) { 2568 ERR("out of memory\n"); 2569 return STATUS_INSUFFICIENT_RESOURCES; 2570 } 2571 2572 RtlZeroMemory(sb, sizeof(superblock)); 2573 2574 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) { 2575 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock)); 2576 2577 if (!NT_SUCCESS(Status)) { 2578 ExFreePool(sb); 2579 return Status; 2580 } 2581 2582 i++; 2583 } 2584 2585 ExFreePool(sb); 2586 2587 return STATUS_SUCCESS; 2588 } 2589 2590 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2591 KEY searchkey; 2592 traverse_ptr tp; 2593 NTSTATUS Status; 2594 LIST_ENTRY* le; 2595 volume_device_extension* vde; 2596 2597 if (Vcb->need_write) { 2598 Status = do_write(Vcb, NULL); 2599 2600 if (!NT_SUCCESS(Status)) 2601 ERR("do_write returned %08lx\n", Status); 2602 } else 2603 Status = STATUS_SUCCESS; 2604 2605 free_trees(Vcb); 2606 2607 if (!NT_SUCCESS(Status)) 2608 return Status; 2609 2610 // remove entry in chunk tree 2611 2612 searchkey.obj_id = 1; 2613 searchkey.obj_type = TYPE_DEV_ITEM; 2614 searchkey.offset = dev->devitem.dev_id; 2615 2616 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL); 2617 if (!NT_SUCCESS(Status)) { 2618 ERR("find_item returned %08lx\n", Status); 2619 return Status; 2620 } 2621 2622 if (!keycmp(searchkey, tp.item->key)) { 2623 Status = delete_tree_item(Vcb, &tp); 2624 2625 if (!NT_SUCCESS(Status)) { 2626 ERR("delete_tree_item returned %08lx\n", Status); 2627 return Status; 2628 } 2629 } 2630 2631 // remove stats entry in device tree 2632 2633 searchkey.obj_id = 0; 2634 searchkey.obj_type = TYPE_DEV_STATS; 2635 searchkey.offset = dev->devitem.dev_id; 2636 2637 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2638 if (!NT_SUCCESS(Status)) { 2639 ERR("find_item returned %08lx\n", Status); 2640 return Status; 2641 } 2642 2643 if (!keycmp(searchkey, tp.item->key)) { 2644 Status = delete_tree_item(Vcb, &tp); 2645 2646 if (!NT_SUCCESS(Status)) { 2647 ERR("delete_tree_item returned %08lx\n", Status); 2648 return Status; 2649 } 2650 } 2651 2652 // update superblock 2653 2654 Vcb->superblock.num_devices--; 2655 Vcb->superblock.total_bytes -= dev->devitem.num_bytes; 2656 Vcb->devices_loaded--; 2657 2658 RemoveEntryList(&dev->list_entry); 2659 2660 // flush 2661 2662 Status = do_write(Vcb, NULL); 2663 if (!NT_SUCCESS(Status)) 2664 ERR("do_write returned %08lx\n", Status); 2665 2666 free_trees(Vcb); 2667 2668 if (!NT_SUCCESS(Status)) 2669 return Status; 2670 2671 if (!dev->readonly && dev->devobj) { 2672 Status = remove_superblocks(dev); 2673 if (!NT_SUCCESS(Status)) 2674 WARN("remove_superblocks returned %08lx\n", Status); 2675 } 2676 2677 // remove entry in volume list 2678 2679 vde = Vcb->vde; 2680 2681 if (dev->devobj) { 2682 pdo_device_extension* pdode = vde->pdode; 2683 2684 ExAcquireResourceExclusiveLite(&pdode->child_lock, true); 2685 2686 le = pdode->children.Flink; 2687 while (le != &pdode->children) { 2688 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2689 2690 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) { 2691 PFILE_OBJECT FileObject; 2692 PDEVICE_OBJECT mountmgr; 2693 UNICODE_STRING mmdevpath; 2694 2695 pdode->children_loaded--; 2696 2697 if (vc->had_drive_letter) { // re-add entry to mountmgr 2698 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME); 2699 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr); 2700 if (!NT_SUCCESS(Status)) 2701 ERR("IoGetDeviceObjectPointer returned %08lx\n", Status); 2702 else { 2703 MOUNTDEV_NAME mdn; 2704 2705 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL); 2706 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW) 2707 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status); 2708 else { 2709 MOUNTDEV_NAME* mdn2; 2710 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength; 2711 2712 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG); 2713 if (!mdn2) 2714 ERR("out of memory\n"); 2715 else { 2716 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL); 2717 if (!NT_SUCCESS(Status)) 2718 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status); 2719 else { 2720 UNICODE_STRING name; 2721 2722 name.Buffer = mdn2->Name; 2723 name.Length = name.MaximumLength = mdn2->NameLength; 2724 2725 Status = mountmgr_add_drive_letter(mountmgr, &name); 2726 if (!NT_SUCCESS(Status)) 2727 WARN("mountmgr_add_drive_letter returned %08lx\n", Status); 2728 } 2729 2730 ExFreePool(mdn2); 2731 } 2732 } 2733 2734 ObDereferenceObject(FileObject); 2735 } 2736 } 2737 2738 ExFreePool(vc->pnp_name.Buffer); 2739 RemoveEntryList(&vc->list_entry); 2740 ExFreePool(vc); 2741 2742 ObDereferenceObject(vc->fileobj); 2743 2744 break; 2745 } 2746 2747 le = le->Flink; 2748 } 2749 2750 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) { 2751 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA; 2752 2753 le = pdode->children.Flink; 2754 while (le != &pdode->children) { 2755 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2756 2757 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) { 2758 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA; 2759 break; 2760 } 2761 2762 le = le->Flink; 2763 } 2764 } 2765 2766 pdode->num_children = Vcb->superblock.num_devices; 2767 2768 ExReleaseResourceLite(&pdode->child_lock); 2769 2770 // free dev 2771 2772 if (dev->trim && !dev->readonly && !Vcb->options.no_trim) 2773 trim_whole_device(dev); 2774 } 2775 2776 while (!IsListEmpty(&dev->space)) { 2777 LIST_ENTRY* le2 = RemoveHeadList(&dev->space); 2778 space* s = CONTAINING_RECORD(le2, space, list_entry); 2779 2780 ExFreePool(s); 2781 } 2782 2783 ExFreePool(dev); 2784 2785 if (Vcb->trim) { 2786 Vcb->trim = false; 2787 2788 le = Vcb->devices.Flink; 2789 while (le != &Vcb->devices) { 2790 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 2791 2792 if (dev2->trim) { 2793 Vcb->trim = true; 2794 break; 2795 } 2796 2797 le = le->Flink; 2798 } 2799 } 2800 2801 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 2802 2803 return STATUS_SUCCESS; 2804 } 2805 2806 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2807 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa; 2808 DEVICE_DATA_SET_RANGE* ranges; 2809 ULONG datalen, i; 2810 KEY searchkey; 2811 traverse_ptr tp; 2812 NTSTATUS Status; 2813 bool b; 2814 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there 2815 LIST_ENTRY* le; 2816 2817 dev->num_trim_entries = 0; 2818 2819 searchkey.obj_id = dev->devitem.dev_id; 2820 searchkey.obj_type = TYPE_DEV_EXTENT; 2821 searchkey.offset = 0; 2822 2823 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2824 if (!NT_SUCCESS(Status)) { 2825 ERR("find_item returned %08lx\n", Status); 2826 return; 2827 } 2828 2829 do { 2830 traverse_ptr next_tp; 2831 2832 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) { 2833 if (tp.item->size >= sizeof(DEV_EXTENT)) { 2834 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data; 2835 2836 if (tp.item->key.offset > lastoff) 2837 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff); 2838 2839 lastoff = tp.item->key.offset + de->length; 2840 } else { 2841 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT)); 2842 return; 2843 } 2844 } 2845 2846 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 2847 2848 if (b) { 2849 tp = next_tp; 2850 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type)) 2851 break; 2852 } 2853 } while (b); 2854 2855 if (lastoff < dev->devitem.num_bytes) 2856 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff); 2857 2858 if (dev->num_trim_entries == 0) 2859 return; 2860 2861 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE)); 2862 2863 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG); 2864 if (!dmdsa) { 2865 ERR("out of memory\n"); 2866 goto end; 2867 } 2868 2869 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES); 2870 dmdsa->Action = DeviceDsmAction_Trim; 2871 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED; 2872 dmdsa->ParameterBlockOffset = 0; 2873 dmdsa->ParameterBlockLength = 0; 2874 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)); 2875 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE); 2876 2877 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset); 2878 2879 i = 0; 2880 le = dev->trim_list.Flink; 2881 while (le != &dev->trim_list) { 2882 space* s = CONTAINING_RECORD(le, space, list_entry); 2883 2884 ranges[i].StartingOffset = s->address; 2885 ranges[i].LengthInBytes = s->size; 2886 i++; 2887 2888 le = le->Flink; 2889 } 2890 2891 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL); 2892 if (!NT_SUCCESS(Status)) 2893 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status); 2894 2895 ExFreePool(dmdsa); 2896 2897 end: 2898 while (!IsListEmpty(&dev->trim_list)) { 2899 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry); 2900 ExFreePool(s); 2901 } 2902 2903 dev->num_trim_entries = 0; 2904 } 2905 2906 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) { 2907 NTSTATUS Status; 2908 bool changed; 2909 LIST_ENTRY* le; 2910 chunk* rc; 2911 2912 // FIXME - allow with metadata chunks? 2913 2914 while (true) { 2915 rc = NULL; 2916 2917 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 2918 2919 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 2920 2921 // choose the least-used chunk we haven't looked at yet 2922 le = Vcb->chunks.Flink; 2923 while (le != &Vcb->chunks) { 2924 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 2925 2926 // FIXME - skip full-size chunks over e.g. 90% full? 2927 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used)) 2928 rc = c; 2929 2930 le = le->Flink; 2931 } 2932 2933 ExReleaseResourceLite(&Vcb->chunk_lock); 2934 2935 if (!rc) { 2936 ExReleaseResourceLite(&Vcb->tree_lock); 2937 break; 2938 } 2939 2940 if (rc->list_entry_balance.Flink) { 2941 RemoveEntryList(&rc->list_entry_balance); 2942 Vcb->balance.chunks_left--; 2943 } 2944 2945 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped 2946 rc->reloc = true; 2947 2948 ExReleaseResourceLite(&Vcb->tree_lock); 2949 2950 do { 2951 changed = false; 2952 2953 Status = balance_data_chunk(Vcb, rc, &changed); 2954 if (!NT_SUCCESS(Status)) { 2955 ERR("balance_data_chunk returned %08lx\n", Status); 2956 Vcb->balance.status = Status; 2957 rc->list_entry_balance.Flink = NULL; 2958 rc->reloc = false; 2959 return Status; 2960 } 2961 2962 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 2963 2964 if (Vcb->readonly) 2965 Vcb->balance.stopping = true; 2966 2967 if (Vcb->balance.stopping) 2968 return STATUS_SUCCESS; 2969 } while (changed); 2970 2971 rc->list_entry_balance.Flink = NULL; 2972 2973 rc->changed = true; 2974 rc->space_changed = true; 2975 rc->balance_num = Vcb->balance.balance_num; 2976 2977 Status = do_write(Vcb, NULL); 2978 if (!NT_SUCCESS(Status)) { 2979 ERR("do_write returned %08lx\n", Status); 2980 return Status; 2981 } 2982 2983 free_trees(Vcb); 2984 } 2985 2986 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 2987 2988 Status = alloc_chunk(Vcb, flags, &rc, true); 2989 2990 ExReleaseResourceLite(&Vcb->chunk_lock); 2991 2992 if (NT_SUCCESS(Status)) { 2993 *newchunk = rc; 2994 return Status; 2995 } else { 2996 ERR("alloc_chunk returned %08lx\n", Status); 2997 return Status; 2998 } 2999 } 3000 3001 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) { 3002 LIST_ENTRY* le; 3003 3004 while (!IsListEmpty(&dev->space)) { 3005 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry); 3006 3007 ExFreePool(s); 3008 } 3009 3010 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device. 3011 3012 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL); 3013 3014 le = Vcb->chunks.Flink; 3015 while (le != &Vcb->chunks) { 3016 uint16_t n; 3017 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3018 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 3019 3020 for (n = 0; n < c->chunk_item->num_stripes; n++) { 3021 uint64_t stripe_size = 0; 3022 3023 if (cis[n].dev_id == dev->devitem.dev_id) { 3024 if (stripe_size == 0) { 3025 uint16_t factor; 3026 3027 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 3028 factor = c->chunk_item->num_stripes; 3029 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 3030 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 3031 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 3032 factor = c->chunk_item->num_stripes - 1; 3033 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 3034 factor = c->chunk_item->num_stripes - 2; 3035 else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4 3036 factor = 1; 3037 3038 stripe_size = c->chunk_item->size / factor; 3039 } 3040 3041 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL); 3042 } 3043 } 3044 3045 le = le->Flink; 3046 } 3047 3048 return STATUS_SUCCESS; 3049 } 3050 3051 _Function_class_(KSTART_ROUTINE) 3052 void __stdcall balance_thread(void* context) { 3053 device_extension* Vcb = (device_extension*)context; 3054 LIST_ENTRY chunks; 3055 LIST_ENTRY* le; 3056 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0; 3057 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0; 3058 NTSTATUS Status; 3059 3060 Vcb->balance.balance_num++; 3061 3062 Vcb->balance.stopping = false; 3063 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false); 3064 3065 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3066 old_data_flags = Vcb->data_flags; 3067 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert); 3068 3069 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3070 } 3071 3072 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3073 old_metadata_flags = Vcb->metadata_flags; 3074 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert); 3075 } 3076 3077 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3078 old_system_flags = Vcb->system_flags; 3079 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert); 3080 } 3081 3082 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) { 3083 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3084 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3085 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3086 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3087 } 3088 3089 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0; 3090 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0; 3091 3092 InitializeListHead(&chunks); 3093 3094 // FIXME - what are we supposed to do with limit_start? 3095 3096 if (!Vcb->readonly) { 3097 if (!Vcb->balance.removing && !Vcb->balance.shrinking) { 3098 Status = add_balance_item(Vcb); 3099 if (!NT_SUCCESS(Status)) { 3100 ERR("add_balance_item returned %08lx\n", Status); 3101 Vcb->balance.status = Status; 3102 goto end; 3103 } 3104 } else { 3105 if (Vcb->need_write) { 3106 Status = do_write(Vcb, NULL); 3107 3108 free_trees(Vcb); 3109 3110 if (!NT_SUCCESS(Status)) { 3111 ERR("do_write returned %08lx\n", Status); 3112 Vcb->balance.status = Status; 3113 goto end; 3114 } 3115 } 3116 } 3117 } 3118 3119 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3120 3121 if (Vcb->balance.stopping) 3122 goto end; 3123 3124 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3125 3126 le = Vcb->chunks.Flink; 3127 while (le != &Vcb->chunks) { 3128 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3129 uint8_t sort; 3130 3131 acquire_chunk_lock(c, Vcb); 3132 3133 if (c->chunk_item->type & BLOCK_FLAG_DATA) 3134 sort = BALANCE_OPTS_DATA; 3135 else if (c->chunk_item->type & BLOCK_FLAG_METADATA) 3136 sort = BALANCE_OPTS_METADATA; 3137 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM) 3138 sort = BALANCE_OPTS_SYSTEM; 3139 else { 3140 ERR("unexpected chunk type %I64x\n", c->chunk_item->type); 3141 release_chunk_lock(c, Vcb); 3142 break; 3143 } 3144 3145 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) && 3146 should_balance_chunk(Vcb, sort, c)) { 3147 InsertTailList(&chunks, &c->list_entry_balance); 3148 3149 num_chunks[sort]++; 3150 Vcb->balance.total_chunks++; 3151 Vcb->balance.chunks_left++; 3152 } else if (sort == BALANCE_OPTS_METADATA) 3153 okay_metadata_chunks++; 3154 else if (sort == BALANCE_OPTS_DATA) 3155 okay_data_chunks++; 3156 else if (sort == BALANCE_OPTS_SYSTEM) 3157 okay_system_chunks++; 3158 3159 if (!c->cache_loaded) { 3160 Status = load_cache_chunk(Vcb, c, NULL); 3161 3162 if (!NT_SUCCESS(Status)) { 3163 ERR("load_cache_chunk returned %08lx\n", Status); 3164 Vcb->balance.status = Status; 3165 release_chunk_lock(c, Vcb); 3166 ExReleaseResourceLite(&Vcb->chunk_lock); 3167 goto end; 3168 } 3169 } 3170 3171 release_chunk_lock(c, Vcb); 3172 3173 le = le->Flink; 3174 } 3175 3176 ExReleaseResourceLite(&Vcb->chunk_lock); 3177 3178 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up 3179 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) { 3180 bool consolidated = false; 3181 chunk* c; 3182 3183 if (okay_metadata_chunks == 0) { 3184 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3185 3186 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true); 3187 if (NT_SUCCESS(Status)) 3188 c->balance_num = Vcb->balance.balance_num; 3189 else if (Status != STATUS_DISK_FULL || consolidated) { 3190 ERR("alloc_chunk returned %08lx\n", Status); 3191 ExReleaseResourceLite(&Vcb->chunk_lock); 3192 Vcb->balance.status = Status; 3193 goto end; 3194 } 3195 3196 ExReleaseResourceLite(&Vcb->chunk_lock); 3197 3198 if (Status == STATUS_DISK_FULL) { 3199 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c); 3200 if (!NT_SUCCESS(Status)) { 3201 ERR("try_consolidation returned %08lx\n", Status); 3202 Vcb->balance.status = Status; 3203 goto end; 3204 } else 3205 c->balance_num = Vcb->balance.balance_num; 3206 3207 consolidated = true; 3208 3209 if (Vcb->balance.stopping) 3210 goto end; 3211 } 3212 } 3213 3214 if (okay_data_chunks == 0) { 3215 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3216 3217 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true); 3218 if (NT_SUCCESS(Status)) 3219 c->balance_num = Vcb->balance.balance_num; 3220 else if (Status != STATUS_DISK_FULL || consolidated) { 3221 ERR("alloc_chunk returned %08lx\n", Status); 3222 ExReleaseResourceLite(&Vcb->chunk_lock); 3223 Vcb->balance.status = Status; 3224 goto end; 3225 } 3226 3227 ExReleaseResourceLite(&Vcb->chunk_lock); 3228 3229 if (Status == STATUS_DISK_FULL) { 3230 Status = try_consolidation(Vcb, Vcb->data_flags, &c); 3231 if (!NT_SUCCESS(Status)) { 3232 ERR("try_consolidation returned %08lx\n", Status); 3233 Vcb->balance.status = Status; 3234 goto end; 3235 } else 3236 c->balance_num = Vcb->balance.balance_num; 3237 3238 consolidated = true; 3239 3240 if (Vcb->balance.stopping) 3241 goto end; 3242 } 3243 } 3244 3245 if (okay_system_chunks == 0) { 3246 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3247 3248 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true); 3249 if (NT_SUCCESS(Status)) 3250 c->balance_num = Vcb->balance.balance_num; 3251 else if (Status != STATUS_DISK_FULL || consolidated) { 3252 ERR("alloc_chunk returned %08lx\n", Status); 3253 ExReleaseResourceLite(&Vcb->chunk_lock); 3254 Vcb->balance.status = Status; 3255 goto end; 3256 } 3257 3258 ExReleaseResourceLite(&Vcb->chunk_lock); 3259 3260 if (Status == STATUS_DISK_FULL) { 3261 Status = try_consolidation(Vcb, Vcb->system_flags, &c); 3262 if (!NT_SUCCESS(Status)) { 3263 ERR("try_consolidation returned %08lx\n", Status); 3264 Vcb->balance.status = Status; 3265 goto end; 3266 } else 3267 c->balance_num = Vcb->balance.balance_num; 3268 3269 consolidated = true; 3270 3271 if (Vcb->balance.stopping) 3272 goto end; 3273 } 3274 } 3275 } 3276 3277 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3278 3279 le = chunks.Flink; 3280 while (le != &chunks) { 3281 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3282 3283 c->reloc = true; 3284 3285 le = le->Flink; 3286 } 3287 3288 ExReleaseResourceLite(&Vcb->chunk_lock); 3289 3290 // do data chunks before metadata 3291 le = chunks.Flink; 3292 while (le != &chunks) { 3293 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3294 LIST_ENTRY* le2 = le->Flink; 3295 3296 if (c->chunk_item->type & BLOCK_FLAG_DATA) { 3297 bool changed; 3298 3299 do { 3300 changed = false; 3301 3302 Status = balance_data_chunk(Vcb, c, &changed); 3303 if (!NT_SUCCESS(Status)) { 3304 ERR("balance_data_chunk returned %08lx\n", Status); 3305 Vcb->balance.status = Status; 3306 goto end; 3307 } 3308 3309 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3310 3311 if (Vcb->readonly) 3312 Vcb->balance.stopping = true; 3313 3314 if (Vcb->balance.stopping) 3315 break; 3316 } while (changed); 3317 3318 c->changed = true; 3319 c->space_changed = true; 3320 } 3321 3322 if (Vcb->balance.stopping) 3323 goto end; 3324 3325 if (c->chunk_item->type & BLOCK_FLAG_DATA && 3326 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) { 3327 RemoveEntryList(&c->list_entry_balance); 3328 c->list_entry_balance.Flink = NULL; 3329 3330 Vcb->balance.chunks_left--; 3331 } 3332 3333 le = le2; 3334 } 3335 3336 // do metadata chunks 3337 while (!IsListEmpty(&chunks)) { 3338 chunk* c; 3339 bool changed; 3340 3341 le = RemoveHeadList(&chunks); 3342 c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3343 3344 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) { 3345 do { 3346 Status = balance_metadata_chunk(Vcb, c, &changed); 3347 if (!NT_SUCCESS(Status)) { 3348 ERR("balance_metadata_chunk returned %08lx\n", Status); 3349 Vcb->balance.status = Status; 3350 goto end; 3351 } 3352 3353 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3354 3355 if (Vcb->readonly) 3356 Vcb->balance.stopping = true; 3357 3358 if (Vcb->balance.stopping) 3359 break; 3360 } while (changed); 3361 3362 c->changed = true; 3363 c->space_changed = true; 3364 } 3365 3366 if (Vcb->balance.stopping) 3367 break; 3368 3369 c->list_entry_balance.Flink = NULL; 3370 3371 Vcb->balance.chunks_left--; 3372 } 3373 3374 end: 3375 if (!Vcb->readonly) { 3376 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3377 le = chunks.Flink; 3378 while (le != &chunks) { 3379 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3380 c->reloc = false; 3381 3382 le = le->Flink; 3383 c->list_entry_balance.Flink = NULL; 3384 } 3385 3386 if (old_data_flags != 0) 3387 Vcb->data_flags = old_data_flags; 3388 3389 if (old_metadata_flags != 0) 3390 Vcb->metadata_flags = old_metadata_flags; 3391 3392 if (old_system_flags != 0) 3393 Vcb->system_flags = old_system_flags; 3394 } 3395 3396 if (Vcb->balance.removing) { 3397 device* dev = NULL; 3398 3399 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3400 3401 le = Vcb->devices.Flink; 3402 while (le != &Vcb->devices) { 3403 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3404 3405 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3406 dev = dev2; 3407 break; 3408 } 3409 3410 le = le->Flink; 3411 } 3412 3413 if (dev) { 3414 if (Vcb->balance.chunks_left == 0) { 3415 Status = finish_removing_device(Vcb, dev); 3416 3417 if (!NT_SUCCESS(Status)) { 3418 ERR("finish_removing_device returned %08lx\n", Status); 3419 dev->reloc = false; 3420 } 3421 } else 3422 dev->reloc = false; 3423 } 3424 3425 ExReleaseResourceLite(&Vcb->tree_lock); 3426 } else if (Vcb->balance.shrinking) { 3427 device* dev = NULL; 3428 3429 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3430 3431 le = Vcb->devices.Flink; 3432 while (le != &Vcb->devices) { 3433 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3434 3435 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3436 dev = dev2; 3437 break; 3438 } 3439 3440 le = le->Flink; 3441 } 3442 3443 if (!dev) { 3444 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid); 3445 Vcb->balance.status = STATUS_INTERNAL_ERROR; 3446 } 3447 3448 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3449 if (dev) { 3450 Status = regenerate_space_list(Vcb, dev); 3451 if (!NT_SUCCESS(Status)) 3452 WARN("regenerate_space_list returned %08lx\n", Status); 3453 } 3454 } else { 3455 uint64_t old_size; 3456 3457 old_size = dev->devitem.num_bytes; 3458 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start; 3459 3460 Status = update_dev_item(Vcb, dev, NULL); 3461 if (!NT_SUCCESS(Status)) { 3462 ERR("update_dev_item returned %08lx\n", Status); 3463 dev->devitem.num_bytes = old_size; 3464 Vcb->balance.status = Status; 3465 3466 Status = regenerate_space_list(Vcb, dev); 3467 if (!NT_SUCCESS(Status)) 3468 WARN("regenerate_space_list returned %08lx\n", Status); 3469 } else { 3470 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes; 3471 3472 Status = do_write(Vcb, NULL); 3473 if (!NT_SUCCESS(Status)) 3474 ERR("do_write returned %08lx\n", Status); 3475 3476 free_trees(Vcb); 3477 } 3478 } 3479 3480 ExReleaseResourceLite(&Vcb->tree_lock); 3481 3482 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status)) 3483 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3484 } else { 3485 Status = remove_balance_item(Vcb); 3486 if (!NT_SUCCESS(Status)) { 3487 ERR("remove_balance_item returned %08lx\n", Status); 3488 goto end; 3489 } 3490 } 3491 3492 if (Vcb->trim && !Vcb->options.no_trim) { 3493 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3494 3495 le = Vcb->devices.Flink; 3496 while (le != &Vcb->devices) { 3497 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3498 3499 if (dev2->devobj && !dev2->readonly && dev2->trim) 3500 trim_unalloc_space(Vcb, dev2); 3501 3502 le = le->Flink; 3503 } 3504 3505 ExReleaseResourceLite(&Vcb->tree_lock); 3506 } 3507 } 3508 3509 ZwClose(Vcb->balance.thread); 3510 Vcb->balance.thread = NULL; 3511 3512 KeSetEvent(&Vcb->balance.finished, 0, false); 3513 } 3514 3515 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3516 NTSTATUS Status; 3517 btrfs_start_balance* bsb = (btrfs_start_balance*)data; 3518 OBJECT_ATTRIBUTES oa; 3519 uint8_t i; 3520 3521 if (length < sizeof(btrfs_start_balance) || !data) 3522 return STATUS_INVALID_PARAMETER; 3523 3524 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3525 return STATUS_PRIVILEGE_NOT_HELD; 3526 3527 if (Vcb->locked) { 3528 WARN("cannot start balance while locked\n"); 3529 return STATUS_DEVICE_NOT_READY; 3530 } 3531 3532 if (Vcb->scrub.thread) { 3533 WARN("cannot start balance while scrub running\n"); 3534 return STATUS_DEVICE_NOT_READY; 3535 } 3536 3537 if (Vcb->balance.thread) { 3538 WARN("balance already running\n"); 3539 return STATUS_DEVICE_NOT_READY; 3540 } 3541 3542 if (Vcb->readonly) 3543 return STATUS_MEDIA_WRITE_PROTECTED; 3544 3545 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3546 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3547 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED)) 3548 return STATUS_SUCCESS; 3549 3550 for (i = 0; i < 3; i++) { 3551 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3552 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) { 3553 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 | 3554 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 | 3555 BLOCK_FLAG_RAID1C4; 3556 3557 if (bsb->opts[i].profiles == 0) 3558 return STATUS_INVALID_PARAMETER; 3559 } 3560 3561 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) { 3562 if (bsb->opts[i].devid == 0) 3563 return STATUS_INVALID_PARAMETER; 3564 } 3565 3566 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) { 3567 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end) 3568 return STATUS_INVALID_PARAMETER; 3569 } 3570 3571 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) { 3572 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end) 3573 return STATUS_INVALID_PARAMETER; 3574 } 3575 3576 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) { 3577 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start); 3578 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end); 3579 3580 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end) 3581 return STATUS_INVALID_PARAMETER; 3582 } 3583 3584 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) { 3585 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start); 3586 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end); 3587 3588 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3589 return STATUS_INVALID_PARAMETER; 3590 } 3591 3592 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) { 3593 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start); 3594 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end); 3595 3596 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3597 return STATUS_INVALID_PARAMETER; 3598 } 3599 3600 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3601 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 && 3602 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 && 3603 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 && 3604 bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 && 3605 bsb->opts[i].convert != BLOCK_FLAG_RAID1C4) 3606 return STATUS_INVALID_PARAMETER; 3607 } 3608 } 3609 } 3610 3611 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3612 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3613 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3614 3615 Vcb->balance.paused = false; 3616 Vcb->balance.removing = false; 3617 Vcb->balance.shrinking = false; 3618 Vcb->balance.status = STATUS_SUCCESS; 3619 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3620 3621 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3622 3623 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3624 if (!NT_SUCCESS(Status)) { 3625 ERR("PsCreateSystemThread returned %08lx\n", Status); 3626 return Status; 3627 } 3628 3629 return STATUS_SUCCESS; 3630 } 3631 3632 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) { 3633 KEY searchkey; 3634 traverse_ptr tp; 3635 NTSTATUS Status; 3636 BALANCE_ITEM* bi; 3637 OBJECT_ATTRIBUTES oa; 3638 int i; 3639 3640 searchkey.obj_id = BALANCE_ITEM_ID; 3641 searchkey.obj_type = TYPE_TEMP_ITEM; 3642 searchkey.offset = 0; 3643 3644 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 3645 if (!NT_SUCCESS(Status)) { 3646 ERR("find_item returned %08lx\n", Status); 3647 return Status; 3648 } 3649 3650 if (keycmp(tp.item->key, searchkey)) { 3651 TRACE("no balance item found\n"); 3652 return STATUS_NOT_FOUND; 3653 } 3654 3655 if (tp.item->size < sizeof(BALANCE_ITEM)) { 3656 WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, 3657 tp.item->size, sizeof(BALANCE_ITEM)); 3658 return STATUS_INTERNAL_ERROR; 3659 } 3660 3661 bi = (BALANCE_ITEM*)tp.item->data; 3662 3663 if (bi->flags & BALANCE_FLAGS_DATA) 3664 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 3665 3666 if (bi->flags & BALANCE_FLAGS_METADATA) 3667 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 3668 3669 if (bi->flags & BALANCE_FLAGS_SYSTEM) 3670 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 3671 3672 // do the heuristics that Linux driver does 3673 3674 for (i = 0; i < 3; i++) { 3675 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3676 // if converting, don't redo chunks already done 3677 3678 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3679 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT; 3680 3681 // don't balance chunks more than 90% filled - presumably these 3682 // have already been done 3683 3684 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) && 3685 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3686 ) { 3687 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE; 3688 Vcb->balance.opts[i].usage_start = 0; 3689 Vcb->balance.opts[i].usage_end = 90; 3690 } 3691 } 3692 } 3693 3694 if (Vcb->readonly || Vcb->options.skip_balance) 3695 Vcb->balance.paused = true; 3696 else 3697 Vcb->balance.paused = false; 3698 3699 Vcb->balance.removing = false; 3700 Vcb->balance.shrinking = false; 3701 Vcb->balance.status = STATUS_SUCCESS; 3702 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3703 3704 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3705 3706 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3707 if (!NT_SUCCESS(Status)) { 3708 ERR("PsCreateSystemThread returned %08lx\n", Status); 3709 return Status; 3710 } 3711 3712 return STATUS_SUCCESS; 3713 } 3714 3715 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) { 3716 btrfs_query_balance* bqb = (btrfs_query_balance*)data; 3717 3718 if (length < sizeof(btrfs_query_balance) || !data) 3719 return STATUS_INVALID_PARAMETER; 3720 3721 if (!Vcb->balance.thread) { 3722 bqb->status = BTRFS_BALANCE_STOPPED; 3723 3724 if (!NT_SUCCESS(Vcb->balance.status)) { 3725 bqb->status |= BTRFS_BALANCE_ERROR; 3726 bqb->error = Vcb->balance.status; 3727 } 3728 3729 return STATUS_SUCCESS; 3730 } 3731 3732 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING; 3733 3734 if (Vcb->balance.removing) 3735 bqb->status |= BTRFS_BALANCE_REMOVAL; 3736 3737 if (Vcb->balance.shrinking) 3738 bqb->status |= BTRFS_BALANCE_SHRINKING; 3739 3740 if (!NT_SUCCESS(Vcb->balance.status)) 3741 bqb->status |= BTRFS_BALANCE_ERROR; 3742 3743 bqb->chunks_left = Vcb->balance.chunks_left; 3744 bqb->total_chunks = Vcb->balance.total_chunks; 3745 bqb->error = Vcb->balance.status; 3746 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3747 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3748 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3749 3750 return STATUS_SUCCESS; 3751 } 3752 3753 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3754 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3755 return STATUS_PRIVILEGE_NOT_HELD; 3756 3757 if (!Vcb->balance.thread) 3758 return STATUS_DEVICE_NOT_READY; 3759 3760 if (Vcb->balance.paused) 3761 return STATUS_DEVICE_NOT_READY; 3762 3763 Vcb->balance.paused = true; 3764 KeClearEvent(&Vcb->balance.event); 3765 3766 return STATUS_SUCCESS; 3767 } 3768 3769 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3770 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3771 return STATUS_PRIVILEGE_NOT_HELD; 3772 3773 if (!Vcb->balance.thread) 3774 return STATUS_DEVICE_NOT_READY; 3775 3776 if (!Vcb->balance.paused) 3777 return STATUS_DEVICE_NOT_READY; 3778 3779 if (Vcb->readonly) 3780 return STATUS_MEDIA_WRITE_PROTECTED; 3781 3782 Vcb->balance.paused = false; 3783 KeSetEvent(&Vcb->balance.event, 0, false); 3784 3785 return STATUS_SUCCESS; 3786 } 3787 3788 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3789 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3790 return STATUS_PRIVILEGE_NOT_HELD; 3791 3792 if (!Vcb->balance.thread) 3793 return STATUS_DEVICE_NOT_READY; 3794 3795 Vcb->balance.paused = false; 3796 Vcb->balance.stopping = true; 3797 Vcb->balance.status = STATUS_SUCCESS; 3798 KeSetEvent(&Vcb->balance.event, 0, false); 3799 3800 return STATUS_SUCCESS; 3801 } 3802 3803 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3804 uint64_t devid; 3805 LIST_ENTRY* le; 3806 device* dev = NULL; 3807 NTSTATUS Status; 3808 int i; 3809 uint64_t num_rw_devices; 3810 OBJECT_ATTRIBUTES oa; 3811 3812 TRACE("(%p, %p, %lx)\n", Vcb, data, length); 3813 3814 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3815 return STATUS_PRIVILEGE_NOT_HELD; 3816 3817 if (length < sizeof(uint64_t)) 3818 return STATUS_INVALID_PARAMETER; 3819 3820 devid = *(uint64_t*)data; 3821 3822 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 3823 3824 if (Vcb->readonly) { 3825 ExReleaseResourceLite(&Vcb->tree_lock); 3826 return STATUS_MEDIA_WRITE_PROTECTED; 3827 } 3828 3829 num_rw_devices = 0; 3830 3831 le = Vcb->devices.Flink; 3832 while (le != &Vcb->devices) { 3833 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3834 3835 if (dev2->devitem.dev_id == devid) 3836 dev = dev2; 3837 3838 if (!dev2->readonly) 3839 num_rw_devices++; 3840 3841 le = le->Flink; 3842 } 3843 3844 if (!dev) { 3845 ExReleaseResourceLite(&Vcb->tree_lock); 3846 WARN("device %I64x not found\n", devid); 3847 return STATUS_NOT_FOUND; 3848 } 3849 3850 if (!dev->readonly) { 3851 if (num_rw_devices == 1) { 3852 ExReleaseResourceLite(&Vcb->tree_lock); 3853 WARN("not removing last non-readonly device\n"); 3854 return STATUS_INVALID_PARAMETER; 3855 } 3856 3857 if (num_rw_devices == 4 && 3858 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) || 3859 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) || 3860 (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4) 3861 ) 3862 ) { 3863 ExReleaseResourceLite(&Vcb->tree_lock); 3864 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n"); 3865 return STATUS_CANNOT_DELETE; 3866 } 3867 3868 if (num_rw_devices == 3 && 3869 ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) || 3870 (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3)) 3871 ) { 3872 ExReleaseResourceLite(&Vcb->tree_lock); 3873 ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n"); 3874 return STATUS_CANNOT_DELETE; 3875 } 3876 3877 if (num_rw_devices == 2 && 3878 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) || 3879 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1)) 3880 ) { 3881 ExReleaseResourceLite(&Vcb->tree_lock); 3882 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n"); 3883 return STATUS_CANNOT_DELETE; 3884 } 3885 } 3886 3887 ExReleaseResourceLite(&Vcb->tree_lock); 3888 3889 if (Vcb->balance.thread) { 3890 WARN("balance already running\n"); 3891 return STATUS_DEVICE_NOT_READY; 3892 } 3893 3894 dev->reloc = true; 3895 3896 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3); 3897 3898 for (i = 0; i < 3; i++) { 3899 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID; 3900 Vcb->balance.opts[i].devid = devid; 3901 } 3902 3903 Vcb->balance.paused = false; 3904 Vcb->balance.removing = true; 3905 Vcb->balance.shrinking = false; 3906 Vcb->balance.status = STATUS_SUCCESS; 3907 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3908 3909 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL); 3910 3911 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb); 3912 if (!NT_SUCCESS(Status)) { 3913 ERR("PsCreateSystemThread returned %08lx\n", Status); 3914 dev->reloc = false; 3915 return Status; 3916 } 3917 3918 return STATUS_SUCCESS; 3919 } 3920