1 /* Copyright (c) Mark Harmstone 2016-17 2 * 3 * This file is part of WinBtrfs. 4 * 5 * WinBtrfs is free software: you can redistribute it and/or modify 6 * it under the terms of the GNU Lesser General Public Licence as published by 7 * the Free Software Foundation, either version 3 of the Licence, or 8 * (at your option) any later version. 9 * 10 * WinBtrfs is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public Licence for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public Licence 16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include "btrfs_drv.h" 19 #include "btrfsioctl.h" 20 #include <ntddstor.h> 21 22 typedef struct { 23 uint64_t address; 24 uint64_t new_address; 25 tree_header* data; 26 EXTENT_ITEM* ei; 27 tree* t; 28 bool system; 29 LIST_ENTRY refs; 30 LIST_ENTRY list_entry; 31 } metadata_reloc; 32 33 typedef struct { 34 uint8_t type; 35 uint64_t hash; 36 37 union { 38 TREE_BLOCK_REF tbr; 39 SHARED_BLOCK_REF sbr; 40 }; 41 42 metadata_reloc* parent; 43 bool top; 44 LIST_ENTRY list_entry; 45 } metadata_reloc_ref; 46 47 typedef struct { 48 uint64_t address; 49 uint64_t size; 50 uint64_t new_address; 51 chunk* newchunk; 52 EXTENT_ITEM* ei; 53 LIST_ENTRY refs; 54 LIST_ENTRY list_entry; 55 } data_reloc; 56 57 typedef struct { 58 uint8_t type; 59 uint64_t hash; 60 61 union { 62 EXTENT_DATA_REF edr; 63 SHARED_DATA_REF sdr; 64 }; 65 66 metadata_reloc* parent; 67 LIST_ENTRY list_entry; 68 } data_reloc_ref; 69 70 #ifndef _MSC_VER // not in mingw yet 71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000 72 #endif 73 74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time 75 76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp, 77 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) { 78 NTSTATUS Status; 79 metadata_reloc* mr; 80 EXTENT_ITEM* ei; 81 uint16_t len; 82 uint64_t inline_rc; 83 uint8_t* ptr; 84 85 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG); 86 if (!mr) { 87 ERR("out of memory\n"); 88 return STATUS_INSUFFICIENT_RESOURCES; 89 } 90 91 mr->address = tp->item->key.obj_id; 92 mr->data = NULL; 93 mr->ei = (EXTENT_ITEM*)tp->item->data; 94 mr->system = false; 95 InitializeListHead(&mr->refs); 96 97 Status = delete_tree_item(Vcb, tp); 98 if (!NT_SUCCESS(Status)) { 99 ERR("delete_tree_item returned %08x\n", Status); 100 ExFreePool(mr); 101 return Status; 102 } 103 104 if (!c) 105 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 106 107 if (c) { 108 acquire_chunk_lock(c, Vcb); 109 110 c->used -= Vcb->superblock.node_size; 111 112 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback); 113 114 release_chunk_lock(c, Vcb); 115 } 116 117 ei = (EXTENT_ITEM*)tp->item->data; 118 inline_rc = 0; 119 120 len = tp->item->size - sizeof(EXTENT_ITEM); 121 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 122 if (!skinny) { 123 len -= sizeof(EXTENT_ITEM2); 124 ptr += sizeof(EXTENT_ITEM2); 125 } 126 127 while (len > 0) { 128 uint8_t secttype = *ptr; 129 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0); 130 metadata_reloc_ref* ref; 131 132 len--; 133 134 if (sectlen > len) { 135 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 136 return STATUS_INTERNAL_ERROR; 137 } 138 139 if (sectlen == 0) { 140 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 141 return STATUS_INTERNAL_ERROR; 142 } 143 144 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 145 if (!ref) { 146 ERR("out of memory\n"); 147 return STATUS_INSUFFICIENT_RESOURCES; 148 } 149 150 if (secttype == TYPE_TREE_BLOCK_REF) { 151 ref->type = TYPE_TREE_BLOCK_REF; 152 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF)); 153 inline_rc++; 154 } else if (secttype == TYPE_SHARED_BLOCK_REF) { 155 ref->type = TYPE_SHARED_BLOCK_REF; 156 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF)); 157 inline_rc++; 158 } else { 159 ERR("unexpected tree type %x\n", secttype); 160 ExFreePool(ref); 161 return STATUS_INTERNAL_ERROR; 162 } 163 164 ref->parent = NULL; 165 ref->top = false; 166 InsertTailList(&mr->refs, &ref->list_entry); 167 168 len -= sectlen; 169 ptr += sizeof(uint8_t) + sectlen; 170 } 171 172 if (inline_rc < ei->refcount) { // look for non-inline entries 173 traverse_ptr tp2 = *tp, next_tp; 174 175 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 176 tp2 = next_tp; 177 178 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 179 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) { 180 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 181 if (!ref) { 182 ERR("out of memory\n"); 183 return STATUS_INSUFFICIENT_RESOURCES; 184 } 185 186 ref->type = TYPE_TREE_BLOCK_REF; 187 ref->tbr.offset = tp2.item->key.offset; 188 ref->parent = NULL; 189 ref->top = false; 190 InsertTailList(&mr->refs, &ref->list_entry); 191 192 Status = delete_tree_item(Vcb, &tp2); 193 if (!NT_SUCCESS(Status)) { 194 ERR("delete_tree_item returned %08x\n", Status); 195 return Status; 196 } 197 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) { 198 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG); 199 if (!ref) { 200 ERR("out of memory\n"); 201 return STATUS_INSUFFICIENT_RESOURCES; 202 } 203 204 ref->type = TYPE_SHARED_BLOCK_REF; 205 ref->sbr.offset = tp2.item->key.offset; 206 ref->parent = NULL; 207 ref->top = false; 208 InsertTailList(&mr->refs, &ref->list_entry); 209 210 Status = delete_tree_item(Vcb, &tp2); 211 if (!NT_SUCCESS(Status)) { 212 ERR("delete_tree_item returned %08x\n", Status); 213 return Status; 214 } 215 } 216 } else 217 break; 218 } 219 } 220 221 InsertTailList(items, &mr->list_entry); 222 223 if (mr2) 224 *mr2 = mr; 225 226 return STATUS_SUCCESS; 227 } 228 229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 230 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) { 231 LIST_ENTRY* le; 232 KEY searchkey; 233 traverse_ptr tp; 234 bool skinny = false; 235 NTSTATUS Status; 236 237 le = items->Flink; 238 while (le != items) { 239 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 240 241 if (mr->address == address) { 242 *mr2 = mr; 243 return STATUS_SUCCESS; 244 } 245 246 le = le->Flink; 247 } 248 249 searchkey.obj_id = address; 250 searchkey.obj_type = TYPE_METADATA_ITEM; 251 searchkey.offset = 0xffffffffffffffff; 252 253 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 254 if (!NT_SUCCESS(Status)) { 255 ERR("find_item returned %08x\n", Status); 256 return Status; 257 } 258 259 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) 260 skinny = true; 261 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 262 tp.item->size >= sizeof(EXTENT_ITEM)) { 263 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 264 265 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) { 266 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address); 267 return STATUS_INTERNAL_ERROR; 268 } 269 } else { 270 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address); 271 return STATUS_INTERNAL_ERROR; 272 } 273 274 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback); 275 if (!NT_SUCCESS(Status)) { 276 ERR("add_metadata_reloc returned %08x\n", Status); 277 return Status; 278 } 279 280 return STATUS_SUCCESS; 281 } 282 283 static void sort_metadata_reloc_refs(metadata_reloc* mr) { 284 LIST_ENTRY newlist, *le; 285 286 if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items 287 return; 288 289 // insertion sort 290 291 InitializeListHead(&newlist); 292 293 while (!IsListEmpty(&mr->refs)) { 294 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 295 bool inserted = false; 296 297 if (ref->type == TYPE_TREE_BLOCK_REF) 298 ref->hash = ref->tbr.offset; 299 else if (ref->type == TYPE_SHARED_BLOCK_REF) 300 ref->hash = ref->parent->new_address; 301 302 le = newlist.Flink; 303 while (le != &newlist) { 304 metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 305 306 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 307 InsertHeadList(le->Blink, &ref->list_entry); 308 inserted = true; 309 break; 310 } 311 312 le = le->Flink; 313 } 314 315 if (!inserted) 316 InsertTailList(&newlist, &ref->list_entry); 317 } 318 319 newlist.Flink->Blink = &mr->refs; 320 newlist.Blink->Flink = &mr->refs; 321 mr->refs.Flink = newlist.Flink; 322 mr->refs.Blink = newlist.Blink; 323 } 324 325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) { 326 NTSTATUS Status; 327 LIST_ENTRY* le; 328 uint64_t rc = 0; 329 uint16_t inline_len; 330 bool all_inline = true; 331 metadata_reloc_ref* first_noninline = NULL; 332 EXTENT_ITEM* ei; 333 uint8_t* ptr; 334 335 inline_len = sizeof(EXTENT_ITEM); 336 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) 337 inline_len += sizeof(EXTENT_ITEM2); 338 339 sort_metadata_reloc_refs(mr); 340 341 le = mr->refs.Flink; 342 while (le != &mr->refs) { 343 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 344 uint16_t extlen = 0; 345 346 rc++; 347 348 if (ref->type == TYPE_TREE_BLOCK_REF) 349 extlen += sizeof(TREE_BLOCK_REF); 350 else if (ref->type == TYPE_SHARED_BLOCK_REF) 351 extlen += sizeof(SHARED_BLOCK_REF); 352 353 if (all_inline) { 354 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 355 all_inline = false; 356 first_noninline = ref; 357 } else 358 inline_len += extlen + 1; 359 } 360 361 le = le->Flink; 362 } 363 364 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 365 if (!ei) { 366 ERR("out of memory\n"); 367 return STATUS_INSUFFICIENT_RESOURCES; 368 } 369 370 ei->refcount = rc; 371 ei->generation = mr->ei->generation; 372 ei->flags = mr->ei->flags; 373 ptr = (uint8_t*)&ei[1]; 374 375 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) { 376 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr; 377 378 ei2->firstitem = *(KEY*)&mr->data[1]; 379 ei2->level = mr->data->level; 380 381 ptr += sizeof(EXTENT_ITEM2); 382 } 383 384 le = mr->refs.Flink; 385 while (le != &mr->refs) { 386 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 387 388 if (ref == first_noninline) 389 break; 390 391 *ptr = ref->type; 392 ptr++; 393 394 if (ref->type == TYPE_TREE_BLOCK_REF) { 395 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr; 396 397 tbr->offset = ref->tbr.offset; 398 399 ptr += sizeof(TREE_BLOCK_REF); 400 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 401 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr; 402 403 sbr->offset = ref->parent->new_address; 404 405 ptr += sizeof(SHARED_BLOCK_REF); 406 } 407 408 le = le->Flink; 409 } 410 411 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA) 412 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL); 413 else 414 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL); 415 416 if (!NT_SUCCESS(Status)) { 417 ERR("insert_tree_item returned %08x\n", Status); 418 ExFreePool(ei); 419 return Status; 420 } 421 422 if (!all_inline) { 423 le = &first_noninline->list_entry; 424 425 while (le != &mr->refs) { 426 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry); 427 428 if (ref->type == TYPE_TREE_BLOCK_REF) { 429 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL); 430 if (!NT_SUCCESS(Status)) { 431 ERR("insert_tree_item returned %08x\n", Status); 432 return Status; 433 } 434 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 435 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL); 436 if (!NT_SUCCESS(Status)) { 437 ERR("insert_tree_item returned %08x\n", Status); 438 return Status; 439 } 440 } 441 442 le = le->Flink; 443 } 444 } 445 446 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) { 447 if (mr->data->level > 0) { 448 uint16_t i; 449 internal_node* in = (internal_node*)&mr->data[1]; 450 451 for (i = 0; i < mr->data->num_items; i++) { 452 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL); 453 454 if (sbrrc > 0) { 455 SHARED_BLOCK_REF sbr; 456 457 sbr.offset = mr->new_address; 458 459 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL); 460 if (!NT_SUCCESS(Status)) { 461 ERR("increase_extent_refcount returned %08x\n", Status); 462 return Status; 463 } 464 465 sbr.offset = mr->address; 466 467 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, 468 sbr.offset, false, NULL); 469 if (!NT_SUCCESS(Status)) { 470 ERR("decrease_extent_refcount returned %08x\n", Status); 471 return Status; 472 } 473 } 474 } 475 } else { 476 uint16_t i; 477 leaf_node* ln = (leaf_node*)&mr->data[1]; 478 479 for (i = 0; i < mr->data->num_items; i++) { 480 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 481 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 482 483 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 484 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 485 486 if (ed2->size > 0) { // not sparse 487 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL); 488 489 if (sdrrc > 0) { 490 SHARED_DATA_REF sdr; 491 chunk* c; 492 493 sdr.offset = mr->new_address; 494 sdr.count = sdrrc; 495 496 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL); 497 if (!NT_SUCCESS(Status)) { 498 ERR("increase_extent_refcount returned %08x\n", Status); 499 return Status; 500 } 501 502 sdr.offset = mr->address; 503 504 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, 505 sdr.offset, false, NULL); 506 if (!NT_SUCCESS(Status)) { 507 ERR("decrease_extent_refcount returned %08x\n", Status); 508 return Status; 509 } 510 511 c = get_chunk_from_address(Vcb, ed2->address); 512 513 if (c) { 514 // check changed_extents 515 516 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true); 517 518 le = c->changed_extents.Flink; 519 520 while (le != &c->changed_extents) { 521 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 522 523 if (ce->address == ed2->address) { 524 LIST_ENTRY* le2; 525 526 le2 = ce->refs.Flink; 527 while (le2 != &ce->refs) { 528 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 529 530 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 531 cer->sdr.offset = mr->new_address; 532 break; 533 } 534 535 le2 = le2->Flink; 536 } 537 538 le2 = ce->old_refs.Flink; 539 while (le2 != &ce->old_refs) { 540 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry); 541 542 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) { 543 cer->sdr.offset = mr->new_address; 544 break; 545 } 546 547 le2 = le2->Flink; 548 } 549 550 break; 551 } 552 553 le = le->Flink; 554 } 555 556 ExReleaseResourceLite(&c->changed_extents_lock); 557 } 558 } 559 } 560 } 561 } 562 } 563 } 564 } 565 566 return STATUS_SUCCESS; 567 } 568 569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, 570 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) { 571 LIST_ENTRY tree_writes, *le; 572 NTSTATUS Status; 573 traverse_ptr tp; 574 uint8_t level, max_level = 0; 575 chunk* newchunk = NULL; 576 577 InitializeListHead(&tree_writes); 578 579 le = items->Flink; 580 while (le != items) { 581 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 582 LIST_ENTRY* le2; 583 chunk* pc; 584 585 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG); 586 if (!mr->data) { 587 ERR("out of memory\n"); 588 return STATUS_INSUFFICIENT_RESOURCES; 589 } 590 591 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data, 592 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority); 593 if (!NT_SUCCESS(Status)) { 594 ERR("read_data returned %08x\n", Status); 595 return Status; 596 } 597 598 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM) 599 mr->system = true; 600 601 if (data_items && mr->data->level == 0) { 602 le2 = data_items->Flink; 603 while (le2 != data_items) { 604 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 605 leaf_node* ln = (leaf_node*)&mr->data[1]; 606 uint16_t i; 607 608 for (i = 0; i < mr->data->num_items; i++) { 609 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 610 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset); 611 612 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 613 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 614 615 if (ed2->address == dr->address) 616 ed2->address = dr->new_address; 617 } 618 } 619 } 620 621 le2 = le2->Flink; 622 } 623 } 624 625 if (mr->data->level > max_level) 626 max_level = mr->data->level; 627 628 le2 = mr->refs.Flink; 629 while (le2 != &mr->refs) { 630 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 631 632 if (ref->type == TYPE_TREE_BLOCK_REF) { 633 KEY* firstitem; 634 root* r = NULL; 635 LIST_ENTRY* le3; 636 tree* t; 637 638 firstitem = (KEY*)&mr->data[1]; 639 640 le3 = Vcb->roots.Flink; 641 while (le3 != &Vcb->roots) { 642 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 643 644 if (r2->id == ref->tbr.offset) { 645 r = r2; 646 break; 647 } 648 649 le3 = le3->Flink; 650 } 651 652 if (!r) { 653 ERR("could not find subvol with id %I64x\n", ref->tbr.offset); 654 return STATUS_INTERNAL_ERROR; 655 } 656 657 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL); 658 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 659 ERR("find_item_to_level returned %08x\n", Status); 660 return Status; 661 } 662 663 t = tp.tree; 664 while (t && t->header.level < mr->data->level + 1) { 665 t = t->parent; 666 } 667 668 if (!t) 669 ref->top = true; 670 else { 671 metadata_reloc* mr2; 672 673 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback); 674 if (!NT_SUCCESS(Status)) { 675 ERR("add_metadata_reloc_parent returned %08x\n", Status); 676 return Status; 677 } 678 679 ref->parent = mr2; 680 } 681 } else if (ref->type == TYPE_SHARED_BLOCK_REF) { 682 metadata_reloc* mr2; 683 684 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback); 685 if (!NT_SUCCESS(Status)) { 686 ERR("add_metadata_reloc_parent returned %08x\n", Status); 687 return Status; 688 } 689 690 ref->parent = mr2; 691 } 692 693 le2 = le2->Flink; 694 } 695 696 le = le->Flink; 697 } 698 699 le = items->Flink; 700 while (le != items) { 701 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 702 LIST_ENTRY* le2; 703 uint32_t hash; 704 705 mr->t = NULL; 706 707 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t)); 708 709 le2 = Vcb->trees_ptrs[hash >> 24]; 710 711 if (le2) { 712 while (le2 != &Vcb->trees_hash) { 713 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash); 714 715 if (t->header.address == mr->address) { 716 mr->t = t; 717 break; 718 } else if (t->hash > hash) 719 break; 720 721 le2 = le2->Flink; 722 } 723 } 724 725 le = le->Flink; 726 } 727 728 for (level = 0; level <= max_level; level++) { 729 le = items->Flink; 730 while (le != items) { 731 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 732 733 if (mr->data->level == level) { 734 bool done = false; 735 LIST_ENTRY* le2; 736 tree_write* tw; 737 uint64_t flags; 738 tree* t3; 739 740 if (mr->system) 741 flags = Vcb->system_flags; 742 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) 743 flags = Vcb->data_flags; 744 else 745 flags = Vcb->metadata_flags; 746 747 if (newchunk) { 748 acquire_chunk_lock(newchunk, Vcb); 749 750 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 751 newchunk->used += Vcb->superblock.node_size; 752 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 753 done = true; 754 } 755 756 release_chunk_lock(newchunk, Vcb); 757 } 758 759 if (!done) { 760 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 761 762 le2 = Vcb->chunks.Flink; 763 while (le2 != &Vcb->chunks) { 764 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 765 766 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) { 767 acquire_chunk_lock(c2, Vcb); 768 769 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) { 770 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) { 771 c2->used += Vcb->superblock.node_size; 772 space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback); 773 release_chunk_lock(c2, Vcb); 774 newchunk = c2; 775 done = true; 776 break; 777 } 778 } 779 780 release_chunk_lock(c2, Vcb); 781 } 782 783 le2 = le2->Flink; 784 } 785 786 // allocate new chunk if necessary 787 if (!done) { 788 Status = alloc_chunk(Vcb, flags, &newchunk, false); 789 790 if (!NT_SUCCESS(Status)) { 791 ERR("alloc_chunk returned %08x\n", Status); 792 ExReleaseResourceLite(&Vcb->chunk_lock); 793 goto end; 794 } 795 796 acquire_chunk_lock(newchunk, Vcb); 797 798 newchunk->balance_num = Vcb->balance.balance_num; 799 800 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) { 801 release_chunk_lock(newchunk, Vcb); 802 ExReleaseResourceLite(&Vcb->chunk_lock); 803 ERR("could not find address in new chunk\n"); 804 Status = STATUS_DISK_FULL; 805 goto end; 806 } else { 807 newchunk->used += Vcb->superblock.node_size; 808 space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback); 809 } 810 811 release_chunk_lock(newchunk, Vcb); 812 } 813 814 ExReleaseResourceLite(&Vcb->chunk_lock); 815 } 816 817 // update parents 818 le2 = mr->refs.Flink; 819 while (le2 != &mr->refs) { 820 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry); 821 822 if (ref->parent) { 823 uint16_t i; 824 internal_node* in = (internal_node*)&ref->parent->data[1]; 825 826 for (i = 0; i < ref->parent->data->num_items; i++) { 827 if (in[i].address == mr->address) { 828 in[i].address = mr->new_address; 829 break; 830 } 831 } 832 833 if (ref->parent->t) { 834 LIST_ENTRY* le3; 835 836 le3 = ref->parent->t->itemlist.Flink; 837 while (le3 != &ref->parent->t->itemlist) { 838 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 839 840 if (!td->inserted && td->treeholder.address == mr->address) 841 td->treeholder.address = mr->new_address; 842 843 le3 = le3->Flink; 844 } 845 } 846 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) { 847 LIST_ENTRY* le3; 848 root* r = NULL; 849 850 // alter ROOT_ITEM 851 852 le3 = Vcb->roots.Flink; 853 while (le3 != &Vcb->roots) { 854 root* r2 = CONTAINING_RECORD(le3, root, list_entry); 855 856 if (r2->id == ref->tbr.offset) { 857 r = r2; 858 break; 859 } 860 861 le3 = le3->Flink; 862 } 863 864 if (r) { 865 r->treeholder.address = mr->new_address; 866 867 if (r == Vcb->root_root) 868 Vcb->superblock.root_tree_addr = mr->new_address; 869 else if (r == Vcb->chunk_root) 870 Vcb->superblock.chunk_tree_addr = mr->new_address; 871 else if (r->root_item.block_number == mr->address) { 872 KEY searchkey; 873 ROOT_ITEM* ri; 874 875 r->root_item.block_number = mr->new_address; 876 877 searchkey.obj_id = r->id; 878 searchkey.obj_type = TYPE_ROOT_ITEM; 879 searchkey.offset = 0xffffffffffffffff; 880 881 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 882 if (!NT_SUCCESS(Status)) { 883 ERR("find_item returned %08x\n", Status); 884 goto end; 885 } 886 887 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) { 888 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id); 889 Status = STATUS_INTERNAL_ERROR; 890 goto end; 891 } 892 893 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG); 894 if (!ri) { 895 ERR("out of memory\n"); 896 Status = STATUS_INSUFFICIENT_RESOURCES; 897 goto end; 898 } 899 900 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM)); 901 902 Status = delete_tree_item(Vcb, &tp); 903 if (!NT_SUCCESS(Status)) { 904 ERR("delete_tree_item returned %08x\n", Status); 905 goto end; 906 } 907 908 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL); 909 if (!NT_SUCCESS(Status)) { 910 ERR("insert_tree_item returned %08x\n", Status); 911 goto end; 912 } 913 } 914 } 915 } 916 917 le2 = le2->Flink; 918 } 919 920 mr->data->address = mr->new_address; 921 922 t3 = mr->t; 923 924 while (t3) { 925 uint8_t h; 926 bool inserted; 927 tree* t4 = NULL; 928 929 // check if tree loaded more than once 930 if (t3->list_entry.Flink != &Vcb->trees_hash) { 931 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 932 933 if (nt->header.address == t3->header.address) 934 t4 = nt; 935 } 936 937 t3->header.address = mr->new_address; 938 939 h = t3->hash >> 24; 940 941 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) { 942 if (t3->list_entry_hash.Flink == &Vcb->trees_hash) 943 Vcb->trees_ptrs[h] = NULL; 944 else { 945 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash); 946 947 if (t2->hash >> 24 == h) 948 Vcb->trees_ptrs[h] = &t2->list_entry_hash; 949 else 950 Vcb->trees_ptrs[h] = NULL; 951 } 952 } 953 954 RemoveEntryList(&t3->list_entry_hash); 955 956 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t)); 957 h = t3->hash >> 24; 958 959 if (!Vcb->trees_ptrs[h]) { 960 uint8_t h2 = h; 961 962 le2 = Vcb->trees_hash.Flink; 963 964 if (h2 > 0) { 965 h2--; 966 do { 967 if (Vcb->trees_ptrs[h2]) { 968 le2 = Vcb->trees_ptrs[h2]; 969 break; 970 } 971 972 h2--; 973 } while (h2 > 0); 974 } 975 } else 976 le2 = Vcb->trees_ptrs[h]; 977 978 inserted = false; 979 while (le2 != &Vcb->trees_hash) { 980 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash); 981 982 if (t2->hash >= t3->hash) { 983 InsertHeadList(le2->Blink, &t3->list_entry_hash); 984 inserted = true; 985 break; 986 } 987 988 le2 = le2->Flink; 989 } 990 991 if (!inserted) 992 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash); 993 994 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h]) 995 Vcb->trees_ptrs[h] = &t3->list_entry_hash; 996 997 if (data_items && level == 0) { 998 le2 = data_items->Flink; 999 1000 while (le2 != data_items) { 1001 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 1002 LIST_ENTRY* le3 = t3->itemlist.Flink; 1003 1004 while (le3 != &t3->itemlist) { 1005 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry); 1006 1007 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) { 1008 EXTENT_DATA* ed = (EXTENT_DATA*)td->data; 1009 1010 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) { 1011 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1012 1013 if (ed2->address == dr->address) 1014 ed2->address = dr->new_address; 1015 } 1016 } 1017 1018 le3 = le3->Flink; 1019 } 1020 1021 le2 = le2->Flink; 1022 } 1023 } 1024 1025 t3 = t4; 1026 } 1027 1028 *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum)); 1029 1030 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG); 1031 if (!tw) { 1032 ERR("out of memory\n"); 1033 Status = STATUS_INSUFFICIENT_RESOURCES; 1034 goto end; 1035 } 1036 1037 tw->address = mr->new_address; 1038 tw->length = Vcb->superblock.node_size; 1039 tw->data = (uint8_t*)mr->data; 1040 1041 if (IsListEmpty(&tree_writes)) 1042 InsertTailList(&tree_writes, &tw->list_entry); 1043 else { 1044 bool inserted = false; 1045 1046 le2 = tree_writes.Flink; 1047 while (le2 != &tree_writes) { 1048 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry); 1049 1050 if (tw2->address > tw->address) { 1051 InsertHeadList(le2->Blink, &tw->list_entry); 1052 inserted = true; 1053 break; 1054 } 1055 1056 le2 = le2->Flink; 1057 } 1058 1059 if (!inserted) 1060 InsertTailList(&tree_writes, &tw->list_entry); 1061 } 1062 } 1063 1064 le = le->Flink; 1065 } 1066 } 1067 1068 Status = do_tree_writes(Vcb, &tree_writes, true); 1069 if (!NT_SUCCESS(Status)) { 1070 ERR("do_tree_writes returned %08x\n", Status); 1071 goto end; 1072 } 1073 1074 le = items->Flink; 1075 while (le != items) { 1076 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry); 1077 1078 Status = add_metadata_reloc_extent_item(Vcb, mr); 1079 if (!NT_SUCCESS(Status)) { 1080 ERR("add_metadata_reloc_extent_item returned %08x\n", Status); 1081 goto end; 1082 } 1083 1084 le = le->Flink; 1085 } 1086 1087 Status = STATUS_SUCCESS; 1088 1089 end: 1090 while (!IsListEmpty(&tree_writes)) { 1091 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry); 1092 ExFreePool(tw); 1093 } 1094 1095 return Status; 1096 } 1097 1098 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1099 KEY searchkey; 1100 traverse_ptr tp; 1101 NTSTATUS Status; 1102 bool b; 1103 LIST_ENTRY items, rollback; 1104 uint32_t loaded = 0; 1105 1106 TRACE("chunk %I64x\n", c->offset); 1107 1108 InitializeListHead(&rollback); 1109 InitializeListHead(&items); 1110 1111 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1112 1113 searchkey.obj_id = c->offset; 1114 searchkey.obj_type = TYPE_METADATA_ITEM; 1115 searchkey.offset = 0xffffffffffffffff; 1116 1117 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1118 if (!NT_SUCCESS(Status)) { 1119 ERR("find_item returned %08x\n", Status); 1120 goto end; 1121 } 1122 1123 do { 1124 traverse_ptr next_tp; 1125 1126 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1127 break; 1128 1129 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) { 1130 bool tree = false, skinny = false; 1131 1132 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1133 tree = true; 1134 skinny = true; 1135 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size && 1136 tp.item->size >= sizeof(EXTENT_ITEM)) { 1137 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1138 1139 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1140 tree = true; 1141 } 1142 1143 if (tree) { 1144 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback); 1145 1146 if (!NT_SUCCESS(Status)) { 1147 ERR("add_metadata_reloc returned %08x\n", Status); 1148 goto end; 1149 } 1150 1151 loaded++; 1152 1153 if (loaded >= 64) // only do 64 at a time 1154 break; 1155 } 1156 } 1157 1158 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1159 1160 if (b) 1161 tp = next_tp; 1162 } while (b); 1163 1164 if (IsListEmpty(&items)) { 1165 *changed = false; 1166 Status = STATUS_SUCCESS; 1167 goto end; 1168 } else 1169 *changed = true; 1170 1171 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback); 1172 if (!NT_SUCCESS(Status)) { 1173 ERR("write_metadata_items returned %08x\n", Status); 1174 goto end; 1175 } 1176 1177 Status = STATUS_SUCCESS; 1178 1179 Vcb->need_write = true; 1180 1181 end: 1182 if (NT_SUCCESS(Status)) { 1183 Status = do_write(Vcb, NULL); 1184 if (!NT_SUCCESS(Status)) 1185 ERR("do_write returned %08x\n", Status); 1186 } 1187 1188 if (NT_SUCCESS(Status)) 1189 clear_rollback(&rollback); 1190 else 1191 do_rollback(Vcb, &rollback); 1192 1193 free_trees(Vcb); 1194 1195 ExReleaseResourceLite(&Vcb->tree_lock); 1196 1197 while (!IsListEmpty(&items)) { 1198 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry); 1199 1200 while (!IsListEmpty(&mr->refs)) { 1201 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 1202 1203 ExFreePool(ref); 1204 } 1205 1206 ExFreePool(mr); 1207 } 1208 1209 return Status; 1210 } 1211 1212 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items, 1213 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) { 1214 NTSTATUS Status; 1215 LIST_ENTRY* le; 1216 KEY searchkey; 1217 traverse_ptr tp; 1218 root* r = NULL; 1219 metadata_reloc* mr; 1220 uint64_t last_tree = 0; 1221 data_reloc_ref* ref; 1222 1223 le = Vcb->roots.Flink; 1224 while (le != &Vcb->roots) { 1225 root* r2 = CONTAINING_RECORD(le, root, list_entry); 1226 1227 if (r2->id == edr->root) { 1228 r = r2; 1229 break; 1230 } 1231 1232 le = le->Flink; 1233 } 1234 1235 if (!r) { 1236 ERR("could not find subvol %I64x\n", edr->count); 1237 return STATUS_INTERNAL_ERROR; 1238 } 1239 1240 searchkey.obj_id = edr->objid; 1241 searchkey.obj_type = TYPE_EXTENT_DATA; 1242 searchkey.offset = 0; 1243 1244 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL); 1245 if (!NT_SUCCESS(Status)) { 1246 ERR("find_item returned %08x\n", Status); 1247 return Status; 1248 } 1249 1250 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) { 1251 traverse_ptr tp2; 1252 1253 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1254 tp = tp2; 1255 else { 1256 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id); 1257 return STATUS_INTERNAL_ERROR; 1258 } 1259 } 1260 1261 ref = NULL; 1262 1263 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) { 1264 traverse_ptr tp2; 1265 1266 if (tp.item->size >= sizeof(EXTENT_DATA)) { 1267 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data; 1268 1269 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) { 1270 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data; 1271 1272 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) { 1273 if (ref && last_tree == tp.tree->header.address) 1274 ref->edr.count++; 1275 else { 1276 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1277 if (!ref) { 1278 ERR("out of memory\n"); 1279 return STATUS_INSUFFICIENT_RESOURCES; 1280 } 1281 1282 ref->type = TYPE_EXTENT_DATA_REF; 1283 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF)); 1284 ref->edr.count = 1; 1285 1286 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback); 1287 if (!NT_SUCCESS(Status)) { 1288 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1289 ExFreePool(ref); 1290 return Status; 1291 } 1292 1293 last_tree = tp.tree->header.address; 1294 ref->parent = mr; 1295 1296 InsertTailList(&dr->refs, &ref->list_entry); 1297 } 1298 } 1299 } 1300 } 1301 1302 if (find_next_item(Vcb, &tp, &tp2, false, NULL)) 1303 tp = tp2; 1304 else 1305 break; 1306 } 1307 1308 return STATUS_SUCCESS; 1309 } 1310 1311 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items, 1312 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) { 1313 NTSTATUS Status; 1314 data_reloc* dr; 1315 EXTENT_ITEM* ei; 1316 uint16_t len; 1317 uint64_t inline_rc; 1318 uint8_t* ptr; 1319 1320 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG); 1321 if (!dr) { 1322 ERR("out of memory\n"); 1323 return STATUS_INSUFFICIENT_RESOURCES; 1324 } 1325 1326 dr->address = tp->item->key.obj_id; 1327 dr->size = tp->item->key.offset; 1328 dr->ei = (EXTENT_ITEM*)tp->item->data; 1329 InitializeListHead(&dr->refs); 1330 1331 Status = delete_tree_item(Vcb, tp); 1332 if (!NT_SUCCESS(Status)) { 1333 ERR("delete_tree_item returned %08x\n", Status); 1334 return Status; 1335 } 1336 1337 if (!c) 1338 c = get_chunk_from_address(Vcb, tp->item->key.obj_id); 1339 1340 if (c) { 1341 acquire_chunk_lock(c, Vcb); 1342 1343 c->used -= tp->item->key.offset; 1344 1345 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback); 1346 1347 release_chunk_lock(c, Vcb); 1348 } 1349 1350 ei = (EXTENT_ITEM*)tp->item->data; 1351 inline_rc = 0; 1352 1353 len = tp->item->size - sizeof(EXTENT_ITEM); 1354 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM); 1355 1356 while (len > 0) { 1357 uint8_t secttype = *ptr; 1358 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0); 1359 1360 len--; 1361 1362 if (sectlen > len) { 1363 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen); 1364 return STATUS_INTERNAL_ERROR; 1365 } 1366 1367 if (sectlen == 0) { 1368 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype); 1369 return STATUS_INTERNAL_ERROR; 1370 } 1371 1372 if (secttype == TYPE_EXTENT_DATA_REF) { 1373 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t)); 1374 1375 inline_rc += edr->count; 1376 1377 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback); 1378 if (!NT_SUCCESS(Status)) { 1379 ERR("data_reloc_add_tree_edr returned %08x\n", Status); 1380 return Status; 1381 } 1382 } else if (secttype == TYPE_SHARED_DATA_REF) { 1383 metadata_reloc* mr; 1384 data_reloc_ref* ref; 1385 1386 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1387 if (!ref) { 1388 ERR("out of memory\n"); 1389 return STATUS_INSUFFICIENT_RESOURCES; 1390 } 1391 1392 ref->type = TYPE_SHARED_DATA_REF; 1393 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF)); 1394 inline_rc += ref->sdr.count; 1395 1396 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1397 if (!NT_SUCCESS(Status)) { 1398 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1399 ExFreePool(ref); 1400 return Status; 1401 } 1402 1403 ref->parent = mr; 1404 1405 InsertTailList(&dr->refs, &ref->list_entry); 1406 } else { 1407 ERR("unexpected tree type %x\n", secttype); 1408 return STATUS_INTERNAL_ERROR; 1409 } 1410 1411 1412 len -= sectlen; 1413 ptr += sizeof(uint8_t) + sectlen; 1414 } 1415 1416 if (inline_rc < ei->refcount) { // look for non-inline entries 1417 traverse_ptr tp2 = *tp, next_tp; 1418 1419 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) { 1420 tp2 = next_tp; 1421 1422 if (tp2.item->key.obj_id == tp->item->key.obj_id) { 1423 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) { 1424 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback); 1425 if (!NT_SUCCESS(Status)) { 1426 ERR("data_reloc_add_tree_edr returned %08x\n", Status); 1427 return Status; 1428 } 1429 1430 Status = delete_tree_item(Vcb, &tp2); 1431 if (!NT_SUCCESS(Status)) { 1432 ERR("delete_tree_item returned %08x\n", Status); 1433 return Status; 1434 } 1435 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) { 1436 metadata_reloc* mr; 1437 data_reloc_ref* ref; 1438 1439 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG); 1440 if (!ref) { 1441 ERR("out of memory\n"); 1442 return STATUS_INSUFFICIENT_RESOURCES; 1443 } 1444 1445 ref->type = TYPE_SHARED_DATA_REF; 1446 ref->sdr.offset = tp2.item->key.offset; 1447 ref->sdr.count = *((uint32_t*)tp2.item->data); 1448 1449 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback); 1450 if (!NT_SUCCESS(Status)) { 1451 ERR("add_metadata_reloc_parent returned %08x\n", Status); 1452 ExFreePool(ref); 1453 return Status; 1454 } 1455 1456 ref->parent = mr; 1457 InsertTailList(&dr->refs, &ref->list_entry); 1458 1459 Status = delete_tree_item(Vcb, &tp2); 1460 if (!NT_SUCCESS(Status)) { 1461 ERR("delete_tree_item returned %08x\n", Status); 1462 return Status; 1463 } 1464 } 1465 } else 1466 break; 1467 } 1468 } 1469 1470 InsertTailList(items, &dr->list_entry); 1471 1472 return STATUS_SUCCESS; 1473 } 1474 1475 static void sort_data_reloc_refs(data_reloc* dr) { 1476 LIST_ENTRY newlist, *le; 1477 1478 if (IsListEmpty(&dr->refs)) 1479 return; 1480 1481 // insertion sort 1482 1483 InitializeListHead(&newlist); 1484 1485 while (!IsListEmpty(&dr->refs)) { 1486 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 1487 bool inserted = false; 1488 1489 if (ref->type == TYPE_EXTENT_DATA_REF) 1490 ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset); 1491 else if (ref->type == TYPE_SHARED_DATA_REF) 1492 ref->hash = ref->parent->new_address; 1493 1494 le = newlist.Flink; 1495 while (le != &newlist) { 1496 data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1497 1498 if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) { 1499 InsertHeadList(le->Blink, &ref->list_entry); 1500 inserted = true; 1501 break; 1502 } 1503 1504 le = le->Flink; 1505 } 1506 1507 if (!inserted) 1508 InsertTailList(&newlist, &ref->list_entry); 1509 } 1510 1511 le = newlist.Flink; 1512 while (le != &newlist) { 1513 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1514 1515 if (le->Flink != &newlist) { 1516 data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry); 1517 1518 if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root && 1519 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) { 1520 RemoveEntryList(&ref2->list_entry); 1521 ref->edr.count += ref2->edr.count; 1522 ExFreePool(ref2); 1523 continue; 1524 } 1525 } 1526 1527 le = le->Flink; 1528 } 1529 1530 newlist.Flink->Blink = &dr->refs; 1531 newlist.Blink->Flink = &dr->refs; 1532 dr->refs.Flink = newlist.Flink; 1533 dr->refs.Blink = newlist.Blink; 1534 } 1535 1536 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) { 1537 NTSTATUS Status; 1538 LIST_ENTRY* le; 1539 uint64_t rc = 0; 1540 uint16_t inline_len; 1541 bool all_inline = true; 1542 data_reloc_ref* first_noninline = NULL; 1543 EXTENT_ITEM* ei; 1544 uint8_t* ptr; 1545 1546 inline_len = sizeof(EXTENT_ITEM); 1547 1548 sort_data_reloc_refs(dr); 1549 1550 le = dr->refs.Flink; 1551 while (le != &dr->refs) { 1552 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1553 uint16_t extlen = 0; 1554 1555 if (ref->type == TYPE_EXTENT_DATA_REF) { 1556 extlen += sizeof(EXTENT_DATA_REF); 1557 rc += ref->edr.count; 1558 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1559 extlen += sizeof(SHARED_DATA_REF); 1560 rc++; 1561 } 1562 1563 if (all_inline) { 1564 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) { 1565 all_inline = false; 1566 first_noninline = ref; 1567 } else 1568 inline_len += extlen + 1; 1569 } 1570 1571 le = le->Flink; 1572 } 1573 1574 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG); 1575 if (!ei) { 1576 ERR("out of memory\n"); 1577 return STATUS_INSUFFICIENT_RESOURCES; 1578 } 1579 1580 ei->refcount = rc; 1581 ei->generation = dr->ei->generation; 1582 ei->flags = dr->ei->flags; 1583 ptr = (uint8_t*)&ei[1]; 1584 1585 le = dr->refs.Flink; 1586 while (le != &dr->refs) { 1587 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1588 1589 if (ref == first_noninline) 1590 break; 1591 1592 *ptr = ref->type; 1593 ptr++; 1594 1595 if (ref->type == TYPE_EXTENT_DATA_REF) { 1596 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr; 1597 1598 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1599 1600 ptr += sizeof(EXTENT_DATA_REF); 1601 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1602 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr; 1603 1604 sdr->offset = ref->parent->new_address; 1605 sdr->count = ref->sdr.count; 1606 1607 ptr += sizeof(SHARED_DATA_REF); 1608 } 1609 1610 le = le->Flink; 1611 } 1612 1613 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL); 1614 if (!NT_SUCCESS(Status)) { 1615 ERR("insert_tree_item returned %08x\n", Status); 1616 return Status; 1617 } 1618 1619 if (!all_inline) { 1620 le = &first_noninline->list_entry; 1621 1622 while (le != &dr->refs) { 1623 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry); 1624 1625 if (ref->type == TYPE_EXTENT_DATA_REF) { 1626 EXTENT_DATA_REF* edr; 1627 1628 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG); 1629 if (!edr) { 1630 ERR("out of memory\n"); 1631 return STATUS_INSUFFICIENT_RESOURCES; 1632 } 1633 1634 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF)); 1635 1636 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL); 1637 if (!NT_SUCCESS(Status)) { 1638 ERR("insert_tree_item returned %08x\n", Status); 1639 return Status; 1640 } 1641 } else if (ref->type == TYPE_SHARED_DATA_REF) { 1642 uint32_t* sdr; 1643 1644 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG); 1645 if (!sdr) { 1646 ERR("out of memory\n"); 1647 return STATUS_INSUFFICIENT_RESOURCES; 1648 } 1649 1650 *sdr = ref->sdr.count; 1651 1652 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL); 1653 if (!NT_SUCCESS(Status)) { 1654 ERR("insert_tree_item returned %08x\n", Status); 1655 return Status; 1656 } 1657 } 1658 1659 le = le->Flink; 1660 } 1661 } 1662 1663 return STATUS_SUCCESS; 1664 } 1665 1666 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) { 1667 KEY searchkey; 1668 traverse_ptr tp; 1669 NTSTATUS Status; 1670 bool b; 1671 LIST_ENTRY items, metadata_items, rollback, *le; 1672 uint64_t loaded = 0, num_loaded = 0; 1673 chunk* newchunk = NULL; 1674 uint8_t* data = NULL; 1675 1676 TRACE("chunk %I64x\n", c->offset); 1677 1678 InitializeListHead(&rollback); 1679 InitializeListHead(&items); 1680 InitializeListHead(&metadata_items); 1681 1682 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 1683 1684 searchkey.obj_id = c->offset; 1685 searchkey.obj_type = TYPE_EXTENT_ITEM; 1686 searchkey.offset = 0xffffffffffffffff; 1687 1688 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL); 1689 if (!NT_SUCCESS(Status)) { 1690 ERR("find_item returned %08x\n", Status); 1691 goto end; 1692 } 1693 1694 do { 1695 traverse_ptr next_tp; 1696 1697 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size) 1698 break; 1699 1700 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) { 1701 bool tree = false; 1702 1703 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) { 1704 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data; 1705 1706 if (ei->flags & EXTENT_ITEM_TREE_BLOCK) 1707 tree = true; 1708 } 1709 1710 if (!tree) { 1711 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback); 1712 1713 if (!NT_SUCCESS(Status)) { 1714 ERR("add_data_reloc returned %08x\n", Status); 1715 goto end; 1716 } 1717 1718 loaded += tp.item->key.offset; 1719 num_loaded++; 1720 1721 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously 1722 break; 1723 } 1724 } 1725 1726 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 1727 1728 if (b) 1729 tp = next_tp; 1730 } while (b); 1731 1732 if (IsListEmpty(&items)) { 1733 *changed = false; 1734 Status = STATUS_SUCCESS; 1735 goto end; 1736 } else 1737 *changed = true; 1738 1739 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG); 1740 if (!data) { 1741 ERR("out of memory\n"); 1742 Status = STATUS_INSUFFICIENT_RESOURCES; 1743 goto end; 1744 } 1745 1746 le = items.Flink; 1747 while (le != &items) { 1748 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 1749 bool done = false; 1750 LIST_ENTRY* le2; 1751 uint32_t* csum; 1752 RTL_BITMAP bmp; 1753 ULONG* bmparr; 1754 ULONG bmplen, runlength, index, lastoff; 1755 1756 if (newchunk) { 1757 acquire_chunk_lock(newchunk, Vcb); 1758 1759 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1760 newchunk->used += dr->size; 1761 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1762 done = true; 1763 } 1764 1765 release_chunk_lock(newchunk, Vcb); 1766 } 1767 1768 if (!done) { 1769 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 1770 1771 le2 = Vcb->chunks.Flink; 1772 while (le2 != &Vcb->chunks) { 1773 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry); 1774 1775 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) { 1776 acquire_chunk_lock(c2, Vcb); 1777 1778 if ((c2->chunk_item->size - c2->used) >= dr->size) { 1779 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) { 1780 c2->used += dr->size; 1781 space_list_subtract(c2, false, dr->new_address, dr->size, &rollback); 1782 release_chunk_lock(c2, Vcb); 1783 newchunk = c2; 1784 done = true; 1785 break; 1786 } 1787 } 1788 1789 release_chunk_lock(c2, Vcb); 1790 } 1791 1792 le2 = le2->Flink; 1793 } 1794 1795 // allocate new chunk if necessary 1796 if (!done) { 1797 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false); 1798 1799 if (!NT_SUCCESS(Status)) { 1800 ERR("alloc_chunk returned %08x\n", Status); 1801 ExReleaseResourceLite(&Vcb->chunk_lock); 1802 goto end; 1803 } 1804 1805 acquire_chunk_lock(newchunk, Vcb); 1806 1807 newchunk->balance_num = Vcb->balance.balance_num; 1808 1809 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) { 1810 release_chunk_lock(newchunk, Vcb); 1811 ExReleaseResourceLite(&Vcb->chunk_lock); 1812 ERR("could not find address in new chunk\n"); 1813 Status = STATUS_DISK_FULL; 1814 goto end; 1815 } else { 1816 newchunk->used += dr->size; 1817 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback); 1818 } 1819 1820 release_chunk_lock(newchunk, Vcb); 1821 } 1822 1823 ExReleaseResourceLite(&Vcb->chunk_lock); 1824 } 1825 1826 dr->newchunk = newchunk; 1827 1828 bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size); 1829 1830 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG); 1831 if (!bmparr) { 1832 ERR("out of memory\n"); 1833 Status = STATUS_INSUFFICIENT_RESOURCES; 1834 goto end; 1835 } 1836 1837 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG); 1838 if (!csum) { 1839 ERR("out of memory\n"); 1840 ExFreePool(bmparr); 1841 Status = STATUS_INSUFFICIENT_RESOURCES; 1842 goto end; 1843 } 1844 1845 RtlInitializeBitMap(&bmp, bmparr, bmplen); 1846 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum 1847 1848 searchkey.obj_id = EXTENT_CSUM_ID; 1849 searchkey.obj_type = TYPE_EXTENT_CSUM; 1850 searchkey.offset = dr->address; 1851 1852 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL); 1853 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) { 1854 ERR("find_item returned %08x\n", Status); 1855 ExFreePool(csum); 1856 ExFreePool(bmparr); 1857 goto end; 1858 } 1859 1860 if (Status != STATUS_NOT_FOUND) { 1861 do { 1862 traverse_ptr next_tp; 1863 1864 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) { 1865 if (tp.item->key.offset >= dr->address + dr->size) 1866 break; 1867 else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) { 1868 uint64_t cs = max(dr->address, tp.item->key.offset); 1869 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t))); 1870 1871 RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size), 1872 tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size), 1873 (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size)); 1874 1875 RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size)); 1876 1877 if (ce == dr->address + dr->size) 1878 break; 1879 } 1880 } 1881 1882 if (find_next_item(Vcb, &tp, &next_tp, false, NULL)) 1883 tp = next_tp; 1884 else 1885 break; 1886 } while (true); 1887 } 1888 1889 lastoff = 0; 1890 runlength = RtlFindFirstRunClear(&bmp, &index); 1891 1892 while (runlength != 0) { 1893 if (index >= bmplen) 1894 break; 1895 1896 if (index + runlength >= bmplen) { 1897 runlength = bmplen - index; 1898 1899 if (runlength == 0) 1900 break; 1901 } 1902 1903 if (index > lastoff) { 1904 ULONG off = lastoff; 1905 ULONG size = index - lastoff; 1906 1907 // handle no csum run 1908 do { 1909 ULONG rl; 1910 1911 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1912 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1913 else 1914 rl = size; 1915 1916 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 1917 c, NULL, NULL, 0, false, NormalPagePriority); 1918 if (!NT_SUCCESS(Status)) { 1919 ERR("read_data returned %08x\n", Status); 1920 ExFreePool(csum); 1921 ExFreePool(bmparr); 1922 goto end; 1923 } 1924 1925 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1926 NULL, newchunk, false, 0, NormalPagePriority); 1927 if (!NT_SUCCESS(Status)) { 1928 ERR("write_data_complete returned %08x\n", Status); 1929 ExFreePool(csum); 1930 ExFreePool(bmparr); 1931 goto end; 1932 } 1933 1934 size -= rl; 1935 off += rl; 1936 } while (size > 0); 1937 } 1938 1939 add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL); 1940 add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL); 1941 1942 // handle csum run 1943 do { 1944 ULONG rl; 1945 1946 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT) 1947 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1948 else 1949 rl = runlength; 1950 1951 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data, 1952 c, NULL, NULL, 0, false, NormalPagePriority); 1953 if (!NT_SUCCESS(Status)) { 1954 ERR("read_data returned %08x\n", Status); 1955 ExFreePool(csum); 1956 ExFreePool(bmparr); 1957 goto end; 1958 } 1959 1960 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 1961 NULL, newchunk, false, 0, NormalPagePriority); 1962 if (!NT_SUCCESS(Status)) { 1963 ERR("write_data_complete returned %08x\n", Status); 1964 ExFreePool(csum); 1965 ExFreePool(bmparr); 1966 goto end; 1967 } 1968 1969 runlength -= rl; 1970 index += rl; 1971 } while (runlength > 0); 1972 1973 lastoff = index; 1974 runlength = RtlFindNextForwardRunClear(&bmp, index, &index); 1975 } 1976 1977 ExFreePool(csum); 1978 ExFreePool(bmparr); 1979 1980 // handle final nocsum run 1981 if (lastoff < dr->size / Vcb->superblock.sector_size) { 1982 ULONG off = lastoff; 1983 ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff); 1984 1985 do { 1986 ULONG rl; 1987 1988 if (size * Vcb->superblock.sector_size > BALANCE_UNIT) 1989 rl = BALANCE_UNIT / Vcb->superblock.sector_size; 1990 else 1991 rl = size; 1992 1993 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data, 1994 c, NULL, NULL, 0, false, NormalPagePriority); 1995 if (!NT_SUCCESS(Status)) { 1996 ERR("read_data returned %08x\n", Status); 1997 goto end; 1998 } 1999 2000 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size, 2001 NULL, newchunk, false, 0, NormalPagePriority); 2002 if (!NT_SUCCESS(Status)) { 2003 ERR("write_data_complete returned %08x\n", Status); 2004 goto end; 2005 } 2006 2007 size -= rl; 2008 off += rl; 2009 } while (size > 0); 2010 } 2011 2012 le = le->Flink; 2013 } 2014 2015 ExFreePool(data); 2016 data = NULL; 2017 2018 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback); 2019 if (!NT_SUCCESS(Status)) { 2020 ERR("write_metadata_items returned %08x\n", Status); 2021 goto end; 2022 } 2023 2024 le = items.Flink; 2025 while (le != &items) { 2026 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry); 2027 2028 Status = add_data_reloc_extent_item(Vcb, dr); 2029 if (!NT_SUCCESS(Status)) { 2030 ERR("add_data_reloc_extent_item returned %08x\n", Status); 2031 goto end; 2032 } 2033 2034 le = le->Flink; 2035 } 2036 2037 le = c->changed_extents.Flink; 2038 while (le != &c->changed_extents) { 2039 LIST_ENTRY *le2, *le3; 2040 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry); 2041 2042 le3 = le->Flink; 2043 2044 le2 = items.Flink; 2045 while (le2 != &items) { 2046 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry); 2047 2048 if (ce->address == dr->address) { 2049 ce->address = dr->new_address; 2050 RemoveEntryList(&ce->list_entry); 2051 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry); 2052 break; 2053 } 2054 2055 le2 = le2->Flink; 2056 } 2057 2058 le = le3; 2059 } 2060 2061 Status = STATUS_SUCCESS; 2062 2063 Vcb->need_write = true; 2064 2065 end: 2066 if (NT_SUCCESS(Status)) { 2067 // update extents in cache inodes before we flush 2068 le = Vcb->chunks.Flink; 2069 while (le != &Vcb->chunks) { 2070 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry); 2071 2072 if (c2->cache) { 2073 LIST_ENTRY* le2; 2074 2075 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true); 2076 2077 le2 = c2->cache->extents.Flink; 2078 while (le2 != &c2->cache->extents) { 2079 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2080 2081 if (!ext->ignore) { 2082 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2083 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2084 2085 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2086 LIST_ENTRY* le3 = items.Flink; 2087 while (le3 != &items) { 2088 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2089 2090 if (ed2->address == dr->address) { 2091 ed2->address = dr->new_address; 2092 break; 2093 } 2094 2095 le3 = le3->Flink; 2096 } 2097 } 2098 } 2099 } 2100 2101 le2 = le2->Flink; 2102 } 2103 2104 ExReleaseResourceLite(c2->cache->Header.Resource); 2105 } 2106 2107 le = le->Flink; 2108 } 2109 2110 Status = do_write(Vcb, NULL); 2111 if (!NT_SUCCESS(Status)) 2112 ERR("do_write returned %08x\n", Status); 2113 } 2114 2115 if (NT_SUCCESS(Status)) { 2116 clear_rollback(&rollback); 2117 2118 // update open FCBs 2119 // FIXME - speed this up(?) 2120 2121 le = Vcb->all_fcbs.Flink; 2122 while (le != &Vcb->all_fcbs) { 2123 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all); 2124 LIST_ENTRY* le2; 2125 2126 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true); 2127 2128 le2 = fcb->extents.Flink; 2129 while (le2 != &fcb->extents) { 2130 extent* ext = CONTAINING_RECORD(le2, extent, list_entry); 2131 2132 if (!ext->ignore) { 2133 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) { 2134 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data; 2135 2136 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) { 2137 LIST_ENTRY* le3 = items.Flink; 2138 while (le3 != &items) { 2139 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry); 2140 2141 if (ed2->address == dr->address) { 2142 ed2->address = dr->new_address; 2143 break; 2144 } 2145 2146 le3 = le3->Flink; 2147 } 2148 } 2149 } 2150 } 2151 2152 le2 = le2->Flink; 2153 } 2154 2155 ExReleaseResourceLite(fcb->Header.Resource); 2156 2157 le = le->Flink; 2158 } 2159 } else 2160 do_rollback(Vcb, &rollback); 2161 2162 free_trees(Vcb); 2163 2164 ExReleaseResourceLite(&Vcb->tree_lock); 2165 2166 if (data) 2167 ExFreePool(data); 2168 2169 while (!IsListEmpty(&items)) { 2170 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry); 2171 2172 while (!IsListEmpty(&dr->refs)) { 2173 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry); 2174 2175 ExFreePool(ref); 2176 } 2177 2178 ExFreePool(dr); 2179 } 2180 2181 while (!IsListEmpty(&metadata_items)) { 2182 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry); 2183 2184 while (!IsListEmpty(&mr->refs)) { 2185 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry); 2186 2187 ExFreePool(ref); 2188 } 2189 2190 ExFreePool(mr); 2191 } 2192 2193 return Status; 2194 } 2195 2196 static __inline uint64_t get_chunk_dup_type(chunk* c) { 2197 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2198 return BLOCK_FLAG_RAID0; 2199 else if (c->chunk_item->type & BLOCK_FLAG_RAID1) 2200 return BLOCK_FLAG_RAID1; 2201 else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE) 2202 return BLOCK_FLAG_DUPLICATE; 2203 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2204 return BLOCK_FLAG_RAID10; 2205 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2206 return BLOCK_FLAG_RAID5; 2207 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2208 return BLOCK_FLAG_RAID6; 2209 else 2210 return BLOCK_FLAG_SINGLE; 2211 } 2212 2213 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) { 2214 btrfs_balance_opts* opts; 2215 2216 opts = &Vcb->balance.opts[sort]; 2217 2218 if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED)) 2219 return false; 2220 2221 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2222 uint64_t type = get_chunk_dup_type(c); 2223 2224 if (!(type & opts->profiles)) 2225 return false; 2226 } 2227 2228 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2229 uint16_t i; 2230 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2231 bool b = false; 2232 2233 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2234 if (cis[i].dev_id == opts->devid) { 2235 b = true; 2236 break; 2237 } 2238 } 2239 2240 if (!b) 2241 return false; 2242 } 2243 2244 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2245 uint16_t i, factor; 2246 uint64_t physsize; 2247 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 2248 bool b = false; 2249 2250 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 2251 factor = c->chunk_item->num_stripes; 2252 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 2253 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 2254 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 2255 factor = c->chunk_item->num_stripes - 1; 2256 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 2257 factor = c->chunk_item->num_stripes - 2; 2258 else // SINGLE, DUPLICATE, RAID1 2259 factor = 1; 2260 2261 physsize = c->chunk_item->size / factor; 2262 2263 for (i = 0; i < c->chunk_item->num_stripes; i++) { 2264 if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start && 2265 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) { 2266 b = true; 2267 break; 2268 } 2269 } 2270 2271 if (!b) 2272 return false; 2273 } 2274 2275 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2276 if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end) 2277 return false; 2278 } 2279 2280 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2281 if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end) 2282 return false; 2283 } 2284 2285 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2286 uint64_t usage = c->used * 100 / c->chunk_item->size; 2287 2288 // usage == 0 should mean completely empty, not just that usage rounds to 0% 2289 if (c->used > 0 && usage == 0) 2290 usage = 1; 2291 2292 if (usage < opts->usage_start || usage > opts->usage_end) 2293 return false; 2294 } 2295 2296 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) { 2297 uint64_t type = get_chunk_dup_type(c); 2298 2299 if (type == opts->convert) 2300 return false; 2301 } 2302 2303 return true; 2304 } 2305 2306 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2307 if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) { 2308 args->profiles = opts->profiles; 2309 args->flags |= BALANCE_ARGS_FLAGS_PROFILES; 2310 } 2311 2312 if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) { 2313 if (args->usage_start == 0) { 2314 args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE; 2315 args->usage_start = opts->usage_start; 2316 args->usage_end = opts->usage_end; 2317 } else { 2318 args->flags |= BALANCE_ARGS_FLAGS_USAGE; 2319 args->usage = opts->usage_end; 2320 } 2321 } 2322 2323 if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) { 2324 args->devid = opts->devid; 2325 args->flags |= BALANCE_ARGS_FLAGS_DEVID; 2326 } 2327 2328 if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) { 2329 args->drange_start = opts->drange_start; 2330 args->drange_end = opts->drange_end; 2331 args->flags |= BALANCE_ARGS_FLAGS_DRANGE; 2332 } 2333 2334 if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) { 2335 args->vrange_start = opts->vrange_start; 2336 args->vrange_end = opts->vrange_end; 2337 args->flags |= BALANCE_ARGS_FLAGS_VRANGE; 2338 } 2339 2340 if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) { 2341 args->convert = opts->convert; 2342 args->flags |= BALANCE_ARGS_FLAGS_CONVERT; 2343 2344 if (opts->flags & BTRFS_BALANCE_OPTS_SOFT) 2345 args->flags |= BALANCE_ARGS_FLAGS_SOFT; 2346 } 2347 2348 if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) { 2349 if (args->limit_start == 0) { 2350 args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE; 2351 args->limit_start = (uint32_t)opts->limit_start; 2352 args->limit_end = (uint32_t)opts->limit_end; 2353 } else { 2354 args->flags |= BALANCE_ARGS_FLAGS_LIMIT; 2355 args->limit = opts->limit_end; 2356 } 2357 } 2358 2359 if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) { 2360 args->stripes_start = opts->stripes_start; 2361 args->stripes_end = opts->stripes_end; 2362 args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE; 2363 } 2364 } 2365 2366 static NTSTATUS add_balance_item(device_extension* Vcb) { 2367 KEY searchkey; 2368 traverse_ptr tp; 2369 NTSTATUS Status; 2370 BALANCE_ITEM* bi; 2371 2372 searchkey.obj_id = BALANCE_ITEM_ID; 2373 searchkey.obj_type = TYPE_TEMP_ITEM; 2374 searchkey.offset = 0; 2375 2376 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2377 2378 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2379 if (!NT_SUCCESS(Status)) { 2380 ERR("find_item returned %08x\n", Status); 2381 goto end; 2382 } 2383 2384 if (!keycmp(tp.item->key, searchkey)) { 2385 Status = delete_tree_item(Vcb, &tp); 2386 if (!NT_SUCCESS(Status)) { 2387 ERR("delete_tree_item returned %08x\n", Status); 2388 goto end; 2389 } 2390 } 2391 2392 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG); 2393 if (!bi) { 2394 ERR("out of memory\n"); 2395 Status = STATUS_INSUFFICIENT_RESOURCES; 2396 goto end; 2397 } 2398 2399 RtlZeroMemory(bi, sizeof(BALANCE_ITEM)); 2400 2401 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2402 bi->flags |= BALANCE_FLAGS_DATA; 2403 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 2404 } 2405 2406 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2407 bi->flags |= BALANCE_FLAGS_METADATA; 2408 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 2409 } 2410 2411 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) { 2412 bi->flags |= BALANCE_FLAGS_SYSTEM; 2413 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 2414 } 2415 2416 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL); 2417 if (!NT_SUCCESS(Status)) { 2418 ERR("insert_tree_item returned %08x\n", Status); 2419 ExFreePool(bi); 2420 goto end; 2421 } 2422 2423 Status = STATUS_SUCCESS; 2424 2425 end: 2426 if (NT_SUCCESS(Status)) { 2427 Status = do_write(Vcb, NULL); 2428 if (!NT_SUCCESS(Status)) 2429 ERR("do_write returned %08x\n", Status); 2430 } 2431 2432 free_trees(Vcb); 2433 2434 ExReleaseResourceLite(&Vcb->tree_lock); 2435 2436 return Status; 2437 } 2438 2439 static NTSTATUS remove_balance_item(device_extension* Vcb) { 2440 KEY searchkey; 2441 traverse_ptr tp; 2442 NTSTATUS Status; 2443 2444 searchkey.obj_id = BALANCE_ITEM_ID; 2445 searchkey.obj_type = TYPE_TEMP_ITEM; 2446 searchkey.offset = 0; 2447 2448 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 2449 2450 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 2451 if (!NT_SUCCESS(Status)) { 2452 ERR("find_item returned %08x\n", Status); 2453 goto end; 2454 } 2455 2456 if (!keycmp(tp.item->key, searchkey)) { 2457 Status = delete_tree_item(Vcb, &tp); 2458 if (!NT_SUCCESS(Status)) { 2459 ERR("delete_tree_item returned %08x\n", Status); 2460 goto end; 2461 } 2462 2463 Status = do_write(Vcb, NULL); 2464 if (!NT_SUCCESS(Status)) { 2465 ERR("do_write returned %08x\n", Status); 2466 goto end; 2467 } 2468 2469 free_trees(Vcb); 2470 } 2471 2472 Status = STATUS_SUCCESS; 2473 2474 end: 2475 ExReleaseResourceLite(&Vcb->tree_lock); 2476 2477 return Status; 2478 } 2479 2480 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) { 2481 opts->flags = BTRFS_BALANCE_OPTS_ENABLED; 2482 2483 if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) { 2484 opts->flags |= BTRFS_BALANCE_OPTS_PROFILES; 2485 opts->profiles = args->profiles; 2486 } 2487 2488 if (args->flags & BALANCE_ARGS_FLAGS_USAGE) { 2489 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2490 2491 opts->usage_start = 0; 2492 opts->usage_end = (uint8_t)args->usage; 2493 } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) { 2494 opts->flags |= BTRFS_BALANCE_OPTS_USAGE; 2495 2496 opts->usage_start = (uint8_t)args->usage_start; 2497 opts->usage_end = (uint8_t)args->usage_end; 2498 } 2499 2500 if (args->flags & BALANCE_ARGS_FLAGS_DEVID) { 2501 opts->flags |= BTRFS_BALANCE_OPTS_DEVID; 2502 opts->devid = args->devid; 2503 } 2504 2505 if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) { 2506 opts->flags |= BTRFS_BALANCE_OPTS_DRANGE; 2507 opts->drange_start = args->drange_start; 2508 opts->drange_end = args->drange_end; 2509 } 2510 2511 if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) { 2512 opts->flags |= BTRFS_BALANCE_OPTS_VRANGE; 2513 opts->vrange_start = args->vrange_start; 2514 opts->vrange_end = args->vrange_end; 2515 } 2516 2517 if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) { 2518 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2519 2520 opts->limit_start = 0; 2521 opts->limit_end = args->limit; 2522 } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) { 2523 opts->flags |= BTRFS_BALANCE_OPTS_LIMIT; 2524 2525 opts->limit_start = args->limit_start; 2526 opts->limit_end = args->limit_end; 2527 } 2528 2529 if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) { 2530 opts->flags |= BTRFS_BALANCE_OPTS_STRIPES; 2531 2532 opts->stripes_start = (uint16_t)args->stripes_start; 2533 opts->stripes_end = (uint16_t)args->stripes_end; 2534 } 2535 2536 if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) { 2537 opts->flags |= BTRFS_BALANCE_OPTS_CONVERT; 2538 opts->convert = args->convert; 2539 2540 if (args->flags & BALANCE_ARGS_FLAGS_SOFT) 2541 opts->flags |= BTRFS_BALANCE_OPTS_SOFT; 2542 } 2543 } 2544 2545 static NTSTATUS remove_superblocks(device* dev) { 2546 NTSTATUS Status; 2547 superblock* sb; 2548 int i = 0; 2549 2550 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG); 2551 if (!sb) { 2552 ERR("out of memory\n"); 2553 return STATUS_INSUFFICIENT_RESOURCES; 2554 } 2555 2556 RtlZeroMemory(sb, sizeof(superblock)); 2557 2558 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) { 2559 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock)); 2560 2561 if (!NT_SUCCESS(Status)) { 2562 ExFreePool(sb); 2563 return Status; 2564 } 2565 2566 i++; 2567 } 2568 2569 ExFreePool(sb); 2570 2571 return STATUS_SUCCESS; 2572 } 2573 2574 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2575 KEY searchkey; 2576 traverse_ptr tp; 2577 NTSTATUS Status; 2578 LIST_ENTRY* le; 2579 volume_device_extension* vde; 2580 2581 if (Vcb->need_write) { 2582 Status = do_write(Vcb, NULL); 2583 2584 if (!NT_SUCCESS(Status)) 2585 ERR("do_write returned %08x\n", Status); 2586 } else 2587 Status = STATUS_SUCCESS; 2588 2589 free_trees(Vcb); 2590 2591 if (!NT_SUCCESS(Status)) 2592 return Status; 2593 2594 // remove entry in chunk tree 2595 2596 searchkey.obj_id = 1; 2597 searchkey.obj_type = TYPE_DEV_ITEM; 2598 searchkey.offset = dev->devitem.dev_id; 2599 2600 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL); 2601 if (!NT_SUCCESS(Status)) { 2602 ERR("find_item returned %08x\n", Status); 2603 return Status; 2604 } 2605 2606 if (!keycmp(searchkey, tp.item->key)) { 2607 Status = delete_tree_item(Vcb, &tp); 2608 2609 if (!NT_SUCCESS(Status)) { 2610 ERR("delete_tree_item returned %08x\n", Status); 2611 return Status; 2612 } 2613 } 2614 2615 // remove stats entry in device tree 2616 2617 searchkey.obj_id = 0; 2618 searchkey.obj_type = TYPE_DEV_STATS; 2619 searchkey.offset = dev->devitem.dev_id; 2620 2621 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2622 if (!NT_SUCCESS(Status)) { 2623 ERR("find_item returned %08x\n", Status); 2624 return Status; 2625 } 2626 2627 if (!keycmp(searchkey, tp.item->key)) { 2628 Status = delete_tree_item(Vcb, &tp); 2629 2630 if (!NT_SUCCESS(Status)) { 2631 ERR("delete_tree_item returned %08x\n", Status); 2632 return Status; 2633 } 2634 } 2635 2636 // update superblock 2637 2638 Vcb->superblock.num_devices--; 2639 Vcb->superblock.total_bytes -= dev->devitem.num_bytes; 2640 Vcb->devices_loaded--; 2641 2642 RemoveEntryList(&dev->list_entry); 2643 2644 // flush 2645 2646 Status = do_write(Vcb, NULL); 2647 if (!NT_SUCCESS(Status)) 2648 ERR("do_write returned %08x\n", Status); 2649 2650 free_trees(Vcb); 2651 2652 if (!NT_SUCCESS(Status)) 2653 return Status; 2654 2655 if (!dev->readonly && dev->devobj) { 2656 Status = remove_superblocks(dev); 2657 if (!NT_SUCCESS(Status)) 2658 WARN("remove_superblocks returned %08x\n", Status); 2659 } 2660 2661 // remove entry in volume list 2662 2663 vde = Vcb->vde; 2664 2665 if (dev->devobj) { 2666 pdo_device_extension* pdode = vde->pdode; 2667 2668 ExAcquireResourceExclusiveLite(&pdode->child_lock, true); 2669 2670 le = pdode->children.Flink; 2671 while (le != &pdode->children) { 2672 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2673 2674 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) { 2675 PFILE_OBJECT FileObject; 2676 PDEVICE_OBJECT mountmgr; 2677 UNICODE_STRING mmdevpath; 2678 2679 pdode->children_loaded--; 2680 2681 if (vc->had_drive_letter) { // re-add entry to mountmgr 2682 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME); 2683 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr); 2684 if (!NT_SUCCESS(Status)) 2685 ERR("IoGetDeviceObjectPointer returned %08x\n", Status); 2686 else { 2687 MOUNTDEV_NAME mdn; 2688 2689 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL); 2690 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW) 2691 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status); 2692 else { 2693 MOUNTDEV_NAME* mdn2; 2694 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength; 2695 2696 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG); 2697 if (!mdn2) 2698 ERR("out of memory\n"); 2699 else { 2700 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL); 2701 if (!NT_SUCCESS(Status)) 2702 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status); 2703 else { 2704 UNICODE_STRING name; 2705 2706 name.Buffer = mdn2->Name; 2707 name.Length = name.MaximumLength = mdn2->NameLength; 2708 2709 Status = mountmgr_add_drive_letter(mountmgr, &name); 2710 if (!NT_SUCCESS(Status)) 2711 WARN("mountmgr_add_drive_letter returned %08x\n", Status); 2712 } 2713 2714 ExFreePool(mdn2); 2715 } 2716 } 2717 2718 ObDereferenceObject(FileObject); 2719 } 2720 } 2721 2722 ExFreePool(vc->pnp_name.Buffer); 2723 RemoveEntryList(&vc->list_entry); 2724 ExFreePool(vc); 2725 2726 ObDereferenceObject(vc->fileobj); 2727 2728 break; 2729 } 2730 2731 le = le->Flink; 2732 } 2733 2734 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) { 2735 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA; 2736 2737 le = pdode->children.Flink; 2738 while (le != &pdode->children) { 2739 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry); 2740 2741 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) { 2742 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA; 2743 break; 2744 } 2745 2746 le = le->Flink; 2747 } 2748 } 2749 2750 pdode->num_children = Vcb->superblock.num_devices; 2751 2752 ExReleaseResourceLite(&pdode->child_lock); 2753 2754 // free dev 2755 2756 if (dev->trim && !dev->readonly && !Vcb->options.no_trim) 2757 trim_whole_device(dev); 2758 } 2759 2760 while (!IsListEmpty(&dev->space)) { 2761 LIST_ENTRY* le2 = RemoveHeadList(&dev->space); 2762 space* s = CONTAINING_RECORD(le2, space, list_entry); 2763 2764 ExFreePool(s); 2765 } 2766 2767 ExFreePool(dev); 2768 2769 if (Vcb->trim) { 2770 Vcb->trim = false; 2771 2772 le = Vcb->devices.Flink; 2773 while (le != &Vcb->devices) { 2774 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 2775 2776 if (dev2->trim) { 2777 Vcb->trim = true; 2778 break; 2779 } 2780 2781 le = le->Flink; 2782 } 2783 } 2784 2785 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 2786 2787 return STATUS_SUCCESS; 2788 } 2789 2790 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) { 2791 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa; 2792 DEVICE_DATA_SET_RANGE* ranges; 2793 ULONG datalen, i; 2794 KEY searchkey; 2795 traverse_ptr tp; 2796 NTSTATUS Status; 2797 bool b; 2798 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there 2799 LIST_ENTRY* le; 2800 2801 dev->num_trim_entries = 0; 2802 2803 searchkey.obj_id = dev->devitem.dev_id; 2804 searchkey.obj_type = TYPE_DEV_EXTENT; 2805 searchkey.offset = 0; 2806 2807 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL); 2808 if (!NT_SUCCESS(Status)) { 2809 ERR("find_item returned %08x\n", Status); 2810 return; 2811 } 2812 2813 do { 2814 traverse_ptr next_tp; 2815 2816 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) { 2817 if (tp.item->size >= sizeof(DEV_EXTENT)) { 2818 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data; 2819 2820 if (tp.item->key.offset > lastoff) 2821 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff); 2822 2823 lastoff = tp.item->key.offset + de->length; 2824 } else { 2825 ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT)); 2826 return; 2827 } 2828 } 2829 2830 b = find_next_item(Vcb, &tp, &next_tp, false, NULL); 2831 2832 if (b) { 2833 tp = next_tp; 2834 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type)) 2835 break; 2836 } 2837 } while (b); 2838 2839 if (lastoff < dev->devitem.num_bytes) 2840 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff); 2841 2842 if (dev->num_trim_entries == 0) 2843 return; 2844 2845 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE)); 2846 2847 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG); 2848 if (!dmdsa) { 2849 ERR("out of memory\n"); 2850 goto end; 2851 } 2852 2853 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES); 2854 dmdsa->Action = DeviceDsmAction_Trim; 2855 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED; 2856 dmdsa->ParameterBlockOffset = 0; 2857 dmdsa->ParameterBlockLength = 0; 2858 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)); 2859 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE); 2860 2861 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset); 2862 2863 i = 0; 2864 le = dev->trim_list.Flink; 2865 while (le != &dev->trim_list) { 2866 space* s = CONTAINING_RECORD(le, space, list_entry); 2867 2868 ranges[i].StartingOffset = s->address; 2869 ranges[i].LengthInBytes = s->size; 2870 i++; 2871 2872 le = le->Flink; 2873 } 2874 2875 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL); 2876 if (!NT_SUCCESS(Status)) 2877 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status); 2878 2879 ExFreePool(dmdsa); 2880 2881 end: 2882 while (!IsListEmpty(&dev->trim_list)) { 2883 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry); 2884 ExFreePool(s); 2885 } 2886 2887 dev->num_trim_entries = 0; 2888 } 2889 2890 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) { 2891 NTSTATUS Status; 2892 bool changed; 2893 LIST_ENTRY* le; 2894 chunk* rc; 2895 2896 // FIXME - allow with metadata chunks? 2897 2898 while (true) { 2899 rc = NULL; 2900 2901 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 2902 2903 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 2904 2905 // choose the least-used chunk we haven't looked at yet 2906 le = Vcb->chunks.Flink; 2907 while (le != &Vcb->chunks) { 2908 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 2909 2910 // FIXME - skip full-size chunks over e.g. 90% full? 2911 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used)) 2912 rc = c; 2913 2914 le = le->Flink; 2915 } 2916 2917 ExReleaseResourceLite(&Vcb->chunk_lock); 2918 2919 if (!rc) { 2920 ExReleaseResourceLite(&Vcb->tree_lock); 2921 break; 2922 } 2923 2924 if (rc->list_entry_balance.Flink) { 2925 RemoveEntryList(&rc->list_entry_balance); 2926 Vcb->balance.chunks_left--; 2927 } 2928 2929 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped 2930 rc->reloc = true; 2931 2932 ExReleaseResourceLite(&Vcb->tree_lock); 2933 2934 do { 2935 changed = false; 2936 2937 Status = balance_data_chunk(Vcb, rc, &changed); 2938 if (!NT_SUCCESS(Status)) { 2939 ERR("balance_data_chunk returned %08x\n", Status); 2940 Vcb->balance.status = Status; 2941 rc->list_entry_balance.Flink = NULL; 2942 rc->reloc = false; 2943 return Status; 2944 } 2945 2946 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 2947 2948 if (Vcb->readonly) 2949 Vcb->balance.stopping = true; 2950 2951 if (Vcb->balance.stopping) 2952 return STATUS_SUCCESS; 2953 } while (changed); 2954 2955 rc->list_entry_balance.Flink = NULL; 2956 2957 rc->changed = true; 2958 rc->space_changed = true; 2959 rc->balance_num = Vcb->balance.balance_num; 2960 2961 Status = do_write(Vcb, NULL); 2962 if (!NT_SUCCESS(Status)) { 2963 ERR("do_write returned %08x\n", Status); 2964 return Status; 2965 } 2966 2967 free_trees(Vcb); 2968 } 2969 2970 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 2971 2972 Status = alloc_chunk(Vcb, flags, &rc, true); 2973 2974 ExReleaseResourceLite(&Vcb->chunk_lock); 2975 2976 if (NT_SUCCESS(Status)) { 2977 *newchunk = rc; 2978 return Status; 2979 } else { 2980 ERR("alloc_chunk returned %08x\n", Status); 2981 return Status; 2982 } 2983 } 2984 2985 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) { 2986 LIST_ENTRY* le; 2987 2988 while (!IsListEmpty(&dev->space)) { 2989 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry); 2990 2991 ExFreePool(s); 2992 } 2993 2994 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device. 2995 2996 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL); 2997 2998 le = Vcb->chunks.Flink; 2999 while (le != &Vcb->chunks) { 3000 uint16_t n; 3001 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3002 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1]; 3003 3004 for (n = 0; n < c->chunk_item->num_stripes; n++) { 3005 uint64_t stripe_size = 0; 3006 3007 if (cis[n].dev_id == dev->devitem.dev_id) { 3008 if (stripe_size == 0) { 3009 uint16_t factor; 3010 3011 if (c->chunk_item->type & BLOCK_FLAG_RAID0) 3012 factor = c->chunk_item->num_stripes; 3013 else if (c->chunk_item->type & BLOCK_FLAG_RAID10) 3014 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes; 3015 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) 3016 factor = c->chunk_item->num_stripes - 1; 3017 else if (c->chunk_item->type & BLOCK_FLAG_RAID6) 3018 factor = c->chunk_item->num_stripes - 2; 3019 else // SINGLE, DUP, RAID1 3020 factor = 1; 3021 3022 stripe_size = c->chunk_item->size / factor; 3023 } 3024 3025 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL); 3026 } 3027 } 3028 3029 le = le->Flink; 3030 } 3031 3032 return STATUS_SUCCESS; 3033 } 3034 3035 _Function_class_(KSTART_ROUTINE) 3036 void __stdcall balance_thread(void* context) { 3037 device_extension* Vcb = (device_extension*)context; 3038 LIST_ENTRY chunks; 3039 LIST_ENTRY* le; 3040 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0; 3041 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0; 3042 NTSTATUS Status; 3043 3044 Vcb->balance.balance_num++; 3045 3046 Vcb->balance.stopping = false; 3047 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false); 3048 3049 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3050 old_data_flags = Vcb->data_flags; 3051 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert); 3052 3053 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3054 } 3055 3056 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3057 old_metadata_flags = Vcb->metadata_flags; 3058 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert); 3059 } 3060 3061 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3062 old_system_flags = Vcb->system_flags; 3063 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert); 3064 } 3065 3066 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) { 3067 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3068 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3069 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) 3070 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3071 } 3072 3073 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0; 3074 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0; 3075 3076 InitializeListHead(&chunks); 3077 3078 // FIXME - what are we supposed to do with limit_start? 3079 3080 if (!Vcb->readonly) { 3081 if (!Vcb->balance.removing && !Vcb->balance.shrinking) { 3082 Status = add_balance_item(Vcb); 3083 if (!NT_SUCCESS(Status)) { 3084 ERR("add_balance_item returned %08x\n", Status); 3085 Vcb->balance.status = Status; 3086 goto end; 3087 } 3088 } else { 3089 if (Vcb->need_write) { 3090 Status = do_write(Vcb, NULL); 3091 3092 free_trees(Vcb); 3093 3094 if (!NT_SUCCESS(Status)) { 3095 ERR("do_write returned %08x\n", Status); 3096 Vcb->balance.status = Status; 3097 goto end; 3098 } 3099 } 3100 } 3101 } 3102 3103 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3104 3105 if (Vcb->balance.stopping) 3106 goto end; 3107 3108 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3109 3110 le = Vcb->chunks.Flink; 3111 while (le != &Vcb->chunks) { 3112 chunk* c = CONTAINING_RECORD(le, chunk, list_entry); 3113 uint8_t sort; 3114 3115 acquire_chunk_lock(c, Vcb); 3116 3117 if (c->chunk_item->type & BLOCK_FLAG_DATA) 3118 sort = BALANCE_OPTS_DATA; 3119 else if (c->chunk_item->type & BLOCK_FLAG_METADATA) 3120 sort = BALANCE_OPTS_METADATA; 3121 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM) 3122 sort = BALANCE_OPTS_SYSTEM; 3123 else { 3124 ERR("unexpected chunk type %I64x\n", c->chunk_item->type); 3125 release_chunk_lock(c, Vcb); 3126 break; 3127 } 3128 3129 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) && 3130 should_balance_chunk(Vcb, sort, c)) { 3131 InsertTailList(&chunks, &c->list_entry_balance); 3132 3133 num_chunks[sort]++; 3134 Vcb->balance.total_chunks++; 3135 Vcb->balance.chunks_left++; 3136 } else if (sort == BALANCE_OPTS_METADATA) 3137 okay_metadata_chunks++; 3138 else if (sort == BALANCE_OPTS_DATA) 3139 okay_data_chunks++; 3140 else if (sort == BALANCE_OPTS_SYSTEM) 3141 okay_system_chunks++; 3142 3143 if (!c->cache_loaded) { 3144 Status = load_cache_chunk(Vcb, c, NULL); 3145 3146 if (!NT_SUCCESS(Status)) { 3147 ERR("load_cache_chunk returned %08x\n", Status); 3148 Vcb->balance.status = Status; 3149 release_chunk_lock(c, Vcb); 3150 ExReleaseResourceLite(&Vcb->chunk_lock); 3151 goto end; 3152 } 3153 } 3154 3155 release_chunk_lock(c, Vcb); 3156 3157 le = le->Flink; 3158 } 3159 3160 ExReleaseResourceLite(&Vcb->chunk_lock); 3161 3162 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up 3163 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) { 3164 bool consolidated = false; 3165 chunk* c; 3166 3167 if (okay_metadata_chunks == 0) { 3168 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3169 3170 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true); 3171 if (NT_SUCCESS(Status)) 3172 c->balance_num = Vcb->balance.balance_num; 3173 else if (Status != STATUS_DISK_FULL || consolidated) { 3174 ERR("alloc_chunk returned %08x\n", Status); 3175 ExReleaseResourceLite(&Vcb->chunk_lock); 3176 Vcb->balance.status = Status; 3177 goto end; 3178 } 3179 3180 ExReleaseResourceLite(&Vcb->chunk_lock); 3181 3182 if (Status == STATUS_DISK_FULL) { 3183 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c); 3184 if (!NT_SUCCESS(Status)) { 3185 ERR("try_consolidation returned %08x\n", Status); 3186 Vcb->balance.status = Status; 3187 goto end; 3188 } else 3189 c->balance_num = Vcb->balance.balance_num; 3190 3191 consolidated = true; 3192 3193 if (Vcb->balance.stopping) 3194 goto end; 3195 } 3196 } 3197 3198 if (okay_data_chunks == 0) { 3199 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3200 3201 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true); 3202 if (NT_SUCCESS(Status)) 3203 c->balance_num = Vcb->balance.balance_num; 3204 else if (Status != STATUS_DISK_FULL || consolidated) { 3205 ERR("alloc_chunk returned %08x\n", Status); 3206 ExReleaseResourceLite(&Vcb->chunk_lock); 3207 Vcb->balance.status = Status; 3208 goto end; 3209 } 3210 3211 ExReleaseResourceLite(&Vcb->chunk_lock); 3212 3213 if (Status == STATUS_DISK_FULL) { 3214 Status = try_consolidation(Vcb, Vcb->data_flags, &c); 3215 if (!NT_SUCCESS(Status)) { 3216 ERR("try_consolidation returned %08x\n", Status); 3217 Vcb->balance.status = Status; 3218 goto end; 3219 } else 3220 c->balance_num = Vcb->balance.balance_num; 3221 3222 consolidated = true; 3223 3224 if (Vcb->balance.stopping) 3225 goto end; 3226 } 3227 } 3228 3229 if (okay_system_chunks == 0) { 3230 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true); 3231 3232 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true); 3233 if (NT_SUCCESS(Status)) 3234 c->balance_num = Vcb->balance.balance_num; 3235 else if (Status != STATUS_DISK_FULL || consolidated) { 3236 ERR("alloc_chunk returned %08x\n", Status); 3237 ExReleaseResourceLite(&Vcb->chunk_lock); 3238 Vcb->balance.status = Status; 3239 goto end; 3240 } 3241 3242 ExReleaseResourceLite(&Vcb->chunk_lock); 3243 3244 if (Status == STATUS_DISK_FULL) { 3245 Status = try_consolidation(Vcb, Vcb->system_flags, &c); 3246 if (!NT_SUCCESS(Status)) { 3247 ERR("try_consolidation returned %08x\n", Status); 3248 Vcb->balance.status = Status; 3249 goto end; 3250 } else 3251 c->balance_num = Vcb->balance.balance_num; 3252 3253 consolidated = true; 3254 3255 if (Vcb->balance.stopping) 3256 goto end; 3257 } 3258 } 3259 } 3260 3261 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true); 3262 3263 le = chunks.Flink; 3264 while (le != &chunks) { 3265 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3266 3267 c->reloc = true; 3268 3269 le = le->Flink; 3270 } 3271 3272 ExReleaseResourceLite(&Vcb->chunk_lock); 3273 3274 // do data chunks before metadata 3275 le = chunks.Flink; 3276 while (le != &chunks) { 3277 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3278 LIST_ENTRY* le2 = le->Flink; 3279 3280 if (c->chunk_item->type & BLOCK_FLAG_DATA) { 3281 bool changed; 3282 3283 do { 3284 changed = false; 3285 3286 Status = balance_data_chunk(Vcb, c, &changed); 3287 if (!NT_SUCCESS(Status)) { 3288 ERR("balance_data_chunk returned %08x\n", Status); 3289 Vcb->balance.status = Status; 3290 goto end; 3291 } 3292 3293 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3294 3295 if (Vcb->readonly) 3296 Vcb->balance.stopping = true; 3297 3298 if (Vcb->balance.stopping) 3299 break; 3300 } while (changed); 3301 3302 c->changed = true; 3303 c->space_changed = true; 3304 } 3305 3306 if (Vcb->balance.stopping) 3307 goto end; 3308 3309 if (c->chunk_item->type & BLOCK_FLAG_DATA && 3310 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) { 3311 RemoveEntryList(&c->list_entry_balance); 3312 c->list_entry_balance.Flink = NULL; 3313 3314 Vcb->balance.chunks_left--; 3315 } 3316 3317 le = le2; 3318 } 3319 3320 // do metadata chunks 3321 while (!IsListEmpty(&chunks)) { 3322 chunk* c; 3323 bool changed; 3324 3325 le = RemoveHeadList(&chunks); 3326 c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3327 3328 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) { 3329 do { 3330 Status = balance_metadata_chunk(Vcb, c, &changed); 3331 if (!NT_SUCCESS(Status)) { 3332 ERR("balance_metadata_chunk returned %08x\n", Status); 3333 Vcb->balance.status = Status; 3334 goto end; 3335 } 3336 3337 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL); 3338 3339 if (Vcb->readonly) 3340 Vcb->balance.stopping = true; 3341 3342 if (Vcb->balance.stopping) 3343 break; 3344 } while (changed); 3345 3346 c->changed = true; 3347 c->space_changed = true; 3348 } 3349 3350 if (Vcb->balance.stopping) 3351 break; 3352 3353 c->list_entry_balance.Flink = NULL; 3354 3355 Vcb->balance.chunks_left--; 3356 } 3357 3358 end: 3359 if (!Vcb->readonly) { 3360 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3361 le = chunks.Flink; 3362 while (le != &chunks) { 3363 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance); 3364 c->reloc = false; 3365 3366 le = le->Flink; 3367 c->list_entry_balance.Flink = NULL; 3368 } 3369 3370 if (old_data_flags != 0) 3371 Vcb->data_flags = old_data_flags; 3372 3373 if (old_metadata_flags != 0) 3374 Vcb->metadata_flags = old_metadata_flags; 3375 3376 if (old_system_flags != 0) 3377 Vcb->system_flags = old_system_flags; 3378 } 3379 3380 if (Vcb->balance.removing) { 3381 device* dev = NULL; 3382 3383 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3384 3385 le = Vcb->devices.Flink; 3386 while (le != &Vcb->devices) { 3387 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3388 3389 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3390 dev = dev2; 3391 break; 3392 } 3393 3394 le = le->Flink; 3395 } 3396 3397 if (dev) { 3398 if (Vcb->balance.chunks_left == 0) { 3399 Status = finish_removing_device(Vcb, dev); 3400 3401 if (!NT_SUCCESS(Status)) { 3402 ERR("finish_removing_device returned %08x\n", Status); 3403 dev->reloc = false; 3404 } 3405 } else 3406 dev->reloc = false; 3407 } 3408 3409 ExReleaseResourceLite(&Vcb->tree_lock); 3410 } else if (Vcb->balance.shrinking) { 3411 device* dev = NULL; 3412 3413 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3414 3415 le = Vcb->devices.Flink; 3416 while (le != &Vcb->devices) { 3417 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3418 3419 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) { 3420 dev = dev2; 3421 break; 3422 } 3423 3424 le = le->Flink; 3425 } 3426 3427 if (!dev) { 3428 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid); 3429 Vcb->balance.status = STATUS_INTERNAL_ERROR; 3430 } 3431 3432 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) { 3433 if (dev) { 3434 Status = regenerate_space_list(Vcb, dev); 3435 if (!NT_SUCCESS(Status)) 3436 WARN("regenerate_space_list returned %08x\n", Status); 3437 } 3438 } else { 3439 uint64_t old_size; 3440 3441 old_size = dev->devitem.num_bytes; 3442 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start; 3443 3444 Status = update_dev_item(Vcb, dev, NULL); 3445 if (!NT_SUCCESS(Status)) { 3446 ERR("update_dev_item returned %08x\n", Status); 3447 dev->devitem.num_bytes = old_size; 3448 Vcb->balance.status = Status; 3449 3450 Status = regenerate_space_list(Vcb, dev); 3451 if (!NT_SUCCESS(Status)) 3452 WARN("regenerate_space_list returned %08x\n", Status); 3453 } else { 3454 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes; 3455 3456 Status = do_write(Vcb, NULL); 3457 if (!NT_SUCCESS(Status)) 3458 ERR("do_write returned %08x\n", Status); 3459 3460 free_trees(Vcb); 3461 } 3462 } 3463 3464 ExReleaseResourceLite(&Vcb->tree_lock); 3465 3466 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status)) 3467 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE); 3468 } else { 3469 Status = remove_balance_item(Vcb); 3470 if (!NT_SUCCESS(Status)) { 3471 ERR("remove_balance_item returned %08x\n", Status); 3472 goto end; 3473 } 3474 } 3475 3476 if (Vcb->trim && !Vcb->options.no_trim) { 3477 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true); 3478 3479 le = Vcb->devices.Flink; 3480 while (le != &Vcb->devices) { 3481 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3482 3483 if (dev2->devobj && !dev2->readonly && dev2->trim) 3484 trim_unalloc_space(Vcb, dev2); 3485 3486 le = le->Flink; 3487 } 3488 3489 ExReleaseResourceLite(&Vcb->tree_lock); 3490 } 3491 } 3492 3493 ZwClose(Vcb->balance.thread); 3494 Vcb->balance.thread = NULL; 3495 3496 KeSetEvent(&Vcb->balance.finished, 0, false); 3497 } 3498 3499 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3500 NTSTATUS Status; 3501 btrfs_start_balance* bsb = (btrfs_start_balance*)data; 3502 uint8_t i; 3503 3504 if (length < sizeof(btrfs_start_balance) || !data) 3505 return STATUS_INVALID_PARAMETER; 3506 3507 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3508 return STATUS_PRIVILEGE_NOT_HELD; 3509 3510 if (Vcb->locked) { 3511 WARN("cannot start balance while locked\n"); 3512 return STATUS_DEVICE_NOT_READY; 3513 } 3514 3515 if (Vcb->scrub.thread) { 3516 WARN("cannot start balance while scrub running\n"); 3517 return STATUS_DEVICE_NOT_READY; 3518 } 3519 3520 if (Vcb->balance.thread) { 3521 WARN("balance already running\n"); 3522 return STATUS_DEVICE_NOT_READY; 3523 } 3524 3525 if (Vcb->readonly) 3526 return STATUS_MEDIA_WRITE_PROTECTED; 3527 3528 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3529 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) && 3530 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED)) 3531 return STATUS_SUCCESS; 3532 3533 for (i = 0; i < 3; i++) { 3534 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3535 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) { 3536 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 | 3537 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE; 3538 3539 if (bsb->opts[i].profiles == 0) 3540 return STATUS_INVALID_PARAMETER; 3541 } 3542 3543 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) { 3544 if (bsb->opts[i].devid == 0) 3545 return STATUS_INVALID_PARAMETER; 3546 } 3547 3548 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) { 3549 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end) 3550 return STATUS_INVALID_PARAMETER; 3551 } 3552 3553 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) { 3554 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end) 3555 return STATUS_INVALID_PARAMETER; 3556 } 3557 3558 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) { 3559 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start); 3560 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end); 3561 3562 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end) 3563 return STATUS_INVALID_PARAMETER; 3564 } 3565 3566 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) { 3567 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start); 3568 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end); 3569 3570 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3571 return STATUS_INVALID_PARAMETER; 3572 } 3573 3574 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) { 3575 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start); 3576 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end); 3577 3578 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end) 3579 return STATUS_INVALID_PARAMETER; 3580 } 3581 3582 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) { 3583 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 && 3584 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 && 3585 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 && 3586 bsb->opts[i].convert != BLOCK_FLAG_SINGLE) 3587 return STATUS_INVALID_PARAMETER; 3588 } 3589 } 3590 } 3591 3592 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3593 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3594 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3595 3596 Vcb->balance.paused = false; 3597 Vcb->balance.removing = false; 3598 Vcb->balance.shrinking = false; 3599 Vcb->balance.status = STATUS_SUCCESS; 3600 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3601 3602 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb); 3603 if (!NT_SUCCESS(Status)) { 3604 ERR("PsCreateSystemThread returned %08x\n", Status); 3605 return Status; 3606 } 3607 3608 return STATUS_SUCCESS; 3609 } 3610 3611 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) { 3612 KEY searchkey; 3613 traverse_ptr tp; 3614 NTSTATUS Status; 3615 BALANCE_ITEM* bi; 3616 int i; 3617 3618 searchkey.obj_id = BALANCE_ITEM_ID; 3619 searchkey.obj_type = TYPE_TEMP_ITEM; 3620 searchkey.offset = 0; 3621 3622 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL); 3623 if (!NT_SUCCESS(Status)) { 3624 ERR("find_item returned %08x\n", Status); 3625 return Status; 3626 } 3627 3628 if (keycmp(tp.item->key, searchkey)) { 3629 TRACE("no balance item found\n"); 3630 return STATUS_NOT_FOUND; 3631 } 3632 3633 if (tp.item->size < sizeof(BALANCE_ITEM)) { 3634 WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, 3635 tp.item->size, sizeof(BALANCE_ITEM)); 3636 return STATUS_INTERNAL_ERROR; 3637 } 3638 3639 bi = (BALANCE_ITEM*)tp.item->data; 3640 3641 if (bi->flags & BALANCE_FLAGS_DATA) 3642 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data); 3643 3644 if (bi->flags & BALANCE_FLAGS_METADATA) 3645 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata); 3646 3647 if (bi->flags & BALANCE_FLAGS_SYSTEM) 3648 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system); 3649 3650 // do the heuristics that Linux driver does 3651 3652 for (i = 0; i < 3; i++) { 3653 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) { 3654 // if converting, don't redo chunks already done 3655 3656 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3657 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT; 3658 3659 // don't balance chunks more than 90% filled - presumably these 3660 // have already been done 3661 3662 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) && 3663 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) 3664 ) { 3665 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE; 3666 Vcb->balance.opts[i].usage_start = 0; 3667 Vcb->balance.opts[i].usage_end = 90; 3668 } 3669 } 3670 } 3671 3672 if (Vcb->readonly || Vcb->options.skip_balance) 3673 Vcb->balance.paused = true; 3674 else 3675 Vcb->balance.paused = false; 3676 3677 Vcb->balance.removing = false; 3678 Vcb->balance.shrinking = false; 3679 Vcb->balance.status = STATUS_SUCCESS; 3680 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3681 3682 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb); 3683 if (!NT_SUCCESS(Status)) { 3684 ERR("PsCreateSystemThread returned %08x\n", Status); 3685 return Status; 3686 } 3687 3688 return STATUS_SUCCESS; 3689 } 3690 3691 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) { 3692 btrfs_query_balance* bqb = (btrfs_query_balance*)data; 3693 3694 if (length < sizeof(btrfs_query_balance) || !data) 3695 return STATUS_INVALID_PARAMETER; 3696 3697 if (!Vcb->balance.thread) { 3698 bqb->status = BTRFS_BALANCE_STOPPED; 3699 3700 if (!NT_SUCCESS(Vcb->balance.status)) { 3701 bqb->status |= BTRFS_BALANCE_ERROR; 3702 bqb->error = Vcb->balance.status; 3703 } 3704 3705 return STATUS_SUCCESS; 3706 } 3707 3708 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING; 3709 3710 if (Vcb->balance.removing) 3711 bqb->status |= BTRFS_BALANCE_REMOVAL; 3712 3713 if (Vcb->balance.shrinking) 3714 bqb->status |= BTRFS_BALANCE_SHRINKING; 3715 3716 if (!NT_SUCCESS(Vcb->balance.status)) 3717 bqb->status |= BTRFS_BALANCE_ERROR; 3718 3719 bqb->chunks_left = Vcb->balance.chunks_left; 3720 bqb->total_chunks = Vcb->balance.total_chunks; 3721 bqb->error = Vcb->balance.status; 3722 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts)); 3723 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts)); 3724 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts)); 3725 3726 return STATUS_SUCCESS; 3727 } 3728 3729 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3730 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3731 return STATUS_PRIVILEGE_NOT_HELD; 3732 3733 if (!Vcb->balance.thread) 3734 return STATUS_DEVICE_NOT_READY; 3735 3736 if (Vcb->balance.paused) 3737 return STATUS_DEVICE_NOT_READY; 3738 3739 Vcb->balance.paused = true; 3740 KeClearEvent(&Vcb->balance.event); 3741 3742 return STATUS_SUCCESS; 3743 } 3744 3745 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3746 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3747 return STATUS_PRIVILEGE_NOT_HELD; 3748 3749 if (!Vcb->balance.thread) 3750 return STATUS_DEVICE_NOT_READY; 3751 3752 if (!Vcb->balance.paused) 3753 return STATUS_DEVICE_NOT_READY; 3754 3755 if (Vcb->readonly) 3756 return STATUS_MEDIA_WRITE_PROTECTED; 3757 3758 Vcb->balance.paused = false; 3759 KeSetEvent(&Vcb->balance.event, 0, false); 3760 3761 return STATUS_SUCCESS; 3762 } 3763 3764 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) { 3765 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3766 return STATUS_PRIVILEGE_NOT_HELD; 3767 3768 if (!Vcb->balance.thread) 3769 return STATUS_DEVICE_NOT_READY; 3770 3771 Vcb->balance.paused = false; 3772 Vcb->balance.stopping = true; 3773 Vcb->balance.status = STATUS_SUCCESS; 3774 KeSetEvent(&Vcb->balance.event, 0, false); 3775 3776 return STATUS_SUCCESS; 3777 } 3778 3779 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) { 3780 uint64_t devid; 3781 LIST_ENTRY* le; 3782 device* dev = NULL; 3783 NTSTATUS Status; 3784 int i; 3785 uint64_t num_rw_devices; 3786 3787 TRACE("(%p, %p, %x)\n", Vcb, data, length); 3788 3789 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode)) 3790 return STATUS_PRIVILEGE_NOT_HELD; 3791 3792 if (length < sizeof(uint64_t)) 3793 return STATUS_INVALID_PARAMETER; 3794 3795 devid = *(uint64_t*)data; 3796 3797 ExAcquireResourceSharedLite(&Vcb->tree_lock, true); 3798 3799 if (Vcb->readonly) { 3800 ExReleaseResourceLite(&Vcb->tree_lock); 3801 return STATUS_MEDIA_WRITE_PROTECTED; 3802 } 3803 3804 num_rw_devices = 0; 3805 3806 le = Vcb->devices.Flink; 3807 while (le != &Vcb->devices) { 3808 device* dev2 = CONTAINING_RECORD(le, device, list_entry); 3809 3810 if (dev2->devitem.dev_id == devid) 3811 dev = dev2; 3812 3813 if (!dev2->readonly) 3814 num_rw_devices++; 3815 3816 le = le->Flink; 3817 } 3818 3819 if (!dev) { 3820 ExReleaseResourceLite(&Vcb->tree_lock); 3821 WARN("device %I64x not found\n", devid); 3822 return STATUS_NOT_FOUND; 3823 } 3824 3825 if (!dev->readonly) { 3826 if (num_rw_devices == 1) { 3827 ExReleaseResourceLite(&Vcb->tree_lock); 3828 WARN("not removing last non-readonly device\n"); 3829 return STATUS_INVALID_PARAMETER; 3830 } 3831 3832 if (num_rw_devices == 4 && 3833 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) || 3834 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6)) 3835 ) { 3836 ExReleaseResourceLite(&Vcb->tree_lock); 3837 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n"); 3838 return STATUS_CANNOT_DELETE; 3839 } 3840 3841 if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) { 3842 ExReleaseResourceLite(&Vcb->tree_lock); 3843 ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n"); 3844 return STATUS_CANNOT_DELETE; 3845 } 3846 3847 if (num_rw_devices == 2 && 3848 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) || 3849 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1)) 3850 ) { 3851 ExReleaseResourceLite(&Vcb->tree_lock); 3852 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n"); 3853 return STATUS_CANNOT_DELETE; 3854 } 3855 } 3856 3857 ExReleaseResourceLite(&Vcb->tree_lock); 3858 3859 if (Vcb->balance.thread) { 3860 WARN("balance already running\n"); 3861 return STATUS_DEVICE_NOT_READY; 3862 } 3863 3864 dev->reloc = true; 3865 3866 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3); 3867 3868 for (i = 0; i < 3; i++) { 3869 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID; 3870 Vcb->balance.opts[i].devid = devid; 3871 } 3872 3873 Vcb->balance.paused = false; 3874 Vcb->balance.removing = true; 3875 Vcb->balance.shrinking = false; 3876 Vcb->balance.status = STATUS_SUCCESS; 3877 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused); 3878 3879 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb); 3880 if (!NT_SUCCESS(Status)) { 3881 ERR("PsCreateSystemThread returned %08x\n", Status); 3882 dev->reloc = false; 3883 return Status; 3884 } 3885 3886 return STATUS_SUCCESS; 3887 } 3888