1 /* Copyright (c) Mark Harmstone 2016-17
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22
// A metadata tree block queued for relocation, together with the back references that point at it.
typedef struct {
    uint64_t address;       // current address of the tree block (obj_id of its extent item key)
    uint64_t new_address;   // address chosen for the relocated copy in write_metadata_items
    tree_header* data;      // node_size buffer holding the block's contents; NULL until it has been read
    EXTENT_ITEM* ei;        // data of the block's original extent item
    tree* t;                // tree with the same address found via Vcb->trees_hash, or NULL
    bool system;            // true if the block was read from a chunk with BLOCK_FLAG_SYSTEM set
    LIST_ENTRY refs;        // list of metadata_reloc_ref
    LIST_ENTRY list_entry;  // link in the list of blocks being relocated
} metadata_reloc;
33
// One back reference to a metadata_reloc.
typedef struct {
    uint8_t type;   // TYPE_TREE_BLOCK_REF or TYPE_SHARED_BLOCK_REF
    uint64_t hash;  // sort key, filled in by sort_metadata_reloc_refs

    union {
        TREE_BLOCK_REF tbr;     // used when type is TYPE_TREE_BLOCK_REF
        SHARED_BLOCK_REF sbr;   // used when type is TYPE_SHARED_BLOCK_REF
    };

    metadata_reloc* parent; // relocation entry of the referencing (parent) tree block, or NULL
    bool top;               // set for a tree block ref when no parent tree block was found
    LIST_ENTRY list_entry;  // link in metadata_reloc.refs
} metadata_reloc_ref;
47
// A data extent queued for relocation.
typedef struct {
    uint64_t address;       // current address of the extent; compared against EXTENT_DATA2 addresses
    uint64_t size;          // presumably the extent length in bytes - not used in the code visible here, confirm
    uint64_t new_address;   // address written into EXTENT_DATA2 items that pointed at `address`
    chunk* newchunk;        // presumably the chunk that contains new_address - confirm against its users
    EXTENT_ITEM* ei;        // presumably the extent's original extent item, as in metadata_reloc - confirm
    LIST_ENTRY refs;        // presumably a list of data_reloc_ref - confirm
    LIST_ENTRY list_entry;  // link in the data_items list walked by write_metadata_items
} data_reloc;
57
// One back reference to a data_reloc.
// NOTE(review): not used in the part of the file visible here; the field notes below are
// inferred from the parallel metadata_reloc_ref structure and should be confirmed.
typedef struct {
    uint8_t type;   // presumably TYPE_EXTENT_DATA_REF or TYPE_SHARED_DATA_REF, selecting the union member
    uint64_t hash;  // presumably a sort key, as in metadata_reloc_ref

    union {
        EXTENT_DATA_REF edr;
        SHARED_DATA_REF sdr;
    };

    metadata_reloc* parent; // presumably the relocation entry of the tree block holding the reference
    LIST_ENTRY list_entry;  // presumably a link in data_reloc.refs
} data_reloc_ref;
70
#define BALANCE_UNIT 0x100000 // only read 1 MB (0x100000 bytes) at a time
72
73 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
74 bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
75 NTSTATUS Status;
76 metadata_reloc* mr;
77 EXTENT_ITEM* ei;
78 uint16_t len;
79 uint64_t inline_rc;
80 uint8_t* ptr;
81
82 mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
83 if (!mr) {
84 ERR("out of memory\n");
85 return STATUS_INSUFFICIENT_RESOURCES;
86 }
87
88 mr->address = tp->item->key.obj_id;
89 mr->data = NULL;
90 mr->ei = (EXTENT_ITEM*)tp->item->data;
91 mr->system = false;
92 InitializeListHead(&mr->refs);
93
94 Status = delete_tree_item(Vcb, tp);
95 if (!NT_SUCCESS(Status)) {
96 ERR("delete_tree_item returned %08lx\n", Status);
97 ExFreePool(mr);
98 return Status;
99 }
100
101 if (!c)
102 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
103
104 if (c) {
105 acquire_chunk_lock(c, Vcb);
106
107 c->used -= Vcb->superblock.node_size;
108
109 space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
110
111 release_chunk_lock(c, Vcb);
112 }
113
114 ei = (EXTENT_ITEM*)tp->item->data;
115 inline_rc = 0;
116
117 len = tp->item->size - sizeof(EXTENT_ITEM);
118 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
119 if (!skinny) {
120 len -= sizeof(EXTENT_ITEM2);
121 ptr += sizeof(EXTENT_ITEM2);
122 }
123
124 while (len > 0) {
125 uint8_t secttype = *ptr;
126 uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
127 metadata_reloc_ref* ref;
128
129 len--;
130
131 if (sectlen > len) {
132 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
133 return STATUS_INTERNAL_ERROR;
134 }
135
136 if (sectlen == 0) {
137 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
138 return STATUS_INTERNAL_ERROR;
139 }
140
141 ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
142 if (!ref) {
143 ERR("out of memory\n");
144 return STATUS_INSUFFICIENT_RESOURCES;
145 }
146
147 if (secttype == TYPE_TREE_BLOCK_REF) {
148 ref->type = TYPE_TREE_BLOCK_REF;
149 RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
150 inline_rc++;
151 } else if (secttype == TYPE_SHARED_BLOCK_REF) {
152 ref->type = TYPE_SHARED_BLOCK_REF;
153 RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
154 inline_rc++;
155 } else {
156 ERR("unexpected tree type %x\n", secttype);
157 ExFreePool(ref);
158 return STATUS_INTERNAL_ERROR;
159 }
160
161 ref->parent = NULL;
162 ref->top = false;
163 InsertTailList(&mr->refs, &ref->list_entry);
164
165 len -= sectlen;
166 ptr += sizeof(uint8_t) + sectlen;
167 }
168
169 if (inline_rc < ei->refcount) { // look for non-inline entries
170 traverse_ptr tp2 = *tp, next_tp;
171
172 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
173 tp2 = next_tp;
174
175 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
176 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
177 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
178 if (!ref) {
179 ERR("out of memory\n");
180 return STATUS_INSUFFICIENT_RESOURCES;
181 }
182
183 ref->type = TYPE_TREE_BLOCK_REF;
184 ref->tbr.offset = tp2.item->key.offset;
185 ref->parent = NULL;
186 ref->top = false;
187 InsertTailList(&mr->refs, &ref->list_entry);
188
189 Status = delete_tree_item(Vcb, &tp2);
190 if (!NT_SUCCESS(Status)) {
191 ERR("delete_tree_item returned %08lx\n", Status);
192 return Status;
193 }
194 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
195 metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
196 if (!ref) {
197 ERR("out of memory\n");
198 return STATUS_INSUFFICIENT_RESOURCES;
199 }
200
201 ref->type = TYPE_SHARED_BLOCK_REF;
202 ref->sbr.offset = tp2.item->key.offset;
203 ref->parent = NULL;
204 ref->top = false;
205 InsertTailList(&mr->refs, &ref->list_entry);
206
207 Status = delete_tree_item(Vcb, &tp2);
208 if (!NT_SUCCESS(Status)) {
209 ERR("delete_tree_item returned %08lx\n", Status);
210 return Status;
211 }
212 }
213 } else
214 break;
215 }
216 }
217
218 InsertTailList(items, &mr->list_entry);
219
220 if (mr2)
221 *mr2 = mr;
222
223 return STATUS_SUCCESS;
224 }
225
226 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
227 uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
228 LIST_ENTRY* le;
229 KEY searchkey;
230 traverse_ptr tp;
231 bool skinny = false;
232 NTSTATUS Status;
233
234 le = items->Flink;
235 while (le != items) {
236 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
237
238 if (mr->address == address) {
239 *mr2 = mr;
240 return STATUS_SUCCESS;
241 }
242
243 le = le->Flink;
244 }
245
246 searchkey.obj_id = address;
247 searchkey.obj_type = TYPE_METADATA_ITEM;
248 searchkey.offset = 0xffffffffffffffff;
249
250 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
251 if (!NT_SUCCESS(Status)) {
252 ERR("find_item returned %08lx\n", Status);
253 return Status;
254 }
255
256 if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
257 skinny = true;
258 else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
259 tp.item->size >= sizeof(EXTENT_ITEM)) {
260 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
261
262 if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
263 ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
264 return STATUS_INTERNAL_ERROR;
265 }
266 } else {
267 ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
268 return STATUS_INTERNAL_ERROR;
269 }
270
271 Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
272 if (!NT_SUCCESS(Status)) {
273 ERR("add_metadata_reloc returned %08lx\n", Status);
274 return Status;
275 }
276
277 return STATUS_SUCCESS;
278 }
279
/* Sorts mr->refs in place with an insertion sort.
 *
 * Entries are ordered by ascending type and, within a type, by descending
 * hash. The hash of each entry is (re)computed here: the root id for tree
 * block refs, the parent's new address for shared block refs.
 */
static void sort_metadata_reloc_refs(metadata_reloc* mr) {
    LIST_ENTRY sorted;

    // Flink and Blink coincide only for an empty or single-entry list
    if (mr->refs.Flink == mr->refs.Blink)
        return;

    InitializeListHead(&sorted);

    while (!IsListEmpty(&mr->refs)) {
        metadata_reloc_ref* cur = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
        LIST_ENTRY* pos;

        if (cur->type == TYPE_TREE_BLOCK_REF)
            cur->hash = cur->tbr.offset;
        else if (cur->type == TYPE_SHARED_BLOCK_REF)
            cur->hash = cur->parent->new_address;

        // find the first sorted entry that has to come after cur
        for (pos = sorted.Flink; pos != &sorted; pos = pos->Flink) {
            metadata_reloc_ref* other = CONTAINING_RECORD(pos, metadata_reloc_ref, list_entry);

            if (cur->type < other->type || (cur->type == other->type && cur->hash > other->hash))
                break;
        }

        // links cur directly in front of pos; with pos == &sorted this appends at the tail
        InsertTailList(pos, &cur->list_entry);
    }

    // move the sorted chain back under the original list head
    mr->refs.Flink = sorted.Flink;
    mr->refs.Blink = sorted.Blink;
    mr->refs.Flink->Blink = &mr->refs;
    mr->refs.Blink->Flink = &mr->refs;
}
321
322 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
323 NTSTATUS Status;
324 LIST_ENTRY* le;
325 uint64_t rc = 0;
326 uint16_t inline_len;
327 bool all_inline = true;
328 metadata_reloc_ref* first_noninline = NULL;
329 EXTENT_ITEM* ei;
330 uint8_t* ptr;
331
332 inline_len = sizeof(EXTENT_ITEM);
333 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
334 inline_len += sizeof(EXTENT_ITEM2);
335
336 sort_metadata_reloc_refs(mr);
337
338 le = mr->refs.Flink;
339 while (le != &mr->refs) {
340 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
341 uint16_t extlen = 0;
342
343 rc++;
344
345 if (ref->type == TYPE_TREE_BLOCK_REF)
346 extlen += sizeof(TREE_BLOCK_REF);
347 else if (ref->type == TYPE_SHARED_BLOCK_REF)
348 extlen += sizeof(SHARED_BLOCK_REF);
349
350 if (all_inline) {
351 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
352 all_inline = false;
353 first_noninline = ref;
354 } else
355 inline_len += extlen + 1;
356 }
357
358 le = le->Flink;
359 }
360
361 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
362 if (!ei) {
363 ERR("out of memory\n");
364 return STATUS_INSUFFICIENT_RESOURCES;
365 }
366
367 ei->refcount = rc;
368 ei->generation = mr->ei->generation;
369 ei->flags = mr->ei->flags;
370 ptr = (uint8_t*)&ei[1];
371
372 if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
373 EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
374
375 ei2->firstitem = *(KEY*)&mr->data[1];
376 ei2->level = mr->data->level;
377
378 ptr += sizeof(EXTENT_ITEM2);
379 }
380
381 le = mr->refs.Flink;
382 while (le != &mr->refs) {
383 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
384
385 if (ref == first_noninline)
386 break;
387
388 *ptr = ref->type;
389 ptr++;
390
391 if (ref->type == TYPE_TREE_BLOCK_REF) {
392 TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
393
394 tbr->offset = ref->tbr.offset;
395
396 ptr += sizeof(TREE_BLOCK_REF);
397 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
398 SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
399
400 sbr->offset = ref->parent->new_address;
401
402 ptr += sizeof(SHARED_BLOCK_REF);
403 }
404
405 le = le->Flink;
406 }
407
408 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
409 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
410 else
411 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
412
413 if (!NT_SUCCESS(Status)) {
414 ERR("insert_tree_item returned %08lx\n", Status);
415 ExFreePool(ei);
416 return Status;
417 }
418
419 if (!all_inline) {
420 le = &first_noninline->list_entry;
421
422 while (le != &mr->refs) {
423 metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
424
425 if (ref->type == TYPE_TREE_BLOCK_REF) {
426 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
427 if (!NT_SUCCESS(Status)) {
428 ERR("insert_tree_item returned %08lx\n", Status);
429 return Status;
430 }
431 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
432 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
433 if (!NT_SUCCESS(Status)) {
434 ERR("insert_tree_item returned %08lx\n", Status);
435 return Status;
436 }
437 }
438
439 le = le->Flink;
440 }
441 }
442
443 if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
444 if (mr->data->level > 0) {
445 uint16_t i;
446 internal_node* in = (internal_node*)&mr->data[1];
447
448 for (i = 0; i < mr->data->num_items; i++) {
449 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
450
451 if (sbrrc > 0) {
452 SHARED_BLOCK_REF sbr;
453
454 sbr.offset = mr->new_address;
455
456 Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
457 if (!NT_SUCCESS(Status)) {
458 ERR("increase_extent_refcount returned %08lx\n", Status);
459 return Status;
460 }
461
462 sbr.offset = mr->address;
463
464 Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
465 sbr.offset, false, NULL);
466 if (!NT_SUCCESS(Status)) {
467 ERR("decrease_extent_refcount returned %08lx\n", Status);
468 return Status;
469 }
470 }
471 }
472 } else {
473 uint16_t i;
474 leaf_node* ln = (leaf_node*)&mr->data[1];
475
476 for (i = 0; i < mr->data->num_items; i++) {
477 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
478 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
479
480 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
481 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
482
483 if (ed2->size > 0) { // not sparse
484 uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
485
486 if (sdrrc > 0) {
487 SHARED_DATA_REF sdr;
488 chunk* c;
489
490 sdr.offset = mr->new_address;
491 sdr.count = sdrrc;
492
493 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
494 if (!NT_SUCCESS(Status)) {
495 ERR("increase_extent_refcount returned %08lx\n", Status);
496 return Status;
497 }
498
499 sdr.offset = mr->address;
500
501 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
502 sdr.offset, false, NULL);
503 if (!NT_SUCCESS(Status)) {
504 ERR("decrease_extent_refcount returned %08lx\n", Status);
505 return Status;
506 }
507
508 c = get_chunk_from_address(Vcb, ed2->address);
509
510 if (c) {
511 // check changed_extents
512
513 ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
514
515 le = c->changed_extents.Flink;
516
517 while (le != &c->changed_extents) {
518 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
519
520 if (ce->address == ed2->address) {
521 LIST_ENTRY* le2;
522
523 le2 = ce->refs.Flink;
524 while (le2 != &ce->refs) {
525 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
526
527 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
528 cer->sdr.offset = mr->new_address;
529 break;
530 }
531
532 le2 = le2->Flink;
533 }
534
535 le2 = ce->old_refs.Flink;
536 while (le2 != &ce->old_refs) {
537 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
538
539 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
540 cer->sdr.offset = mr->new_address;
541 break;
542 }
543
544 le2 = le2->Flink;
545 }
546
547 break;
548 }
549
550 le = le->Flink;
551 }
552
553 ExReleaseResourceLite(&c->changed_extents_lock);
554 }
555 }
556 }
557 }
558 }
559 }
560 }
561 }
562
563 return STATUS_SUCCESS;
564 }
565
566 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
567 LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
568 LIST_ENTRY tree_writes, *le;
569 NTSTATUS Status;
570 traverse_ptr tp;
571 uint8_t level, max_level = 0;
572 chunk* newchunk = NULL;
573
574 InitializeListHead(&tree_writes);
575
576 le = items->Flink;
577 while (le != items) {
578 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
579 LIST_ENTRY* le2;
580 chunk* pc;
581
582 mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
583 if (!mr->data) {
584 ERR("out of memory\n");
585 return STATUS_INSUFFICIENT_RESOURCES;
586 }
587
588 Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
589 c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
590 if (!NT_SUCCESS(Status)) {
591 ERR("read_data returned %08lx\n", Status);
592 return Status;
593 }
594
595 if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
596 mr->system = true;
597
598 if (data_items && mr->data->level == 0) {
599 le2 = data_items->Flink;
600 while (le2 != data_items) {
601 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
602 leaf_node* ln = (leaf_node*)&mr->data[1];
603 uint16_t i;
604
605 for (i = 0; i < mr->data->num_items; i++) {
606 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
607 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
608
609 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
610 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
611
612 if (ed2->address == dr->address)
613 ed2->address = dr->new_address;
614 }
615 }
616 }
617
618 le2 = le2->Flink;
619 }
620 }
621
622 if (mr->data->level > max_level)
623 max_level = mr->data->level;
624
625 le2 = mr->refs.Flink;
626 while (le2 != &mr->refs) {
627 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
628
629 if (ref->type == TYPE_TREE_BLOCK_REF) {
630 KEY* firstitem;
631 root* r = NULL;
632 LIST_ENTRY* le3;
633 tree* t;
634
635 firstitem = (KEY*)&mr->data[1];
636
637 le3 = Vcb->roots.Flink;
638 while (le3 != &Vcb->roots) {
639 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
640
641 if (r2->id == ref->tbr.offset) {
642 r = r2;
643 break;
644 }
645
646 le3 = le3->Flink;
647 }
648
649 if (!r) {
650 ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
651 return STATUS_INTERNAL_ERROR;
652 }
653
654 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
655 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
656 ERR("find_item_to_level returned %08lx\n", Status);
657 return Status;
658 }
659
660 t = tp.tree;
661 while (t && t->header.level < mr->data->level + 1) {
662 t = t->parent;
663 }
664
665 if (!t)
666 ref->top = true;
667 else {
668 metadata_reloc* mr2;
669
670 Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
671 if (!NT_SUCCESS(Status)) {
672 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
673 return Status;
674 }
675
676 ref->parent = mr2;
677 }
678 } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
679 metadata_reloc* mr2;
680
681 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
682 if (!NT_SUCCESS(Status)) {
683 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
684 return Status;
685 }
686
687 ref->parent = mr2;
688 }
689
690 le2 = le2->Flink;
691 }
692
693 le = le->Flink;
694 }
695
696 le = items->Flink;
697 while (le != items) {
698 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
699 LIST_ENTRY* le2;
700 uint32_t hash;
701
702 mr->t = NULL;
703
704 hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
705
706 le2 = Vcb->trees_ptrs[hash >> 24];
707
708 if (le2) {
709 while (le2 != &Vcb->trees_hash) {
710 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
711
712 if (t->header.address == mr->address) {
713 mr->t = t;
714 break;
715 } else if (t->hash > hash)
716 break;
717
718 le2 = le2->Flink;
719 }
720 }
721
722 le = le->Flink;
723 }
724
725 for (level = 0; level <= max_level; level++) {
726 le = items->Flink;
727 while (le != items) {
728 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
729
730 if (mr->data->level == level) {
731 bool done = false;
732 LIST_ENTRY* le2;
733 tree_write* tw;
734 uint64_t flags;
735 tree* t3;
736
737 if (mr->system)
738 flags = Vcb->system_flags;
739 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
740 flags = Vcb->data_flags;
741 else
742 flags = Vcb->metadata_flags;
743
744 if (newchunk) {
745 acquire_chunk_lock(newchunk, Vcb);
746
747 if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
748 newchunk->used += Vcb->superblock.node_size;
749 space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
750 done = true;
751 }
752
753 release_chunk_lock(newchunk, Vcb);
754 }
755
756 if (!done) {
757 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
758
759 le2 = Vcb->chunks.Flink;
760 while (le2 != &Vcb->chunks) {
761 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
762
763 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
764 acquire_chunk_lock(c2, Vcb);
765
766 if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
767 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
768 c2->used += Vcb->superblock.node_size;
769 space_list_subtract(c2, mr->new_address, Vcb->superblock.node_size, rollback);
770 release_chunk_lock(c2, Vcb);
771 newchunk = c2;
772 done = true;
773 break;
774 }
775 }
776
777 release_chunk_lock(c2, Vcb);
778 }
779
780 le2 = le2->Flink;
781 }
782
783 // allocate new chunk if necessary
784 if (!done) {
785 Status = alloc_chunk(Vcb, flags, &newchunk, false);
786
787 if (!NT_SUCCESS(Status)) {
788 ERR("alloc_chunk returned %08lx\n", Status);
789 ExReleaseResourceLite(&Vcb->chunk_lock);
790 goto end;
791 }
792
793 acquire_chunk_lock(newchunk, Vcb);
794
795 newchunk->balance_num = Vcb->balance.balance_num;
796
797 if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
798 release_chunk_lock(newchunk, Vcb);
799 ExReleaseResourceLite(&Vcb->chunk_lock);
800 ERR("could not find address in new chunk\n");
801 Status = STATUS_DISK_FULL;
802 goto end;
803 } else {
804 newchunk->used += Vcb->superblock.node_size;
805 space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
806 }
807
808 release_chunk_lock(newchunk, Vcb);
809 }
810
811 ExReleaseResourceLite(&Vcb->chunk_lock);
812 }
813
814 // update parents
815 le2 = mr->refs.Flink;
816 while (le2 != &mr->refs) {
817 metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
818
819 if (ref->parent) {
820 uint16_t i;
821 internal_node* in = (internal_node*)&ref->parent->data[1];
822
823 for (i = 0; i < ref->parent->data->num_items; i++) {
824 if (in[i].address == mr->address) {
825 in[i].address = mr->new_address;
826 break;
827 }
828 }
829
830 if (ref->parent->t) {
831 LIST_ENTRY* le3;
832
833 le3 = ref->parent->t->itemlist.Flink;
834 while (le3 != &ref->parent->t->itemlist) {
835 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
836
837 if (!td->inserted && td->treeholder.address == mr->address)
838 td->treeholder.address = mr->new_address;
839
840 le3 = le3->Flink;
841 }
842 }
843 } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
844 LIST_ENTRY* le3;
845 root* r = NULL;
846
847 // alter ROOT_ITEM
848
849 le3 = Vcb->roots.Flink;
850 while (le3 != &Vcb->roots) {
851 root* r2 = CONTAINING_RECORD(le3, root, list_entry);
852
853 if (r2->id == ref->tbr.offset) {
854 r = r2;
855 break;
856 }
857
858 le3 = le3->Flink;
859 }
860
861 if (r) {
862 r->treeholder.address = mr->new_address;
863
864 if (r == Vcb->root_root)
865 Vcb->superblock.root_tree_addr = mr->new_address;
866 else if (r == Vcb->chunk_root)
867 Vcb->superblock.chunk_tree_addr = mr->new_address;
868 else if (r->root_item.block_number == mr->address) {
869 KEY searchkey;
870 ROOT_ITEM* ri;
871
872 r->root_item.block_number = mr->new_address;
873
874 searchkey.obj_id = r->id;
875 searchkey.obj_type = TYPE_ROOT_ITEM;
876 searchkey.offset = 0xffffffffffffffff;
877
878 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
879 if (!NT_SUCCESS(Status)) {
880 ERR("find_item returned %08lx\n", Status);
881 goto end;
882 }
883
884 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
885 ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
886 Status = STATUS_INTERNAL_ERROR;
887 goto end;
888 }
889
890 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
891 if (!ri) {
892 ERR("out of memory\n");
893 Status = STATUS_INSUFFICIENT_RESOURCES;
894 goto end;
895 }
896
897 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
898
899 Status = delete_tree_item(Vcb, &tp);
900 if (!NT_SUCCESS(Status)) {
901 ERR("delete_tree_item returned %08lx\n", Status);
902 goto end;
903 }
904
905 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
906 if (!NT_SUCCESS(Status)) {
907 ERR("insert_tree_item returned %08lx\n", Status);
908 goto end;
909 }
910 }
911 }
912 }
913
914 le2 = le2->Flink;
915 }
916
917 mr->data->address = mr->new_address;
918
919 t3 = mr->t;
920
921 while (t3) {
922 uint8_t h;
923 bool inserted;
924 tree* t4 = NULL;
925
926 // check if tree loaded more than once
927 if (t3->list_entry.Flink != &Vcb->trees_hash) {
928 tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
929
930 if (nt->header.address == t3->header.address)
931 t4 = nt;
932 }
933
934 t3->header.address = mr->new_address;
935
936 h = t3->hash >> 24;
937
938 if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
939 if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
940 Vcb->trees_ptrs[h] = NULL;
941 else {
942 tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
943
944 if (t2->hash >> 24 == h)
945 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
946 else
947 Vcb->trees_ptrs[h] = NULL;
948 }
949 }
950
951 RemoveEntryList(&t3->list_entry_hash);
952
953 t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
954 h = t3->hash >> 24;
955
956 if (!Vcb->trees_ptrs[h]) {
957 uint8_t h2 = h;
958
959 le2 = Vcb->trees_hash.Flink;
960
961 if (h2 > 0) {
962 h2--;
963 do {
964 if (Vcb->trees_ptrs[h2]) {
965 le2 = Vcb->trees_ptrs[h2];
966 break;
967 }
968
969 h2--;
970 } while (h2 > 0);
971 }
972 } else
973 le2 = Vcb->trees_ptrs[h];
974
975 inserted = false;
976 while (le2 != &Vcb->trees_hash) {
977 tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
978
979 if (t2->hash >= t3->hash) {
980 InsertHeadList(le2->Blink, &t3->list_entry_hash);
981 inserted = true;
982 break;
983 }
984
985 le2 = le2->Flink;
986 }
987
988 if (!inserted)
989 InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
990
991 if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
992 Vcb->trees_ptrs[h] = &t3->list_entry_hash;
993
994 if (data_items && level == 0) {
995 le2 = data_items->Flink;
996
997 while (le2 != data_items) {
998 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
999 LIST_ENTRY* le3 = t3->itemlist.Flink;
1000
1001 while (le3 != &t3->itemlist) {
1002 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1003
1004 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1005 EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1006
1007 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1008 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1009
1010 if (ed2->address == dr->address)
1011 ed2->address = dr->new_address;
1012 }
1013 }
1014
1015 le3 = le3->Flink;
1016 }
1017
1018 le2 = le2->Flink;
1019 }
1020 }
1021
1022 t3 = t4;
1023 }
1024
1025 calc_tree_checksum(Vcb, mr->data);
1026
1027 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1028 if (!tw) {
1029 ERR("out of memory\n");
1030 Status = STATUS_INSUFFICIENT_RESOURCES;
1031 goto end;
1032 }
1033
1034 tw->address = mr->new_address;
1035 tw->length = Vcb->superblock.node_size;
1036 tw->data = (uint8_t*)mr->data;
1037 tw->allocated = false;
1038
1039 if (IsListEmpty(&tree_writes))
1040 InsertTailList(&tree_writes, &tw->list_entry);
1041 else {
1042 bool inserted = false;
1043
1044 le2 = tree_writes.Flink;
1045 while (le2 != &tree_writes) {
1046 tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1047
1048 if (tw2->address > tw->address) {
1049 InsertHeadList(le2->Blink, &tw->list_entry);
1050 inserted = true;
1051 break;
1052 }
1053
1054 le2 = le2->Flink;
1055 }
1056
1057 if (!inserted)
1058 InsertTailList(&tree_writes, &tw->list_entry);
1059 }
1060 }
1061
1062 le = le->Flink;
1063 }
1064 }
1065
1066 Status = do_tree_writes(Vcb, &tree_writes, true);
1067 if (!NT_SUCCESS(Status)) {
1068 ERR("do_tree_writes returned %08lx\n", Status);
1069 goto end;
1070 }
1071
1072 le = items->Flink;
1073 while (le != items) {
1074 metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1075
1076 Status = add_metadata_reloc_extent_item(Vcb, mr);
1077 if (!NT_SUCCESS(Status)) {
1078 ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1079 goto end;
1080 }
1081
1082 le = le->Flink;
1083 }
1084
1085 Status = STATUS_SUCCESS;
1086
1087 end:
1088 while (!IsListEmpty(&tree_writes)) {
1089 tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1090
1091 if (tw->allocated)
1092 ExFreePool(tw->data);
1093
1094 ExFreePool(tw);
1095 }
1096
1097 return Status;
1098 }
1099
balance_metadata_chunk(device_extension * Vcb,chunk * c,bool * changed)1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1101 KEY searchkey;
1102 traverse_ptr tp;
1103 NTSTATUS Status;
1104 bool b;
1105 LIST_ENTRY items, rollback;
1106 uint32_t loaded = 0;
1107
1108 TRACE("chunk %I64x\n", c->offset);
1109
1110 InitializeListHead(&rollback);
1111 InitializeListHead(&items);
1112
1113 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1114
1115 searchkey.obj_id = c->offset;
1116 searchkey.obj_type = TYPE_METADATA_ITEM;
1117 searchkey.offset = 0xffffffffffffffff;
1118
1119 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1120 if (!NT_SUCCESS(Status)) {
1121 ERR("find_item returned %08lx\n", Status);
1122 goto end;
1123 }
1124
1125 do {
1126 traverse_ptr next_tp;
1127
1128 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129 break;
1130
1131 if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132 bool tree = false, skinny = false;
1133
1134 if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135 tree = true;
1136 skinny = true;
1137 } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138 tp.item->size >= sizeof(EXTENT_ITEM)) {
1139 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140
1141 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1142 tree = true;
1143 }
1144
1145 if (tree) {
1146 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147
1148 if (!NT_SUCCESS(Status)) {
1149 ERR("add_metadata_reloc returned %08lx\n", Status);
1150 goto end;
1151 }
1152
1153 loaded++;
1154
1155 if (loaded >= 64) // only do 64 at a time
1156 break;
1157 }
1158 }
1159
1160 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1161
1162 if (b)
1163 tp = next_tp;
1164 } while (b);
1165
1166 if (IsListEmpty(&items)) {
1167 *changed = false;
1168 Status = STATUS_SUCCESS;
1169 goto end;
1170 } else
1171 *changed = true;
1172
1173 Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1174 if (!NT_SUCCESS(Status)) {
1175 ERR("write_metadata_items returned %08lx\n", Status);
1176 goto end;
1177 }
1178
1179 Status = STATUS_SUCCESS;
1180
1181 Vcb->need_write = true;
1182
1183 end:
1184 if (NT_SUCCESS(Status)) {
1185 Status = do_write(Vcb, NULL);
1186 if (!NT_SUCCESS(Status))
1187 ERR("do_write returned %08lx\n", Status);
1188 }
1189
1190 if (NT_SUCCESS(Status))
1191 clear_rollback(&rollback);
1192 else
1193 do_rollback(Vcb, &rollback);
1194
1195 free_trees(Vcb);
1196
1197 ExReleaseResourceLite(&Vcb->tree_lock);
1198
1199 while (!IsListEmpty(&items)) {
1200 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1201
1202 while (!IsListEmpty(&mr->refs)) {
1203 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1204
1205 ExFreePool(ref);
1206 }
1207
1208 if (mr->data)
1209 ExFreePool(mr->data);
1210
1211 ExFreePool(mr);
1212 }
1213
1214 return Status;
1215 }
1216
1217 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1218 data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1219 NTSTATUS Status;
1220 LIST_ENTRY* le;
1221 KEY searchkey;
1222 traverse_ptr tp;
1223 root* r = NULL;
1224 metadata_reloc* mr;
1225 uint64_t last_tree = 0;
1226 data_reloc_ref* ref;
1227
1228 le = Vcb->roots.Flink;
1229 while (le != &Vcb->roots) {
1230 root* r2 = CONTAINING_RECORD(le, root, list_entry);
1231
1232 if (r2->id == edr->root) {
1233 r = r2;
1234 break;
1235 }
1236
1237 le = le->Flink;
1238 }
1239
1240 if (!r) {
1241 ERR("could not find subvol %I64x\n", edr->root);
1242 return STATUS_INTERNAL_ERROR;
1243 }
1244
1245 searchkey.obj_id = edr->objid;
1246 searchkey.obj_type = TYPE_EXTENT_DATA;
1247 searchkey.offset = 0;
1248
1249 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1250 if (!NT_SUCCESS(Status)) {
1251 ERR("find_item returned %08lx\n", Status);
1252 return Status;
1253 }
1254
1255 if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1256 traverse_ptr tp2;
1257
1258 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1259 tp = tp2;
1260 else {
1261 ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1262 return STATUS_INTERNAL_ERROR;
1263 }
1264 }
1265
1266 ref = NULL;
1267
1268 while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1269 traverse_ptr tp2;
1270
1271 if (tp.item->size >= sizeof(EXTENT_DATA)) {
1272 EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1273
1274 if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1275 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1276
1277 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1278 if (ref && last_tree == tp.tree->header.address)
1279 ref->edr.count++;
1280 else {
1281 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1282 if (!ref) {
1283 ERR("out of memory\n");
1284 return STATUS_INSUFFICIENT_RESOURCES;
1285 }
1286
1287 ref->type = TYPE_EXTENT_DATA_REF;
1288 RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1289 ref->edr.count = 1;
1290
1291 Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1292 if (!NT_SUCCESS(Status)) {
1293 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1294 ExFreePool(ref);
1295 return Status;
1296 }
1297
1298 last_tree = tp.tree->header.address;
1299 ref->parent = mr;
1300
1301 InsertTailList(&dr->refs, &ref->list_entry);
1302 }
1303 }
1304 }
1305 }
1306
1307 if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1308 tp = tp2;
1309 else
1310 break;
1311 }
1312
1313 return STATUS_SUCCESS;
1314 }
1315
1316 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1317 traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1318 NTSTATUS Status;
1319 data_reloc* dr;
1320 EXTENT_ITEM* ei;
1321 uint16_t len;
1322 uint64_t inline_rc;
1323 uint8_t* ptr;
1324
1325 dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1326 if (!dr) {
1327 ERR("out of memory\n");
1328 return STATUS_INSUFFICIENT_RESOURCES;
1329 }
1330
1331 dr->address = tp->item->key.obj_id;
1332 dr->size = tp->item->key.offset;
1333 dr->ei = (EXTENT_ITEM*)tp->item->data;
1334 InitializeListHead(&dr->refs);
1335
1336 Status = delete_tree_item(Vcb, tp);
1337 if (!NT_SUCCESS(Status)) {
1338 ERR("delete_tree_item returned %08lx\n", Status);
1339 return Status;
1340 }
1341
1342 if (!c)
1343 c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1344
1345 if (c) {
1346 acquire_chunk_lock(c, Vcb);
1347
1348 c->used -= tp->item->key.offset;
1349
1350 space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1351
1352 release_chunk_lock(c, Vcb);
1353 }
1354
1355 ei = (EXTENT_ITEM*)tp->item->data;
1356 inline_rc = 0;
1357
1358 len = tp->item->size - sizeof(EXTENT_ITEM);
1359 ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1360
1361 while (len > 0) {
1362 uint8_t secttype = *ptr;
1363 uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1364
1365 len--;
1366
1367 if (sectlen > len) {
1368 ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1369 return STATUS_INTERNAL_ERROR;
1370 }
1371
1372 if (sectlen == 0) {
1373 ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1374 return STATUS_INTERNAL_ERROR;
1375 }
1376
1377 if (secttype == TYPE_EXTENT_DATA_REF) {
1378 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1379
1380 inline_rc += edr->count;
1381
1382 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1383 if (!NT_SUCCESS(Status)) {
1384 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1385 return Status;
1386 }
1387 } else if (secttype == TYPE_SHARED_DATA_REF) {
1388 metadata_reloc* mr;
1389 data_reloc_ref* ref;
1390
1391 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1392 if (!ref) {
1393 ERR("out of memory\n");
1394 return STATUS_INSUFFICIENT_RESOURCES;
1395 }
1396
1397 ref->type = TYPE_SHARED_DATA_REF;
1398 RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1399 inline_rc += ref->sdr.count;
1400
1401 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1402 if (!NT_SUCCESS(Status)) {
1403 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1404 ExFreePool(ref);
1405 return Status;
1406 }
1407
1408 ref->parent = mr;
1409
1410 InsertTailList(&dr->refs, &ref->list_entry);
1411 } else {
1412 ERR("unexpected tree type %x\n", secttype);
1413 return STATUS_INTERNAL_ERROR;
1414 }
1415
1416
1417 len -= sectlen;
1418 ptr += sizeof(uint8_t) + sectlen;
1419 }
1420
1421 if (inline_rc < ei->refcount) { // look for non-inline entries
1422 traverse_ptr tp2 = *tp, next_tp;
1423
1424 while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1425 tp2 = next_tp;
1426
1427 if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1428 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1429 Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1430 if (!NT_SUCCESS(Status)) {
1431 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1432 return Status;
1433 }
1434
1435 Status = delete_tree_item(Vcb, &tp2);
1436 if (!NT_SUCCESS(Status)) {
1437 ERR("delete_tree_item returned %08lx\n", Status);
1438 return Status;
1439 }
1440 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1441 metadata_reloc* mr;
1442 data_reloc_ref* ref;
1443
1444 ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1445 if (!ref) {
1446 ERR("out of memory\n");
1447 return STATUS_INSUFFICIENT_RESOURCES;
1448 }
1449
1450 ref->type = TYPE_SHARED_DATA_REF;
1451 ref->sdr.offset = tp2.item->key.offset;
1452 ref->sdr.count = *((uint32_t*)tp2.item->data);
1453
1454 Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1455 if (!NT_SUCCESS(Status)) {
1456 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1457 ExFreePool(ref);
1458 return Status;
1459 }
1460
1461 ref->parent = mr;
1462 InsertTailList(&dr->refs, &ref->list_entry);
1463
1464 Status = delete_tree_item(Vcb, &tp2);
1465 if (!NT_SUCCESS(Status)) {
1466 ERR("delete_tree_item returned %08lx\n", Status);
1467 return Status;
1468 }
1469 }
1470 } else
1471 break;
1472 }
1473 }
1474
1475 InsertTailList(items, &dr->list_entry);
1476
1477 return STATUS_SUCCESS;
1478 }
1479
/*
 * Sorts and de-duplicates dr->refs in place.
 *
 * Each ref's hash is (re)computed first: get_extent_data_ref_hash2() of
 * root/objid/offset for an EXTENT_DATA_REF, the parent's new address for a
 * SHARED_DATA_REF. Refs are then ordered by ascending type and, within a
 * type, by descending hash. Finally, neighbouring EXTENT_DATA_REFs with the
 * same root, objid and offset are merged into one ref whose count is the sum,
 * and the absorbed ref is freed.
 *
 * Does nothing if dr->refs is empty.
 */
static void sort_data_reloc_refs(data_reloc* dr) {
    LIST_ENTRY sorted, *pos;

    if (IsListEmpty(&dr->refs))
        return;

    InitializeListHead(&sorted);

    // insertion sort into a temporary list
    while (!IsListEmpty(&dr->refs)) {
        data_reloc_ref* cur = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);

        if (cur->type == TYPE_EXTENT_DATA_REF)
            cur->hash = get_extent_data_ref_hash2(cur->edr.root, cur->edr.objid, cur->edr.offset);
        else if (cur->type == TYPE_SHARED_DATA_REF)
            cur->hash = cur->parent->new_address;

        // stop at the first entry cur has to precede; with none, pos ends on the list head
        for (pos = sorted.Flink; pos != &sorted; pos = pos->Flink) {
            data_reloc_ref* other = CONTAINING_RECORD(pos, data_reloc_ref, list_entry);

            if (cur->type < other->type || (cur->type == other->type && cur->hash > other->hash))
                break;
        }

        // inserting at the "tail" of pos links cur in immediately before it
        InsertTailList(pos, &cur->list_entry);
    }

    // fold runs of identical EXTENT_DATA_REFs into their first member
    for (pos = sorted.Flink; pos != &sorted; pos = pos->Flink) {
        data_reloc_ref* keep = CONTAINING_RECORD(pos, data_reloc_ref, list_entry);

        while (pos->Flink != &sorted) {
            data_reloc_ref* dup = CONTAINING_RECORD(pos->Flink, data_reloc_ref, list_entry);

            if (!(keep->type == TYPE_EXTENT_DATA_REF && dup->type == TYPE_EXTENT_DATA_REF && keep->edr.root == dup->edr.root &&
                  keep->edr.objid == dup->edr.objid && keep->edr.offset == dup->edr.offset))
                break;

            RemoveEntryList(&dup->list_entry);
            keep->edr.count += dup->edr.count;
            ExFreePool(dup);
        }
    }

    // hand the sorted chain back to dr->refs (sorted is never empty here)
    dr->refs.Flink = sorted.Flink;
    dr->refs.Blink = sorted.Blink;
    dr->refs.Flink->Blink = &dr->refs;
    dr->refs.Blink->Flink = &dr->refs;
}
1540
1541 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1542 NTSTATUS Status;
1543 LIST_ENTRY* le;
1544 uint64_t rc = 0;
1545 uint16_t inline_len;
1546 bool all_inline = true;
1547 data_reloc_ref* first_noninline = NULL;
1548 EXTENT_ITEM* ei;
1549 uint8_t* ptr;
1550
1551 inline_len = sizeof(EXTENT_ITEM);
1552
1553 sort_data_reloc_refs(dr);
1554
1555 le = dr->refs.Flink;
1556 while (le != &dr->refs) {
1557 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1558 uint16_t extlen = 0;
1559
1560 if (ref->type == TYPE_EXTENT_DATA_REF) {
1561 extlen += sizeof(EXTENT_DATA_REF);
1562 rc += ref->edr.count;
1563 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1564 extlen += sizeof(SHARED_DATA_REF);
1565 rc++;
1566 }
1567
1568 if (all_inline) {
1569 if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1570 all_inline = false;
1571 first_noninline = ref;
1572 } else
1573 inline_len += extlen + 1;
1574 }
1575
1576 le = le->Flink;
1577 }
1578
1579 ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1580 if (!ei) {
1581 ERR("out of memory\n");
1582 return STATUS_INSUFFICIENT_RESOURCES;
1583 }
1584
1585 ei->refcount = rc;
1586 ei->generation = dr->ei->generation;
1587 ei->flags = dr->ei->flags;
1588 ptr = (uint8_t*)&ei[1];
1589
1590 le = dr->refs.Flink;
1591 while (le != &dr->refs) {
1592 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1593
1594 if (ref == first_noninline)
1595 break;
1596
1597 *ptr = ref->type;
1598 ptr++;
1599
1600 if (ref->type == TYPE_EXTENT_DATA_REF) {
1601 EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1602
1603 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1604
1605 ptr += sizeof(EXTENT_DATA_REF);
1606 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1607 SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1608
1609 sdr->offset = ref->parent->new_address;
1610 sdr->count = ref->sdr.count;
1611
1612 ptr += sizeof(SHARED_DATA_REF);
1613 }
1614
1615 le = le->Flink;
1616 }
1617
1618 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1619 if (!NT_SUCCESS(Status)) {
1620 ERR("insert_tree_item returned %08lx\n", Status);
1621 return Status;
1622 }
1623
1624 if (!all_inline) {
1625 le = &first_noninline->list_entry;
1626
1627 while (le != &dr->refs) {
1628 data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1629
1630 if (ref->type == TYPE_EXTENT_DATA_REF) {
1631 EXTENT_DATA_REF* edr;
1632
1633 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1634 if (!edr) {
1635 ERR("out of memory\n");
1636 return STATUS_INSUFFICIENT_RESOURCES;
1637 }
1638
1639 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1640
1641 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1642 if (!NT_SUCCESS(Status)) {
1643 ERR("insert_tree_item returned %08lx\n", Status);
1644 return Status;
1645 }
1646 } else if (ref->type == TYPE_SHARED_DATA_REF) {
1647 uint32_t* sdr;
1648
1649 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1650 if (!sdr) {
1651 ERR("out of memory\n");
1652 return STATUS_INSUFFICIENT_RESOURCES;
1653 }
1654
1655 *sdr = ref->sdr.count;
1656
1657 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1658 if (!NT_SUCCESS(Status)) {
1659 ERR("insert_tree_item returned %08lx\n", Status);
1660 return Status;
1661 }
1662 }
1663
1664 le = le->Flink;
1665 }
1666 }
1667
1668 return STATUS_SUCCESS;
1669 }
1670
balance_data_chunk(device_extension * Vcb,chunk * c,bool * changed)1671 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1672 KEY searchkey;
1673 traverse_ptr tp;
1674 NTSTATUS Status;
1675 bool b;
1676 LIST_ENTRY items, metadata_items, rollback, *le;
1677 uint64_t loaded = 0, num_loaded = 0;
1678 chunk* newchunk = NULL;
1679 uint8_t* data = NULL;
1680
1681 TRACE("chunk %I64x\n", c->offset);
1682
1683 InitializeListHead(&rollback);
1684 InitializeListHead(&items);
1685 InitializeListHead(&metadata_items);
1686
1687 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1688
1689 searchkey.obj_id = c->offset;
1690 searchkey.obj_type = TYPE_EXTENT_ITEM;
1691 searchkey.offset = 0xffffffffffffffff;
1692
1693 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1694 if (!NT_SUCCESS(Status)) {
1695 ERR("find_item returned %08lx\n", Status);
1696 goto end;
1697 }
1698
1699 do {
1700 traverse_ptr next_tp;
1701
1702 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1703 break;
1704
1705 if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1706 bool tree = false;
1707
1708 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1709 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1710
1711 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1712 tree = true;
1713 }
1714
1715 if (!tree) {
1716 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1717
1718 if (!NT_SUCCESS(Status)) {
1719 ERR("add_data_reloc returned %08lx\n", Status);
1720 goto end;
1721 }
1722
1723 loaded += tp.item->key.offset;
1724 num_loaded++;
1725
1726 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1727 break;
1728 }
1729 }
1730
1731 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1732
1733 if (b)
1734 tp = next_tp;
1735 } while (b);
1736
1737 if (IsListEmpty(&items)) {
1738 *changed = false;
1739 Status = STATUS_SUCCESS;
1740 goto end;
1741 } else
1742 *changed = true;
1743
1744 data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1745 if (!data) {
1746 ERR("out of memory\n");
1747 Status = STATUS_INSUFFICIENT_RESOURCES;
1748 goto end;
1749 }
1750
1751 le = items.Flink;
1752 while (le != &items) {
1753 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1754 bool done = false;
1755 LIST_ENTRY* le2;
1756 void* csum;
1757 RTL_BITMAP bmp;
1758 ULONG* bmparr;
1759 ULONG bmplen, runlength, index, lastoff;
1760
1761 if (newchunk) {
1762 acquire_chunk_lock(newchunk, Vcb);
1763
1764 if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1765 newchunk->used += dr->size;
1766 space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1767 done = true;
1768 }
1769
1770 release_chunk_lock(newchunk, Vcb);
1771 }
1772
1773 if (!done) {
1774 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1775
1776 le2 = Vcb->chunks.Flink;
1777 while (le2 != &Vcb->chunks) {
1778 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1779
1780 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1781 acquire_chunk_lock(c2, Vcb);
1782
1783 if ((c2->chunk_item->size - c2->used) >= dr->size) {
1784 if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1785 c2->used += dr->size;
1786 space_list_subtract(c2, dr->new_address, dr->size, &rollback);
1787 release_chunk_lock(c2, Vcb);
1788 newchunk = c2;
1789 done = true;
1790 break;
1791 }
1792 }
1793
1794 release_chunk_lock(c2, Vcb);
1795 }
1796
1797 le2 = le2->Flink;
1798 }
1799
1800 // allocate new chunk if necessary
1801 if (!done) {
1802 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1803
1804 if (!NT_SUCCESS(Status)) {
1805 ERR("alloc_chunk returned %08lx\n", Status);
1806 ExReleaseResourceLite(&Vcb->chunk_lock);
1807 goto end;
1808 }
1809
1810 acquire_chunk_lock(newchunk, Vcb);
1811
1812 newchunk->balance_num = Vcb->balance.balance_num;
1813
1814 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1815 release_chunk_lock(newchunk, Vcb);
1816 ExReleaseResourceLite(&Vcb->chunk_lock);
1817 ERR("could not find address in new chunk\n");
1818 Status = STATUS_DISK_FULL;
1819 goto end;
1820 } else {
1821 newchunk->used += dr->size;
1822 space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1823 }
1824
1825 release_chunk_lock(newchunk, Vcb);
1826 }
1827
1828 ExReleaseResourceLite(&Vcb->chunk_lock);
1829 }
1830
1831 dr->newchunk = newchunk;
1832
1833 bmplen = (ULONG)(dr->size >> Vcb->sector_shift);
1834
1835 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1836 if (!bmparr) {
1837 ERR("out of memory\n");
1838 Status = STATUS_INSUFFICIENT_RESOURCES;
1839 goto end;
1840 }
1841
1842 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((dr->size * Vcb->csum_size) >> Vcb->sector_shift), ALLOC_TAG);
1843 if (!csum) {
1844 ERR("out of memory\n");
1845 ExFreePool(bmparr);
1846 Status = STATUS_INSUFFICIENT_RESOURCES;
1847 goto end;
1848 }
1849
1850 RtlInitializeBitMap(&bmp, bmparr, bmplen);
1851 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1852
1853 searchkey.obj_id = EXTENT_CSUM_ID;
1854 searchkey.obj_type = TYPE_EXTENT_CSUM;
1855 searchkey.offset = dr->address;
1856
1857 Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1858 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1859 ERR("find_item returned %08lx\n", Status);
1860 ExFreePool(csum);
1861 ExFreePool(bmparr);
1862 goto end;
1863 }
1864
1865 if (Status != STATUS_NOT_FOUND) {
1866 do {
1867 traverse_ptr next_tp;
1868
1869 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1870 if (tp.item->key.offset >= dr->address + dr->size)
1871 break;
1872 else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size) >= dr->address) {
1873 uint64_t cs = max(dr->address, tp.item->key.offset);
1874 uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size));
1875
1876 RtlCopyMemory((uint8_t*)csum + (((cs - dr->address) * Vcb->csum_size) >> Vcb->sector_shift),
1877 tp.item->data + (((cs - tp.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
1878 (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift));
1879
1880 RtlClearBits(&bmp, (ULONG)((cs - dr->address) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift));
1881
1882 if (ce == dr->address + dr->size)
1883 break;
1884 }
1885 }
1886
1887 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1888 tp = next_tp;
1889 else
1890 break;
1891 } while (true);
1892 }
1893
1894 lastoff = 0;
1895 runlength = RtlFindFirstRunClear(&bmp, &index);
1896
1897 while (runlength != 0) {
1898 if (index >= bmplen)
1899 break;
1900
1901 if (index + runlength >= bmplen) {
1902 runlength = bmplen - index;
1903
1904 if (runlength == 0)
1905 break;
1906 }
1907
1908 if (index > lastoff) {
1909 ULONG off = lastoff;
1910 ULONG size = index - lastoff;
1911
1912 // handle no csum run
1913 do {
1914 ULONG rl;
1915
1916 if (size << Vcb->sector_shift > BALANCE_UNIT)
1917 rl = BALANCE_UNIT >> Vcb->sector_shift;
1918 else
1919 rl = size;
1920
1921 Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1922 c, NULL, NULL, 0, false, NormalPagePriority);
1923 if (!NT_SUCCESS(Status)) {
1924 ERR("read_data returned %08lx\n", Status);
1925 ExFreePool(csum);
1926 ExFreePool(bmparr);
1927 goto end;
1928 }
1929
1930 Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1931 NULL, newchunk, false, 0, NormalPagePriority);
1932 if (!NT_SUCCESS(Status)) {
1933 ERR("write_data_complete returned %08lx\n", Status);
1934 ExFreePool(csum);
1935 ExFreePool(bmparr);
1936 goto end;
1937 }
1938
1939 size -= rl;
1940 off += rl;
1941 } while (size > 0);
1942 }
1943
1944 add_checksum_entry(Vcb, dr->new_address + (index << Vcb->sector_shift), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1945 add_checksum_entry(Vcb, dr->address + (index << Vcb->sector_shift), runlength, NULL, NULL);
1946
1947 // handle csum run
1948 do {
1949 ULONG rl;
1950
1951 if (runlength << Vcb->sector_shift > BALANCE_UNIT)
1952 rl = BALANCE_UNIT >> Vcb->sector_shift;
1953 else
1954 rl = runlength;
1955
1956 Status = read_data(Vcb, dr->address + (index << Vcb->sector_shift), rl << Vcb->sector_shift,
1957 (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1958 if (!NT_SUCCESS(Status)) {
1959 ERR("read_data returned %08lx\n", Status);
1960 ExFreePool(csum);
1961 ExFreePool(bmparr);
1962 goto end;
1963 }
1964
1965 Status = write_data_complete(Vcb, dr->new_address + (index << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1966 NULL, newchunk, false, 0, NormalPagePriority);
1967 if (!NT_SUCCESS(Status)) {
1968 ERR("write_data_complete returned %08lx\n", Status);
1969 ExFreePool(csum);
1970 ExFreePool(bmparr);
1971 goto end;
1972 }
1973
1974 runlength -= rl;
1975 index += rl;
1976 } while (runlength > 0);
1977
1978 lastoff = index;
1979 runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1980 }
1981
1982 ExFreePool(csum);
1983 ExFreePool(bmparr);
1984
1985 // handle final nocsum run
1986 if (lastoff < dr->size >> Vcb->sector_shift) {
1987 ULONG off = lastoff;
1988 ULONG size = (ULONG)((dr->size >> Vcb->sector_shift) - lastoff);
1989
1990 do {
1991 ULONG rl;
1992
1993 if (size << Vcb->sector_shift > BALANCE_UNIT)
1994 rl = BALANCE_UNIT >> Vcb->sector_shift;
1995 else
1996 rl = size;
1997
1998 Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1999 c, NULL, NULL, 0, false, NormalPagePriority);
2000 if (!NT_SUCCESS(Status)) {
2001 ERR("read_data returned %08lx\n", Status);
2002 goto end;
2003 }
2004
2005 Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
2006 NULL, newchunk, false, 0, NormalPagePriority);
2007 if (!NT_SUCCESS(Status)) {
2008 ERR("write_data_complete returned %08lx\n", Status);
2009 goto end;
2010 }
2011
2012 size -= rl;
2013 off += rl;
2014 } while (size > 0);
2015 }
2016
2017 le = le->Flink;
2018 }
2019
2020 ExFreePool(data);
2021 data = NULL;
2022
2023 Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2024 if (!NT_SUCCESS(Status)) {
2025 ERR("write_metadata_items returned %08lx\n", Status);
2026 goto end;
2027 }
2028
2029 le = items.Flink;
2030 while (le != &items) {
2031 data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2032
2033 Status = add_data_reloc_extent_item(Vcb, dr);
2034 if (!NT_SUCCESS(Status)) {
2035 ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2036 goto end;
2037 }
2038
2039 le = le->Flink;
2040 }
2041
2042 le = c->changed_extents.Flink;
2043 while (le != &c->changed_extents) {
2044 LIST_ENTRY *le2, *le3;
2045 changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2046
2047 le3 = le->Flink;
2048
2049 le2 = items.Flink;
2050 while (le2 != &items) {
2051 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2052
2053 if (ce->address == dr->address) {
2054 ce->address = dr->new_address;
2055 RemoveEntryList(&ce->list_entry);
2056 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2057 break;
2058 }
2059
2060 le2 = le2->Flink;
2061 }
2062
2063 le = le3;
2064 }
2065
2066 Status = STATUS_SUCCESS;
2067
2068 Vcb->need_write = true;
2069
2070 end:
2071 if (NT_SUCCESS(Status)) {
2072 // update extents in cache inodes before we flush
2073 le = Vcb->chunks.Flink;
2074 while (le != &Vcb->chunks) {
2075 chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2076
2077 if (c2->cache) {
2078 LIST_ENTRY* le2;
2079
2080 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2081
2082 le2 = c2->cache->extents.Flink;
2083 while (le2 != &c2->cache->extents) {
2084 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2085
2086 if (!ext->ignore) {
2087 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2088 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2089
2090 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2091 LIST_ENTRY* le3 = items.Flink;
2092 while (le3 != &items) {
2093 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2094
2095 if (ed2->address == dr->address) {
2096 ed2->address = dr->new_address;
2097 break;
2098 }
2099
2100 le3 = le3->Flink;
2101 }
2102 }
2103 }
2104 }
2105
2106 le2 = le2->Flink;
2107 }
2108
2109 ExReleaseResourceLite(c2->cache->Header.Resource);
2110 }
2111
2112 le = le->Flink;
2113 }
2114
2115 Status = do_write(Vcb, NULL);
2116 if (!NT_SUCCESS(Status))
2117 ERR("do_write returned %08lx\n", Status);
2118 }
2119
2120 if (NT_SUCCESS(Status)) {
2121 clear_rollback(&rollback);
2122
2123 // update open FCBs
2124 // FIXME - speed this up(?)
2125
2126 le = Vcb->all_fcbs.Flink;
2127 while (le != &Vcb->all_fcbs) {
2128 struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2129 LIST_ENTRY* le2;
2130
2131 ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2132
2133 le2 = fcb->extents.Flink;
2134 while (le2 != &fcb->extents) {
2135 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2136
2137 if (!ext->ignore) {
2138 if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2139 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2140
2141 if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2142 LIST_ENTRY* le3 = items.Flink;
2143 while (le3 != &items) {
2144 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2145
2146 if (ed2->address == dr->address) {
2147 ed2->address = dr->new_address;
2148 break;
2149 }
2150
2151 le3 = le3->Flink;
2152 }
2153 }
2154 }
2155 }
2156
2157 le2 = le2->Flink;
2158 }
2159
2160 ExReleaseResourceLite(fcb->Header.Resource);
2161
2162 le = le->Flink;
2163 }
2164 } else
2165 do_rollback(Vcb, &rollback);
2166
2167 free_trees(Vcb);
2168
2169 ExReleaseResourceLite(&Vcb->tree_lock);
2170
2171 if (data)
2172 ExFreePool(data);
2173
2174 while (!IsListEmpty(&items)) {
2175 data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2176
2177 while (!IsListEmpty(&dr->refs)) {
2178 data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2179
2180 ExFreePool(ref);
2181 }
2182
2183 ExFreePool(dr);
2184 }
2185
2186 while (!IsListEmpty(&metadata_items)) {
2187 metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2188
2189 while (!IsListEmpty(&mr->refs)) {
2190 metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2191
2192 ExFreePool(ref);
2193 }
2194
2195 if (mr->data)
2196 ExFreePool(mr->data);
2197
2198 ExFreePool(mr);
2199 }
2200
2201 return Status;
2202 }
2203
/*
 * Returns the single BLOCK_FLAG_* profile bit that describes chunk c.
 *
 * The chunk's type flags are tested in a fixed order - RAID0, RAID1,
 * DUPLICATE, RAID10, RAID5, RAID6, RAID1C3, RAID1C4 - and the first bit found
 * set is returned. If none is set the result is BLOCK_FLAG_SINGLE.
 */
static __inline uint64_t get_chunk_dup_type(chunk* c) {
    const uint64_t flags = c->chunk_item->type;

    if (flags & BLOCK_FLAG_RAID0)
        return BLOCK_FLAG_RAID0;

    if (flags & BLOCK_FLAG_RAID1)
        return BLOCK_FLAG_RAID1;

    if (flags & BLOCK_FLAG_DUPLICATE)
        return BLOCK_FLAG_DUPLICATE;

    if (flags & BLOCK_FLAG_RAID10)
        return BLOCK_FLAG_RAID10;

    if (flags & BLOCK_FLAG_RAID5)
        return BLOCK_FLAG_RAID5;

    if (flags & BLOCK_FLAG_RAID6)
        return BLOCK_FLAG_RAID6;

    if (flags & BLOCK_FLAG_RAID1C3)
        return BLOCK_FLAG_RAID1C3;

    if (flags & BLOCK_FLAG_RAID1C4)
        return BLOCK_FLAG_RAID1C4;

    return BLOCK_FLAG_SINGLE;
}
2224
/* Decides whether chunk c passes the balance filters configured for its
 * category.
 *
 * Vcb  - volume; the filter set consulted is Vcb->balance.opts[sort]
 * sort - index into Vcb->balance.opts selecting the filter set
 * c    - chunk under consideration
 *
 * Returns false if that filter set is not enabled, or if any filter it has
 * switched on (profiles, devid, drange, vrange, stripes, usage, soft convert)
 * rejects the chunk. Returns true otherwise. */
static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
    btrfs_balance_opts* opts;

    opts = &Vcb->balance.opts[sort];

    if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
        return false;

    // profiles: the chunk's profile must be one of the requested ones
    if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
        uint64_t type = get_chunk_dup_type(c);

        if (!(type & opts->profiles))
            return false;
    }

    // devid: at least one stripe must live on the requested device
    if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
        uint16_t i;
        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
        bool b = false;

        for (i = 0; i < c->chunk_item->num_stripes; i++) {
            if (cis[i].dev_id == opts->devid) {
                b = true;
                break;
            }
        }

        if (!b)
            return false;
    }

    // drange: at least one stripe must overlap the requested physical range
    // (and, if a devid filter is also set, that stripe must be on that device)
    if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
        uint16_t i, factor;
        uint64_t physsize;
        CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
        bool b = false;

        if (c->chunk_item->type & BLOCK_FLAG_RAID0)
            factor = c->chunk_item->num_stripes;
        else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
            factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
        else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
            factor = c->chunk_item->num_stripes - 1;
        else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
            factor = c->chunk_item->num_stripes - 2;
        else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
            factor = 1;

        // used below as the length of each stripe on its device
        physsize = c->chunk_item->size / factor;

        for (i = 0; i < c->chunk_item->num_stripes; i++) {
            if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
                (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
                b = true;
                break;
            }
        }

        if (!b)
            return false;
    }

    // vrange: the chunk's logical range must overlap the requested one
    if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
        if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
            return false;
    }

    // stripes: the stripe count must lie within [stripes_start, stripes_end]
    if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
        if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes > opts->stripes_end)
            return false;
    }

    // usage: the percentage of the chunk in use must lie within [usage_start, usage_end]
    if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
        uint64_t usage = c->used * 100 / c->chunk_item->size;

        // usage == 0 should mean completely empty, not just that usage rounds to 0%
        if (c->used > 0 && usage == 0)
            usage = 1;

        if (usage < opts->usage_start || usage > opts->usage_end)
            return false;
    }

    // soft convert: skip chunks that already have the target profile
    if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
        uint64_t type = get_chunk_dup_type(c);

        if (type == opts->convert)
            return false;
    }

    return true;
}
2317
/* Translates the in-memory balance options in opts into the BALANCE_ARGS
 * structure pointed to by args.
 *
 * Only the filters whose BTRFS_BALANCE_OPTS_* bit is set in opts->flags are
 * copied; for each of them the matching BALANCE_ARGS_FLAGS_* bit is ORed into
 * args->flags. Fields of args belonging to filters that are not set are left
 * exactly as the caller supplied them. */
static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
    if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
        args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
        args->profiles = opts->profiles;
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
        // NOTE(review): the single-value / range form is picked from the value
        // already present in args, not from opts - confirm this is intended
        if (args->usage_start != 0) {
            args->flags |= BALANCE_ARGS_FLAGS_USAGE;
            args->usage = opts->usage_end;
        } else {
            args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
            args->usage_start = opts->usage_start;
            args->usage_end = opts->usage_end;
        }
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
        args->flags |= BALANCE_ARGS_FLAGS_DEVID;
        args->devid = opts->devid;
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
        args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
        args->drange_start = opts->drange_start;
        args->drange_end = opts->drange_end;
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
        args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
        args->vrange_start = opts->vrange_start;
        args->vrange_end = opts->vrange_end;
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
        args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
        args->convert = opts->convert;

        // "soft" is only recorded together with a conversion target
        if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
            args->flags |= BALANCE_ARGS_FLAGS_SOFT;
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
        // NOTE(review): as with usage, the form depends on what args holds on entry
        if (args->limit_start != 0) {
            args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
            args->limit = opts->limit_end;
        } else {
            args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
            args->limit_start = (uint32_t)opts->limit_start;
            args->limit_end = (uint32_t)opts->limit_end;
        }
    }

    if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
        args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
        args->stripes_start = opts->stripes_start;
        args->stripes_end = opts->stripes_end;
    }
}
2377
add_balance_item(device_extension * Vcb)2378 static NTSTATUS add_balance_item(device_extension* Vcb) {
2379 KEY searchkey;
2380 traverse_ptr tp;
2381 NTSTATUS Status;
2382 BALANCE_ITEM* bi;
2383
2384 searchkey.obj_id = BALANCE_ITEM_ID;
2385 searchkey.obj_type = TYPE_TEMP_ITEM;
2386 searchkey.offset = 0;
2387
2388 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2389
2390 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2391 if (!NT_SUCCESS(Status)) {
2392 ERR("find_item returned %08lx\n", Status);
2393 goto end;
2394 }
2395
2396 if (!keycmp(tp.item->key, searchkey)) {
2397 Status = delete_tree_item(Vcb, &tp);
2398 if (!NT_SUCCESS(Status)) {
2399 ERR("delete_tree_item returned %08lx\n", Status);
2400 goto end;
2401 }
2402 }
2403
2404 bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2405 if (!bi) {
2406 ERR("out of memory\n");
2407 Status = STATUS_INSUFFICIENT_RESOURCES;
2408 goto end;
2409 }
2410
2411 RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2412
2413 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2414 bi->flags |= BALANCE_FLAGS_DATA;
2415 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2416 }
2417
2418 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2419 bi->flags |= BALANCE_FLAGS_METADATA;
2420 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2421 }
2422
2423 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2424 bi->flags |= BALANCE_FLAGS_SYSTEM;
2425 copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2426 }
2427
2428 Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2429 if (!NT_SUCCESS(Status)) {
2430 ERR("insert_tree_item returned %08lx\n", Status);
2431 ExFreePool(bi);
2432 goto end;
2433 }
2434
2435 Status = STATUS_SUCCESS;
2436
2437 end:
2438 if (NT_SUCCESS(Status)) {
2439 Status = do_write(Vcb, NULL);
2440 if (!NT_SUCCESS(Status))
2441 ERR("do_write returned %08lx\n", Status);
2442 }
2443
2444 free_trees(Vcb);
2445
2446 ExReleaseResourceLite(&Vcb->tree_lock);
2447
2448 return Status;
2449 }
2450
remove_balance_item(device_extension * Vcb)2451 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2452 KEY searchkey;
2453 traverse_ptr tp;
2454 NTSTATUS Status;
2455
2456 searchkey.obj_id = BALANCE_ITEM_ID;
2457 searchkey.obj_type = TYPE_TEMP_ITEM;
2458 searchkey.offset = 0;
2459
2460 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2461
2462 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2463 if (!NT_SUCCESS(Status)) {
2464 ERR("find_item returned %08lx\n", Status);
2465 goto end;
2466 }
2467
2468 if (!keycmp(tp.item->key, searchkey)) {
2469 Status = delete_tree_item(Vcb, &tp);
2470 if (!NT_SUCCESS(Status)) {
2471 ERR("delete_tree_item returned %08lx\n", Status);
2472 goto end;
2473 }
2474
2475 Status = do_write(Vcb, NULL);
2476 if (!NT_SUCCESS(Status)) {
2477 ERR("do_write returned %08lx\n", Status);
2478 goto end;
2479 }
2480
2481 free_trees(Vcb);
2482 }
2483
2484 Status = STATUS_SUCCESS;
2485
2486 end:
2487 ExReleaseResourceLite(&Vcb->tree_lock);
2488
2489 return Status;
2490 }
2491
/* Fills the in-memory option set opts from a BALANCE_ARGS structure.
 *
 * opts->flags is reset to BTRFS_BALANCE_OPTS_ENABLED and then gains one
 * BTRFS_BALANCE_OPTS_* bit for every filter whose BALANCE_ARGS_FLAGS_* bit is
 * set in args->flags. Fields of opts belonging to filters that are not set
 * are left untouched.
 *
 * For usage and limit the single-value flag takes precedence over the range
 * flag; a single value becomes the range [0, value]. */
static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
    opts->flags = BTRFS_BALANCE_OPTS_ENABLED;

    if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
        opts->profiles = args->profiles;
        opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
        opts->usage_start = 0;
        opts->usage_end = (uint8_t)args->usage;

        opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
    } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
        opts->usage_start = (uint8_t)args->usage_start;
        opts->usage_end = (uint8_t)args->usage_end;

        opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
        opts->devid = args->devid;
        opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
        opts->drange_start = args->drange_start;
        opts->drange_end = args->drange_end;
        opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
        opts->vrange_start = args->vrange_start;
        opts->vrange_end = args->vrange_end;
        opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
        opts->limit_start = 0;
        opts->limit_end = args->limit;

        opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
    } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
        opts->limit_start = args->limit_start;
        opts->limit_end = args->limit_end;

        opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
        opts->stripes_start = (uint16_t)args->stripes_start;
        opts->stripes_end = (uint16_t)args->stripes_end;

        opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
    }

    if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
        opts->convert = args->convert;
        opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;

        // the soft flag is only honoured together with a conversion target
        if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
            opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
    }
}
2556
remove_superblocks(device * dev)2557 static NTSTATUS remove_superblocks(device* dev) {
2558 NTSTATUS Status;
2559 superblock* sb;
2560 int i = 0;
2561
2562 sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2563 if (!sb) {
2564 ERR("out of memory\n");
2565 return STATUS_INSUFFICIENT_RESOURCES;
2566 }
2567
2568 RtlZeroMemory(sb, sizeof(superblock));
2569
2570 while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2571 Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2572
2573 if (!NT_SUCCESS(Status)) {
2574 ExFreePool(sb);
2575 return Status;
2576 }
2577
2578 i++;
2579 }
2580
2581 ExFreePool(sb);
2582
2583 return STATUS_SUCCESS;
2584 }
2585
2586 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2587 KEY searchkey;
2588 traverse_ptr tp;
2589 NTSTATUS Status;
2590 LIST_ENTRY* le;
2591 volume_device_extension* vde;
2592
2593 if (Vcb->need_write) {
2594 Status = do_write(Vcb, NULL);
2595
2596 if (!NT_SUCCESS(Status))
2597 ERR("do_write returned %08lx\n", Status);
2598 } else
2599 Status = STATUS_SUCCESS;
2600
2601 free_trees(Vcb);
2602
2603 if (!NT_SUCCESS(Status))
2604 return Status;
2605
2606 // remove entry in chunk tree
2607
2608 searchkey.obj_id = 1;
2609 searchkey.obj_type = TYPE_DEV_ITEM;
2610 searchkey.offset = dev->devitem.dev_id;
2611
2612 Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2613 if (!NT_SUCCESS(Status)) {
2614 ERR("find_item returned %08lx\n", Status);
2615 return Status;
2616 }
2617
2618 if (!keycmp(searchkey, tp.item->key)) {
2619 Status = delete_tree_item(Vcb, &tp);
2620
2621 if (!NT_SUCCESS(Status)) {
2622 ERR("delete_tree_item returned %08lx\n", Status);
2623 return Status;
2624 }
2625 }
2626
2627 // remove stats entry in device tree
2628
2629 searchkey.obj_id = 0;
2630 searchkey.obj_type = TYPE_DEV_STATS;
2631 searchkey.offset = dev->devitem.dev_id;
2632
2633 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2634 if (!NT_SUCCESS(Status)) {
2635 ERR("find_item returned %08lx\n", Status);
2636 return Status;
2637 }
2638
2639 if (!keycmp(searchkey, tp.item->key)) {
2640 Status = delete_tree_item(Vcb, &tp);
2641
2642 if (!NT_SUCCESS(Status)) {
2643 ERR("delete_tree_item returned %08lx\n", Status);
2644 return Status;
2645 }
2646 }
2647
2648 // update superblock
2649
2650 Vcb->superblock.num_devices--;
2651 Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2652 Vcb->devices_loaded--;
2653
2654 RemoveEntryList(&dev->list_entry);
2655
2656 // flush
2657
2658 Status = do_write(Vcb, NULL);
2659 if (!NT_SUCCESS(Status))
2660 ERR("do_write returned %08lx\n", Status);
2661
2662 free_trees(Vcb);
2663
2664 if (!NT_SUCCESS(Status))
2665 return Status;
2666
2667 if (!dev->readonly && dev->devobj) {
2668 Status = remove_superblocks(dev);
2669 if (!NT_SUCCESS(Status))
2670 WARN("remove_superblocks returned %08lx\n", Status);
2671 }
2672
2673 // remove entry in volume list
2674
2675 vde = Vcb->vde;
2676
2677 if (dev->devobj) {
2678 pdo_device_extension* pdode = vde->pdode;
2679
2680 ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2681
2682 le = pdode->children.Flink;
2683 while (le != &pdode->children) {
2684 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2685
2686 if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2687 PFILE_OBJECT FileObject;
2688 PDEVICE_OBJECT mountmgr;
2689 UNICODE_STRING mmdevpath;
2690
2691 pdode->children_loaded--;
2692
2693 if (vc->had_drive_letter) { // re-add entry to mountmgr
2694 RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2695 Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2696 if (!NT_SUCCESS(Status))
2697 ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2698 else {
2699 MOUNTDEV_NAME mdn;
2700
2701 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2702 if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2703 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2704 else {
2705 MOUNTDEV_NAME* mdn2;
2706 ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2707
2708 mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2709 if (!mdn2)
2710 ERR("out of memory\n");
2711 else {
2712 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2713 if (!NT_SUCCESS(Status))
2714 ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2715 else {
2716 UNICODE_STRING name;
2717
2718 name.Buffer = mdn2->Name;
2719 name.Length = name.MaximumLength = mdn2->NameLength;
2720
2721 Status = mountmgr_add_drive_letter(mountmgr, &name);
2722 if (!NT_SUCCESS(Status))
2723 WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2724 }
2725
2726 ExFreePool(mdn2);
2727 }
2728 }
2729
2730 ObDereferenceObject(FileObject);
2731 }
2732 }
2733
2734 ExFreePool(vc->pnp_name.Buffer);
2735 RemoveEntryList(&vc->list_entry);
2736 ExFreePool(vc);
2737
2738 ObDereferenceObject(vc->fileobj);
2739
2740 break;
2741 }
2742
2743 le = le->Flink;
2744 }
2745
2746 if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2747 vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2748
2749 le = pdode->children.Flink;
2750 while (le != &pdode->children) {
2751 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2752
2753 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2754 vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2755 break;
2756 }
2757
2758 le = le->Flink;
2759 }
2760 }
2761
2762 pdode->num_children = Vcb->superblock.num_devices;
2763
2764 ExReleaseResourceLite(&pdode->child_lock);
2765
2766 // free dev
2767
2768 if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2769 trim_whole_device(dev);
2770 }
2771
2772 while (!IsListEmpty(&dev->space)) {
2773 LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2774 space* s = CONTAINING_RECORD(le2, space, list_entry);
2775
2776 ExFreePool(s);
2777 }
2778
2779 ExFreePool(dev);
2780
2781 if (Vcb->trim) {
2782 Vcb->trim = false;
2783
2784 le = Vcb->devices.Flink;
2785 while (le != &Vcb->devices) {
2786 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2787
2788 if (dev2->trim) {
2789 Vcb->trim = true;
2790 break;
2791 }
2792
2793 le = le->Flink;
2794 }
2795 }
2796
2797 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2798
2799 return STATUS_SUCCESS;
2800 }
2801
2802 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2803 DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2804 DEVICE_DATA_SET_RANGE* ranges;
2805 ULONG datalen, i;
2806 KEY searchkey;
2807 traverse_ptr tp;
2808 NTSTATUS Status;
2809 bool b;
2810 uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2811 LIST_ENTRY* le;
2812
2813 dev->num_trim_entries = 0;
2814
2815 searchkey.obj_id = dev->devitem.dev_id;
2816 searchkey.obj_type = TYPE_DEV_EXTENT;
2817 searchkey.offset = 0;
2818
2819 Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2820 if (!NT_SUCCESS(Status)) {
2821 ERR("find_item returned %08lx\n", Status);
2822 return;
2823 }
2824
2825 do {
2826 traverse_ptr next_tp;
2827
2828 if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2829 if (tp.item->size >= sizeof(DEV_EXTENT)) {
2830 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2831
2832 if (tp.item->key.offset > lastoff)
2833 add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2834
2835 lastoff = tp.item->key.offset + de->length;
2836 } else {
2837 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2838 return;
2839 }
2840 }
2841
2842 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2843
2844 if (b) {
2845 tp = next_tp;
2846 if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2847 break;
2848 }
2849 } while (b);
2850
2851 if (lastoff < dev->devitem.num_bytes)
2852 add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2853
2854 if (dev->num_trim_entries == 0)
2855 return;
2856
2857 datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2858
2859 dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2860 if (!dmdsa) {
2861 ERR("out of memory\n");
2862 goto end;
2863 }
2864
2865 dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2866 dmdsa->Action = DeviceDsmAction_Trim;
2867 dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2868 dmdsa->ParameterBlockOffset = 0;
2869 dmdsa->ParameterBlockLength = 0;
2870 dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2871 dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2872
2873 ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2874
2875 i = 0;
2876 le = dev->trim_list.Flink;
2877 while (le != &dev->trim_list) {
2878 space* s = CONTAINING_RECORD(le, space, list_entry);
2879
2880 ranges[i].StartingOffset = s->address;
2881 ranges[i].LengthInBytes = s->size;
2882 i++;
2883
2884 le = le->Flink;
2885 }
2886
2887 Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2888 if (!NT_SUCCESS(Status))
2889 WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2890
2891 ExFreePool(dmdsa);
2892
2893 end:
2894 while (!IsListEmpty(&dev->trim_list)) {
2895 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2896 ExFreePool(s);
2897 }
2898
2899 dev->num_trim_entries = 0;
2900 }
2901
try_consolidation(device_extension * Vcb,uint64_t flags,chunk ** newchunk)2902 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2903 NTSTATUS Status;
2904 bool changed;
2905 LIST_ENTRY* le;
2906 chunk* rc;
2907
2908 // FIXME - allow with metadata chunks?
2909
2910 while (true) {
2911 rc = NULL;
2912
2913 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2914
2915 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2916
2917 // choose the least-used chunk we haven't looked at yet
2918 le = Vcb->chunks.Flink;
2919 while (le != &Vcb->chunks) {
2920 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2921
2922 // FIXME - skip full-size chunks over e.g. 90% full?
2923 if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2924 rc = c;
2925
2926 le = le->Flink;
2927 }
2928
2929 ExReleaseResourceLite(&Vcb->chunk_lock);
2930
2931 if (!rc) {
2932 ExReleaseResourceLite(&Vcb->tree_lock);
2933 break;
2934 }
2935
2936 if (rc->list_entry_balance.Flink) {
2937 RemoveEntryList(&rc->list_entry_balance);
2938 Vcb->balance.chunks_left--;
2939 }
2940
2941 rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2942 rc->reloc = true;
2943
2944 ExReleaseResourceLite(&Vcb->tree_lock);
2945
2946 do {
2947 changed = false;
2948
2949 Status = balance_data_chunk(Vcb, rc, &changed);
2950 if (!NT_SUCCESS(Status)) {
2951 ERR("balance_data_chunk returned %08lx\n", Status);
2952 Vcb->balance.status = Status;
2953 rc->list_entry_balance.Flink = NULL;
2954 rc->reloc = false;
2955 return Status;
2956 }
2957
2958 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2959
2960 if (Vcb->readonly)
2961 Vcb->balance.stopping = true;
2962
2963 if (Vcb->balance.stopping)
2964 return STATUS_SUCCESS;
2965 } while (changed);
2966
2967 rc->list_entry_balance.Flink = NULL;
2968
2969 rc->changed = true;
2970 rc->space_changed = true;
2971 rc->balance_num = Vcb->balance.balance_num;
2972
2973 Status = do_write(Vcb, NULL);
2974 if (!NT_SUCCESS(Status)) {
2975 ERR("do_write returned %08lx\n", Status);
2976 return Status;
2977 }
2978
2979 free_trees(Vcb);
2980 }
2981
2982 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2983
2984 Status = alloc_chunk(Vcb, flags, &rc, true);
2985
2986 ExReleaseResourceLite(&Vcb->chunk_lock);
2987
2988 if (NT_SUCCESS(Status)) {
2989 *newchunk = rc;
2990 return Status;
2991 } else {
2992 ERR("alloc_chunk returned %08lx\n", Status);
2993 return Status;
2994 }
2995 }
2996
regenerate_space_list(device_extension * Vcb,device * dev)2997 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2998 LIST_ENTRY* le;
2999
3000 while (!IsListEmpty(&dev->space)) {
3001 space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3002
3003 ExFreePool(s);
3004 }
3005
3006 // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3007
3008 space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3009
3010 le = Vcb->chunks.Flink;
3011 while (le != &Vcb->chunks) {
3012 uint16_t n;
3013 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3014 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3015
3016 for (n = 0; n < c->chunk_item->num_stripes; n++) {
3017 uint64_t stripe_size = 0;
3018
3019 if (cis[n].dev_id == dev->devitem.dev_id) {
3020 if (stripe_size == 0) {
3021 uint16_t factor;
3022
3023 if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3024 factor = c->chunk_item->num_stripes;
3025 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3026 factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3027 else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3028 factor = c->chunk_item->num_stripes - 1;
3029 else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3030 factor = c->chunk_item->num_stripes - 2;
3031 else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3032 factor = 1;
3033
3034 stripe_size = c->chunk_item->size / factor;
3035 }
3036
3037 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3038 }
3039 }
3040
3041 le = le->Flink;
3042 }
3043
3044 return STATUS_SUCCESS;
3045 }
3046
_Function_class_(KSTART_ROUTINE)3047 _Function_class_(KSTART_ROUTINE)
3048 void __stdcall balance_thread(void* context) {
3049 device_extension* Vcb = (device_extension*)context;
3050 LIST_ENTRY chunks;
3051 LIST_ENTRY* le;
3052 uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3053 uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3054 NTSTATUS Status;
3055
3056 Vcb->balance.balance_num++;
3057
3058 Vcb->balance.stopping = false;
3059 KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3060
3061 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062 old_data_flags = Vcb->data_flags;
3063 Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3064
3065 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3066 }
3067
3068 if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3069 old_metadata_flags = Vcb->metadata_flags;
3070 Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3071 }
3072
3073 if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3074 old_system_flags = Vcb->system_flags;
3075 Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3076 }
3077
3078 if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3079 if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3080 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3081 else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3082 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3083 }
3084
3085 num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3086 Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3087
3088 InitializeListHead(&chunks);
3089
3090 // FIXME - what are we supposed to do with limit_start?
3091
3092 if (!Vcb->readonly) {
3093 if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3094 Status = add_balance_item(Vcb);
3095 if (!NT_SUCCESS(Status)) {
3096 ERR("add_balance_item returned %08lx\n", Status);
3097 Vcb->balance.status = Status;
3098 goto end;
3099 }
3100 } else {
3101 if (Vcb->need_write) {
3102 Status = do_write(Vcb, NULL);
3103
3104 free_trees(Vcb);
3105
3106 if (!NT_SUCCESS(Status)) {
3107 ERR("do_write returned %08lx\n", Status);
3108 Vcb->balance.status = Status;
3109 goto end;
3110 }
3111 }
3112 }
3113 }
3114
3115 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3116
3117 if (Vcb->balance.stopping)
3118 goto end;
3119
3120 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3121
3122 le = Vcb->chunks.Flink;
3123 while (le != &Vcb->chunks) {
3124 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3125 uint8_t sort;
3126
3127 acquire_chunk_lock(c, Vcb);
3128
3129 if (c->chunk_item->type & BLOCK_FLAG_DATA)
3130 sort = BALANCE_OPTS_DATA;
3131 else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3132 sort = BALANCE_OPTS_METADATA;
3133 else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3134 sort = BALANCE_OPTS_SYSTEM;
3135 else {
3136 ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3137 release_chunk_lock(c, Vcb);
3138 break;
3139 }
3140
3141 if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3142 should_balance_chunk(Vcb, sort, c)) {
3143 InsertTailList(&chunks, &c->list_entry_balance);
3144
3145 num_chunks[sort]++;
3146 Vcb->balance.total_chunks++;
3147 Vcb->balance.chunks_left++;
3148 } else if (sort == BALANCE_OPTS_METADATA)
3149 okay_metadata_chunks++;
3150 else if (sort == BALANCE_OPTS_DATA)
3151 okay_data_chunks++;
3152 else if (sort == BALANCE_OPTS_SYSTEM)
3153 okay_system_chunks++;
3154
3155 if (!c->cache_loaded) {
3156 Status = load_cache_chunk(Vcb, c, NULL);
3157
3158 if (!NT_SUCCESS(Status)) {
3159 ERR("load_cache_chunk returned %08lx\n", Status);
3160 Vcb->balance.status = Status;
3161 release_chunk_lock(c, Vcb);
3162 ExReleaseResourceLite(&Vcb->chunk_lock);
3163 goto end;
3164 }
3165 }
3166
3167 release_chunk_lock(c, Vcb);
3168
3169 le = le->Flink;
3170 }
3171
3172 ExReleaseResourceLite(&Vcb->chunk_lock);
3173
3174 // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3175 if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3176 bool consolidated = false;
3177 chunk* c;
3178
3179 if (okay_metadata_chunks == 0) {
3180 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3181
3182 Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3183 if (NT_SUCCESS(Status))
3184 c->balance_num = Vcb->balance.balance_num;
3185 else if (Status != STATUS_DISK_FULL || consolidated) {
3186 ERR("alloc_chunk returned %08lx\n", Status);
3187 ExReleaseResourceLite(&Vcb->chunk_lock);
3188 Vcb->balance.status = Status;
3189 goto end;
3190 }
3191
3192 ExReleaseResourceLite(&Vcb->chunk_lock);
3193
3194 if (Status == STATUS_DISK_FULL) {
3195 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3196 if (!NT_SUCCESS(Status)) {
3197 ERR("try_consolidation returned %08lx\n", Status);
3198 Vcb->balance.status = Status;
3199 goto end;
3200 } else
3201 c->balance_num = Vcb->balance.balance_num;
3202
3203 consolidated = true;
3204
3205 if (Vcb->balance.stopping)
3206 goto end;
3207 }
3208 }
3209
3210 if (okay_data_chunks == 0) {
3211 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3212
3213 Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3214 if (NT_SUCCESS(Status))
3215 c->balance_num = Vcb->balance.balance_num;
3216 else if (Status != STATUS_DISK_FULL || consolidated) {
3217 ERR("alloc_chunk returned %08lx\n", Status);
3218 ExReleaseResourceLite(&Vcb->chunk_lock);
3219 Vcb->balance.status = Status;
3220 goto end;
3221 }
3222
3223 ExReleaseResourceLite(&Vcb->chunk_lock);
3224
3225 if (Status == STATUS_DISK_FULL) {
3226 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3227 if (!NT_SUCCESS(Status)) {
3228 ERR("try_consolidation returned %08lx\n", Status);
3229 Vcb->balance.status = Status;
3230 goto end;
3231 } else
3232 c->balance_num = Vcb->balance.balance_num;
3233
3234 consolidated = true;
3235
3236 if (Vcb->balance.stopping)
3237 goto end;
3238 }
3239 }
3240
3241 if (okay_system_chunks == 0) {
3242 ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3243
3244 Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3245 if (NT_SUCCESS(Status))
3246 c->balance_num = Vcb->balance.balance_num;
3247 else if (Status != STATUS_DISK_FULL || consolidated) {
3248 ERR("alloc_chunk returned %08lx\n", Status);
3249 ExReleaseResourceLite(&Vcb->chunk_lock);
3250 Vcb->balance.status = Status;
3251 goto end;
3252 }
3253
3254 ExReleaseResourceLite(&Vcb->chunk_lock);
3255
3256 if (Status == STATUS_DISK_FULL) {
3257 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3258 if (!NT_SUCCESS(Status)) {
3259 ERR("try_consolidation returned %08lx\n", Status);
3260 Vcb->balance.status = Status;
3261 goto end;
3262 } else
3263 c->balance_num = Vcb->balance.balance_num;
3264
3265 consolidated = true;
3266
3267 if (Vcb->balance.stopping)
3268 goto end;
3269 }
3270 }
3271 }
3272
3273 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3274
3275 le = chunks.Flink;
3276 while (le != &chunks) {
3277 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278
3279 c->reloc = true;
3280
3281 le = le->Flink;
3282 }
3283
3284 ExReleaseResourceLite(&Vcb->chunk_lock);
3285
3286 // do data chunks before metadata
3287 le = chunks.Flink;
3288 while (le != &chunks) {
3289 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3290 LIST_ENTRY* le2 = le->Flink;
3291
3292 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3293 bool changed;
3294
3295 do {
3296 changed = false;
3297
3298 Status = balance_data_chunk(Vcb, c, &changed);
3299 if (!NT_SUCCESS(Status)) {
3300 ERR("balance_data_chunk returned %08lx\n", Status);
3301 Vcb->balance.status = Status;
3302 goto end;
3303 }
3304
3305 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3306
3307 if (Vcb->readonly)
3308 Vcb->balance.stopping = true;
3309
3310 if (Vcb->balance.stopping)
3311 break;
3312 } while (changed);
3313
3314 c->changed = true;
3315 c->space_changed = true;
3316 }
3317
3318 if (Vcb->balance.stopping)
3319 goto end;
3320
3321 if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3322 (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3323 RemoveEntryList(&c->list_entry_balance);
3324 c->list_entry_balance.Flink = NULL;
3325
3326 Vcb->balance.chunks_left--;
3327 }
3328
3329 le = le2;
3330 }
3331
3332 // do metadata chunks
3333 while (!IsListEmpty(&chunks)) {
3334 chunk* c;
3335 bool changed;
3336
3337 le = RemoveHeadList(&chunks);
3338 c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3339
3340 if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3341 do {
3342 Status = balance_metadata_chunk(Vcb, c, &changed);
3343 if (!NT_SUCCESS(Status)) {
3344 ERR("balance_metadata_chunk returned %08lx\n", Status);
3345 Vcb->balance.status = Status;
3346 goto end;
3347 }
3348
3349 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3350
3351 if (Vcb->readonly)
3352 Vcb->balance.stopping = true;
3353
3354 if (Vcb->balance.stopping)
3355 break;
3356 } while (changed);
3357
3358 c->changed = true;
3359 c->space_changed = true;
3360 }
3361
3362 if (Vcb->balance.stopping)
3363 break;
3364
3365 c->list_entry_balance.Flink = NULL;
3366
3367 Vcb->balance.chunks_left--;
3368 }
3369
3370 end:
3371 if (!Vcb->readonly) {
3372 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3373 le = chunks.Flink;
3374 while (le != &chunks) {
3375 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3376 c->reloc = false;
3377
3378 le = le->Flink;
3379 c->list_entry_balance.Flink = NULL;
3380 }
3381
3382 if (old_data_flags != 0)
3383 Vcb->data_flags = old_data_flags;
3384
3385 if (old_metadata_flags != 0)
3386 Vcb->metadata_flags = old_metadata_flags;
3387
3388 if (old_system_flags != 0)
3389 Vcb->system_flags = old_system_flags;
3390 }
3391
3392 if (Vcb->balance.removing) {
3393 device* dev = NULL;
3394
3395 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3396
3397 le = Vcb->devices.Flink;
3398 while (le != &Vcb->devices) {
3399 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3400
3401 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3402 dev = dev2;
3403 break;
3404 }
3405
3406 le = le->Flink;
3407 }
3408
3409 if (dev) {
3410 if (Vcb->balance.chunks_left == 0) {
3411 Status = finish_removing_device(Vcb, dev);
3412
3413 if (!NT_SUCCESS(Status)) {
3414 ERR("finish_removing_device returned %08lx\n", Status);
3415 dev->reloc = false;
3416 }
3417 } else
3418 dev->reloc = false;
3419 }
3420
3421 ExReleaseResourceLite(&Vcb->tree_lock);
3422 } else if (Vcb->balance.shrinking) {
3423 device* dev = NULL;
3424
3425 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3426
3427 le = Vcb->devices.Flink;
3428 while (le != &Vcb->devices) {
3429 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3430
3431 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3432 dev = dev2;
3433 break;
3434 }
3435
3436 le = le->Flink;
3437 }
3438
3439 if (!dev) {
3440 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3441 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3442 }
3443
3444 if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3445 if (dev) {
3446 Status = regenerate_space_list(Vcb, dev);
3447 if (!NT_SUCCESS(Status))
3448 WARN("regenerate_space_list returned %08lx\n", Status);
3449 }
3450 } else {
3451 uint64_t old_size;
3452
3453 old_size = dev->devitem.num_bytes;
3454 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3455
3456 Status = update_dev_item(Vcb, dev, NULL);
3457 if (!NT_SUCCESS(Status)) {
3458 ERR("update_dev_item returned %08lx\n", Status);
3459 dev->devitem.num_bytes = old_size;
3460 Vcb->balance.status = Status;
3461
3462 Status = regenerate_space_list(Vcb, dev);
3463 if (!NT_SUCCESS(Status))
3464 WARN("regenerate_space_list returned %08lx\n", Status);
3465 } else {
3466 Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3467
3468 Status = do_write(Vcb, NULL);
3469 if (!NT_SUCCESS(Status))
3470 ERR("do_write returned %08lx\n", Status);
3471
3472 free_trees(Vcb);
3473 }
3474 }
3475
3476 ExReleaseResourceLite(&Vcb->tree_lock);
3477
3478 if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3479 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3480 } else {
3481 Status = remove_balance_item(Vcb);
3482 if (!NT_SUCCESS(Status)) {
3483 ERR("remove_balance_item returned %08lx\n", Status);
3484 goto end;
3485 }
3486 }
3487
3488 if (Vcb->trim && !Vcb->options.no_trim) {
3489 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3490
3491 le = Vcb->devices.Flink;
3492 while (le != &Vcb->devices) {
3493 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3494
3495 if (dev2->devobj && !dev2->readonly && dev2->trim)
3496 trim_unalloc_space(Vcb, dev2);
3497
3498 le = le->Flink;
3499 }
3500
3501 ExReleaseResourceLite(&Vcb->tree_lock);
3502 }
3503 }
3504
3505 ZwClose(Vcb->balance.thread);
3506 Vcb->balance.thread = NULL;
3507
3508 KeSetEvent(&Vcb->balance.finished, 0, false);
3509 }
3510
start_balance(device_extension * Vcb,void * data,ULONG length,KPROCESSOR_MODE processor_mode)3511 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3512 NTSTATUS Status;
3513 btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3514 OBJECT_ATTRIBUTES oa;
3515 uint8_t i;
3516
3517 if (length < sizeof(btrfs_start_balance) || !data)
3518 return STATUS_INVALID_PARAMETER;
3519
3520 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3521 return STATUS_PRIVILEGE_NOT_HELD;
3522
3523 if (Vcb->locked) {
3524 WARN("cannot start balance while locked\n");
3525 return STATUS_DEVICE_NOT_READY;
3526 }
3527
3528 if (Vcb->scrub.thread) {
3529 WARN("cannot start balance while scrub running\n");
3530 return STATUS_DEVICE_NOT_READY;
3531 }
3532
3533 if (Vcb->balance.thread) {
3534 WARN("balance already running\n");
3535 return STATUS_DEVICE_NOT_READY;
3536 }
3537
3538 if (Vcb->readonly)
3539 return STATUS_MEDIA_WRITE_PROTECTED;
3540
3541 if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3542 !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3543 !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3544 return STATUS_SUCCESS;
3545
3546 for (i = 0; i < 3; i++) {
3547 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3548 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3549 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3550 BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 |
3551 BLOCK_FLAG_RAID1C4;
3552
3553 if (bsb->opts[i].profiles == 0)
3554 return STATUS_INVALID_PARAMETER;
3555 }
3556
3557 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3558 if (bsb->opts[i].devid == 0)
3559 return STATUS_INVALID_PARAMETER;
3560 }
3561
3562 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3563 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3564 return STATUS_INVALID_PARAMETER;
3565 }
3566
3567 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3568 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3569 return STATUS_INVALID_PARAMETER;
3570 }
3571
3572 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3573 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3574 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3575
3576 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3577 return STATUS_INVALID_PARAMETER;
3578 }
3579
3580 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3581 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3582 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3583
3584 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3585 return STATUS_INVALID_PARAMETER;
3586 }
3587
3588 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3589 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3590 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3591
3592 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3593 return STATUS_INVALID_PARAMETER;
3594 }
3595
3596 if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3597 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3598 bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3599 bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3600 bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 &&
3601 bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3602 return STATUS_INVALID_PARAMETER;
3603 }
3604 }
3605 }
3606
3607 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3608 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3609 RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3610
3611 Vcb->balance.paused = false;
3612 Vcb->balance.removing = false;
3613 Vcb->balance.shrinking = false;
3614 Vcb->balance.status = STATUS_SUCCESS;
3615 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3616
3617 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3618
3619 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3620 if (!NT_SUCCESS(Status)) {
3621 ERR("PsCreateSystemThread returned %08lx\n", Status);
3622 return Status;
3623 }
3624
3625 return STATUS_SUCCESS;
3626 }
3627
3628 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3629 KEY searchkey;
3630 traverse_ptr tp;
3631 NTSTATUS Status;
3632 BALANCE_ITEM* bi;
3633 OBJECT_ATTRIBUTES oa;
3634 int i;
3635
3636 searchkey.obj_id = BALANCE_ITEM_ID;
3637 searchkey.obj_type = TYPE_TEMP_ITEM;
3638 searchkey.offset = 0;
3639
3640 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3641 if (!NT_SUCCESS(Status)) {
3642 ERR("find_item returned %08lx\n", Status);
3643 return Status;
3644 }
3645
3646 if (keycmp(tp.item->key, searchkey)) {
3647 TRACE("no balance item found\n");
3648 return STATUS_NOT_FOUND;
3649 }
3650
3651 if (tp.item->size < sizeof(BALANCE_ITEM)) {
3652 WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3653 tp.item->size, sizeof(BALANCE_ITEM));
3654 return STATUS_INTERNAL_ERROR;
3655 }
3656
3657 bi = (BALANCE_ITEM*)tp.item->data;
3658
3659 if (bi->flags & BALANCE_FLAGS_DATA)
3660 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3661
3662 if (bi->flags & BALANCE_FLAGS_METADATA)
3663 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3664
3665 if (bi->flags & BALANCE_FLAGS_SYSTEM)
3666 load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3667
3668 // do the heuristics that Linux driver does
3669
3670 for (i = 0; i < 3; i++) {
3671 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3672 // if converting, don't redo chunks already done
3673
3674 if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3675 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3676
3677 // don't balance chunks more than 90% filled - presumably these
3678 // have already been done
3679
3680 if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3681 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3682 ) {
3683 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3684 Vcb->balance.opts[i].usage_start = 0;
3685 Vcb->balance.opts[i].usage_end = 90;
3686 }
3687 }
3688 }
3689
3690 if (Vcb->readonly || Vcb->options.skip_balance)
3691 Vcb->balance.paused = true;
3692 else
3693 Vcb->balance.paused = false;
3694
3695 Vcb->balance.removing = false;
3696 Vcb->balance.shrinking = false;
3697 Vcb->balance.status = STATUS_SUCCESS;
3698 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3699
3700 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3701
3702 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3703 if (!NT_SUCCESS(Status)) {
3704 ERR("PsCreateSystemThread returned %08lx\n", Status);
3705 return Status;
3706 }
3707
3708 return STATUS_SUCCESS;
3709 }
3710
query_balance(device_extension * Vcb,void * data,ULONG length)3711 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3712 btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3713
3714 if (length < sizeof(btrfs_query_balance) || !data)
3715 return STATUS_INVALID_PARAMETER;
3716
3717 if (!Vcb->balance.thread) {
3718 bqb->status = BTRFS_BALANCE_STOPPED;
3719
3720 if (!NT_SUCCESS(Vcb->balance.status)) {
3721 bqb->status |= BTRFS_BALANCE_ERROR;
3722 bqb->error = Vcb->balance.status;
3723 }
3724
3725 return STATUS_SUCCESS;
3726 }
3727
3728 bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3729
3730 if (Vcb->balance.removing)
3731 bqb->status |= BTRFS_BALANCE_REMOVAL;
3732
3733 if (Vcb->balance.shrinking)
3734 bqb->status |= BTRFS_BALANCE_SHRINKING;
3735
3736 if (!NT_SUCCESS(Vcb->balance.status))
3737 bqb->status |= BTRFS_BALANCE_ERROR;
3738
3739 bqb->chunks_left = Vcb->balance.chunks_left;
3740 bqb->total_chunks = Vcb->balance.total_chunks;
3741 bqb->error = Vcb->balance.status;
3742 RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3743 RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3744 RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3745
3746 return STATUS_SUCCESS;
3747 }
3748
pause_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3749 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3750 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3751 return STATUS_PRIVILEGE_NOT_HELD;
3752
3753 if (!Vcb->balance.thread)
3754 return STATUS_DEVICE_NOT_READY;
3755
3756 if (Vcb->balance.paused)
3757 return STATUS_DEVICE_NOT_READY;
3758
3759 Vcb->balance.paused = true;
3760 KeClearEvent(&Vcb->balance.event);
3761
3762 return STATUS_SUCCESS;
3763 }
3764
resume_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3765 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3766 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3767 return STATUS_PRIVILEGE_NOT_HELD;
3768
3769 if (!Vcb->balance.thread)
3770 return STATUS_DEVICE_NOT_READY;
3771
3772 if (!Vcb->balance.paused)
3773 return STATUS_DEVICE_NOT_READY;
3774
3775 if (Vcb->readonly)
3776 return STATUS_MEDIA_WRITE_PROTECTED;
3777
3778 Vcb->balance.paused = false;
3779 KeSetEvent(&Vcb->balance.event, 0, false);
3780
3781 return STATUS_SUCCESS;
3782 }
3783
stop_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3784 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3785 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3786 return STATUS_PRIVILEGE_NOT_HELD;
3787
3788 if (!Vcb->balance.thread)
3789 return STATUS_DEVICE_NOT_READY;
3790
3791 Vcb->balance.paused = false;
3792 Vcb->balance.stopping = true;
3793 Vcb->balance.status = STATUS_SUCCESS;
3794 KeSetEvent(&Vcb->balance.event, 0, false);
3795
3796 return STATUS_SUCCESS;
3797 }
3798
remove_device(device_extension * Vcb,void * data,ULONG length,KPROCESSOR_MODE processor_mode)3799 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3800 uint64_t devid;
3801 LIST_ENTRY* le;
3802 device* dev = NULL;
3803 NTSTATUS Status;
3804 int i;
3805 uint64_t num_rw_devices;
3806 OBJECT_ATTRIBUTES oa;
3807
3808 TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3809
3810 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3811 return STATUS_PRIVILEGE_NOT_HELD;
3812
3813 if (length < sizeof(uint64_t))
3814 return STATUS_INVALID_PARAMETER;
3815
3816 devid = *(uint64_t*)data;
3817
3818 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3819
3820 if (Vcb->readonly) {
3821 ExReleaseResourceLite(&Vcb->tree_lock);
3822 return STATUS_MEDIA_WRITE_PROTECTED;
3823 }
3824
3825 num_rw_devices = 0;
3826
3827 le = Vcb->devices.Flink;
3828 while (le != &Vcb->devices) {
3829 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3830
3831 if (dev2->devitem.dev_id == devid)
3832 dev = dev2;
3833
3834 if (!dev2->readonly)
3835 num_rw_devices++;
3836
3837 le = le->Flink;
3838 }
3839
3840 if (!dev) {
3841 ExReleaseResourceLite(&Vcb->tree_lock);
3842 WARN("device %I64x not found\n", devid);
3843 return STATUS_NOT_FOUND;
3844 }
3845
3846 if (!dev->readonly) {
3847 if (num_rw_devices == 1) {
3848 ExReleaseResourceLite(&Vcb->tree_lock);
3849 WARN("not removing last non-readonly device\n");
3850 return STATUS_INVALID_PARAMETER;
3851 }
3852
3853 if (num_rw_devices == 4 &&
3854 ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3855 (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3856 (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3857 )
3858 ) {
3859 ExReleaseResourceLite(&Vcb->tree_lock);
3860 ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3861 return STATUS_CANNOT_DELETE;
3862 }
3863
3864 if (num_rw_devices == 3 &&
3865 ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) ||
3866 (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3))
3867 ) {
3868 ExReleaseResourceLite(&Vcb->tree_lock);
3869 ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n");
3870 return STATUS_CANNOT_DELETE;
3871 }
3872
3873 if (num_rw_devices == 2 &&
3874 ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3875 (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3876 ) {
3877 ExReleaseResourceLite(&Vcb->tree_lock);
3878 ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3879 return STATUS_CANNOT_DELETE;
3880 }
3881 }
3882
3883 ExReleaseResourceLite(&Vcb->tree_lock);
3884
3885 if (Vcb->balance.thread) {
3886 WARN("balance already running\n");
3887 return STATUS_DEVICE_NOT_READY;
3888 }
3889
3890 dev->reloc = true;
3891
3892 RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3893
3894 for (i = 0; i < 3; i++) {
3895 Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3896 Vcb->balance.opts[i].devid = devid;
3897 }
3898
3899 Vcb->balance.paused = false;
3900 Vcb->balance.removing = true;
3901 Vcb->balance.shrinking = false;
3902 Vcb->balance.status = STATUS_SUCCESS;
3903 KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3904
3905 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3906
3907 Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3908 if (!NT_SUCCESS(Status)) {
3909 ERR("PsCreateSystemThread returned %08lx\n", Status);
3910 dev->reloc = false;
3911 return Status;
3912 }
3913
3914 return STATUS_SUCCESS;
3915 }
3916