xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 139a3d66)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22 
23 typedef struct {
24     uint64_t address;
25     uint64_t new_address;
26     tree_header* data;
27     EXTENT_ITEM* ei;
28     tree* t;
29     bool system;
30     LIST_ENTRY refs;
31     LIST_ENTRY list_entry;
32 } metadata_reloc;
33 
34 typedef struct {
35     uint8_t type;
36     uint64_t hash;
37 
38     union {
39         TREE_BLOCK_REF tbr;
40         SHARED_BLOCK_REF sbr;
41     };
42 
43     metadata_reloc* parent;
44     bool top;
45     LIST_ENTRY list_entry;
46 } metadata_reloc_ref;
47 
48 typedef struct {
49     uint64_t address;
50     uint64_t size;
51     uint64_t new_address;
52     chunk* newchunk;
53     EXTENT_ITEM* ei;
54     LIST_ENTRY refs;
55     LIST_ENTRY list_entry;
56 } data_reloc;
57 
58 typedef struct {
59     uint8_t type;
60     uint64_t hash;
61 
62     union {
63         EXTENT_DATA_REF edr;
64         SHARED_DATA_REF sdr;
65     };
66 
67     metadata_reloc* parent;
68     LIST_ENTRY list_entry;
69 } data_reloc_ref;
70 
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74 
75 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
76 
77 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
78                                    bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
79     NTSTATUS Status;
80     metadata_reloc* mr;
81     EXTENT_ITEM* ei;
82     uint16_t len;
83     uint64_t inline_rc;
84     uint8_t* ptr;
85 
86     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
87     if (!mr) {
88         ERR("out of memory\n");
89         return STATUS_INSUFFICIENT_RESOURCES;
90     }
91 
92     mr->address = tp->item->key.obj_id;
93     mr->data = NULL;
94     mr->ei = (EXTENT_ITEM*)tp->item->data;
95     mr->system = false;
96     InitializeListHead(&mr->refs);
97 
98     Status = delete_tree_item(Vcb, tp);
99     if (!NT_SUCCESS(Status)) {
100         ERR("delete_tree_item returned %08lx\n", Status);
101         ExFreePool(mr);
102         return Status;
103     }
104 
105     if (!c)
106         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
107 
108     if (c) {
109         acquire_chunk_lock(c, Vcb);
110 
111         c->used -= Vcb->superblock.node_size;
112 
113         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
114 
115         release_chunk_lock(c, Vcb);
116     }
117 
118     ei = (EXTENT_ITEM*)tp->item->data;
119     inline_rc = 0;
120 
121     len = tp->item->size - sizeof(EXTENT_ITEM);
122     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
123     if (!skinny) {
124         len -= sizeof(EXTENT_ITEM2);
125         ptr += sizeof(EXTENT_ITEM2);
126     }
127 
128     while (len > 0) {
129         uint8_t secttype = *ptr;
130         uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
131         metadata_reloc_ref* ref;
132 
133         len--;
134 
135         if (sectlen > len) {
136             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
137             return STATUS_INTERNAL_ERROR;
138         }
139 
140         if (sectlen == 0) {
141             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
142             return STATUS_INTERNAL_ERROR;
143         }
144 
145         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
146         if (!ref) {
147             ERR("out of memory\n");
148             return STATUS_INSUFFICIENT_RESOURCES;
149         }
150 
151         if (secttype == TYPE_TREE_BLOCK_REF) {
152             ref->type = TYPE_TREE_BLOCK_REF;
153             RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
154             inline_rc++;
155         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
156             ref->type = TYPE_SHARED_BLOCK_REF;
157             RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
158             inline_rc++;
159         } else {
160             ERR("unexpected tree type %x\n", secttype);
161             ExFreePool(ref);
162             return STATUS_INTERNAL_ERROR;
163         }
164 
165         ref->parent = NULL;
166         ref->top = false;
167         InsertTailList(&mr->refs, &ref->list_entry);
168 
169         len -= sectlen;
170         ptr += sizeof(uint8_t) + sectlen;
171     }
172 
173     if (inline_rc < ei->refcount) { // look for non-inline entries
174         traverse_ptr tp2 = *tp, next_tp;
175 
176         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
177             tp2 = next_tp;
178 
179             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
180                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
181                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
182                     if (!ref) {
183                         ERR("out of memory\n");
184                         return STATUS_INSUFFICIENT_RESOURCES;
185                     }
186 
187                     ref->type = TYPE_TREE_BLOCK_REF;
188                     ref->tbr.offset = tp2.item->key.offset;
189                     ref->parent = NULL;
190                     ref->top = false;
191                     InsertTailList(&mr->refs, &ref->list_entry);
192 
193                     Status = delete_tree_item(Vcb, &tp2);
194                     if (!NT_SUCCESS(Status)) {
195                         ERR("delete_tree_item returned %08lx\n", Status);
196                         return Status;
197                     }
198                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
199                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
200                     if (!ref) {
201                         ERR("out of memory\n");
202                         return STATUS_INSUFFICIENT_RESOURCES;
203                     }
204 
205                     ref->type = TYPE_SHARED_BLOCK_REF;
206                     ref->sbr.offset = tp2.item->key.offset;
207                     ref->parent = NULL;
208                     ref->top = false;
209                     InsertTailList(&mr->refs, &ref->list_entry);
210 
211                     Status = delete_tree_item(Vcb, &tp2);
212                     if (!NT_SUCCESS(Status)) {
213                         ERR("delete_tree_item returned %08lx\n", Status);
214                         return Status;
215                     }
216                 }
217             } else
218                 break;
219         }
220     }
221 
222     InsertTailList(items, &mr->list_entry);
223 
224     if (mr2)
225         *mr2 = mr;
226 
227     return STATUS_SUCCESS;
228 }
229 
230 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
231                                           uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
232     LIST_ENTRY* le;
233     KEY searchkey;
234     traverse_ptr tp;
235     bool skinny = false;
236     NTSTATUS Status;
237 
238     le = items->Flink;
239     while (le != items) {
240         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
241 
242         if (mr->address == address) {
243             *mr2 = mr;
244             return STATUS_SUCCESS;
245         }
246 
247         le = le->Flink;
248     }
249 
250     searchkey.obj_id = address;
251     searchkey.obj_type = TYPE_METADATA_ITEM;
252     searchkey.offset = 0xffffffffffffffff;
253 
254     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
255     if (!NT_SUCCESS(Status)) {
256         ERR("find_item returned %08lx\n", Status);
257         return Status;
258     }
259 
260     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
261         skinny = true;
262     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
263              tp.item->size >= sizeof(EXTENT_ITEM)) {
264         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
265 
266         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
267             ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
268             return STATUS_INTERNAL_ERROR;
269         }
270     } else {
271         ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
272         return STATUS_INTERNAL_ERROR;
273     }
274 
275     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
276     if (!NT_SUCCESS(Status)) {
277         ERR("add_metadata_reloc returned %08lx\n", Status);
278         return Status;
279     }
280 
281     return STATUS_SUCCESS;
282 }
283 
284 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
285     LIST_ENTRY newlist, *le;
286 
287     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
288         return;
289 
290     // insertion sort
291 
292     InitializeListHead(&newlist);
293 
294     while (!IsListEmpty(&mr->refs)) {
295         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
296         bool inserted = false;
297 
298         if (ref->type == TYPE_TREE_BLOCK_REF)
299             ref->hash = ref->tbr.offset;
300         else if (ref->type == TYPE_SHARED_BLOCK_REF)
301             ref->hash = ref->parent->new_address;
302 
303         le = newlist.Flink;
304         while (le != &newlist) {
305             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
306 
307             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
308                 InsertHeadList(le->Blink, &ref->list_entry);
309                 inserted = true;
310                 break;
311             }
312 
313             le = le->Flink;
314         }
315 
316         if (!inserted)
317             InsertTailList(&newlist, &ref->list_entry);
318     }
319 
320     newlist.Flink->Blink = &mr->refs;
321     newlist.Blink->Flink = &mr->refs;
322     mr->refs.Flink = newlist.Flink;
323     mr->refs.Blink = newlist.Blink;
324 }
325 
326 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
327     NTSTATUS Status;
328     LIST_ENTRY* le;
329     uint64_t rc = 0;
330     uint16_t inline_len;
331     bool all_inline = true;
332     metadata_reloc_ref* first_noninline = NULL;
333     EXTENT_ITEM* ei;
334     uint8_t* ptr;
335 
336     inline_len = sizeof(EXTENT_ITEM);
337     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
338         inline_len += sizeof(EXTENT_ITEM2);
339 
340     sort_metadata_reloc_refs(mr);
341 
342     le = mr->refs.Flink;
343     while (le != &mr->refs) {
344         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
345         uint16_t extlen = 0;
346 
347         rc++;
348 
349         if (ref->type == TYPE_TREE_BLOCK_REF)
350             extlen += sizeof(TREE_BLOCK_REF);
351         else if (ref->type == TYPE_SHARED_BLOCK_REF)
352             extlen += sizeof(SHARED_BLOCK_REF);
353 
354         if (all_inline) {
355             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
356                 all_inline = false;
357                 first_noninline = ref;
358             } else
359                 inline_len += extlen + 1;
360         }
361 
362         le = le->Flink;
363     }
364 
365     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
366     if (!ei) {
367         ERR("out of memory\n");
368         return STATUS_INSUFFICIENT_RESOURCES;
369     }
370 
371     ei->refcount = rc;
372     ei->generation = mr->ei->generation;
373     ei->flags = mr->ei->flags;
374     ptr = (uint8_t*)&ei[1];
375 
376     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
377         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
378 
379         ei2->firstitem = *(KEY*)&mr->data[1];
380         ei2->level = mr->data->level;
381 
382         ptr += sizeof(EXTENT_ITEM2);
383     }
384 
385     le = mr->refs.Flink;
386     while (le != &mr->refs) {
387         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
388 
389         if (ref == first_noninline)
390             break;
391 
392         *ptr = ref->type;
393         ptr++;
394 
395         if (ref->type == TYPE_TREE_BLOCK_REF) {
396             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
397 
398             tbr->offset = ref->tbr.offset;
399 
400             ptr += sizeof(TREE_BLOCK_REF);
401         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
402             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
403 
404             sbr->offset = ref->parent->new_address;
405 
406             ptr += sizeof(SHARED_BLOCK_REF);
407         }
408 
409         le = le->Flink;
410     }
411 
412     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
413         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
414     else
415         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
416 
417     if (!NT_SUCCESS(Status)) {
418         ERR("insert_tree_item returned %08lx\n", Status);
419         ExFreePool(ei);
420         return Status;
421     }
422 
423     if (!all_inline) {
424         le = &first_noninline->list_entry;
425 
426         while (le != &mr->refs) {
427             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
428 
429             if (ref->type == TYPE_TREE_BLOCK_REF) {
430                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
431                 if (!NT_SUCCESS(Status)) {
432                     ERR("insert_tree_item returned %08lx\n", Status);
433                     return Status;
434                 }
435             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
436                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
437                 if (!NT_SUCCESS(Status)) {
438                     ERR("insert_tree_item returned %08lx\n", Status);
439                     return Status;
440                 }
441             }
442 
443             le = le->Flink;
444         }
445     }
446 
447     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
448         if (mr->data->level > 0) {
449             uint16_t i;
450             internal_node* in = (internal_node*)&mr->data[1];
451 
452             for (i = 0; i < mr->data->num_items; i++) {
453                 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
454 
455                 if (sbrrc > 0) {
456                     SHARED_BLOCK_REF sbr;
457 
458                     sbr.offset = mr->new_address;
459 
460                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
461                     if (!NT_SUCCESS(Status)) {
462                         ERR("increase_extent_refcount returned %08lx\n", Status);
463                         return Status;
464                     }
465 
466                     sbr.offset = mr->address;
467 
468                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
469                                                       sbr.offset, false, NULL);
470                     if (!NT_SUCCESS(Status)) {
471                         ERR("decrease_extent_refcount returned %08lx\n", Status);
472                         return Status;
473                     }
474                 }
475             }
476         } else {
477             uint16_t i;
478             leaf_node* ln = (leaf_node*)&mr->data[1];
479 
480             for (i = 0; i < mr->data->num_items; i++) {
481                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
482                     EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
483 
484                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
485                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
486 
487                         if (ed2->size > 0) { // not sparse
488                             uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
489 
490                             if (sdrrc > 0) {
491                                 SHARED_DATA_REF sdr;
492                                 chunk* c;
493 
494                                 sdr.offset = mr->new_address;
495                                 sdr.count = sdrrc;
496 
497                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
498                                 if (!NT_SUCCESS(Status)) {
499                                     ERR("increase_extent_refcount returned %08lx\n", Status);
500                                     return Status;
501                                 }
502 
503                                 sdr.offset = mr->address;
504 
505                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
506                                                                   sdr.offset, false, NULL);
507                                 if (!NT_SUCCESS(Status)) {
508                                     ERR("decrease_extent_refcount returned %08lx\n", Status);
509                                     return Status;
510                                 }
511 
512                                 c = get_chunk_from_address(Vcb, ed2->address);
513 
514                                 if (c) {
515                                     // check changed_extents
516 
517                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
518 
519                                     le = c->changed_extents.Flink;
520 
521                                     while (le != &c->changed_extents) {
522                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
523 
524                                         if (ce->address == ed2->address) {
525                                             LIST_ENTRY* le2;
526 
527                                             le2 = ce->refs.Flink;
528                                             while (le2 != &ce->refs) {
529                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
530 
531                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
532                                                     cer->sdr.offset = mr->new_address;
533                                                     break;
534                                                 }
535 
536                                                 le2 = le2->Flink;
537                                             }
538 
539                                             le2 = ce->old_refs.Flink;
540                                             while (le2 != &ce->old_refs) {
541                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
542 
543                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
544                                                     cer->sdr.offset = mr->new_address;
545                                                     break;
546                                                 }
547 
548                                                 le2 = le2->Flink;
549                                             }
550 
551                                             break;
552                                         }
553 
554                                         le = le->Flink;
555                                     }
556 
557                                     ExReleaseResourceLite(&c->changed_extents_lock);
558                                 }
559                             }
560                         }
561                     }
562                 }
563             }
564         }
565     }
566 
567     return STATUS_SUCCESS;
568 }
569 
570 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
571                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
572     LIST_ENTRY tree_writes, *le;
573     NTSTATUS Status;
574     traverse_ptr tp;
575     uint8_t level, max_level = 0;
576     chunk* newchunk = NULL;
577 
578     InitializeListHead(&tree_writes);
579 
580     le = items->Flink;
581     while (le != items) {
582         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
583         LIST_ENTRY* le2;
584         chunk* pc;
585 
586         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
587         if (!mr->data) {
588             ERR("out of memory\n");
589             return STATUS_INSUFFICIENT_RESOURCES;
590         }
591 
592         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
593                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
594         if (!NT_SUCCESS(Status)) {
595             ERR("read_data returned %08lx\n", Status);
596             return Status;
597         }
598 
599         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
600             mr->system = true;
601 
602         if (data_items && mr->data->level == 0) {
603             le2 = data_items->Flink;
604             while (le2 != data_items) {
605                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
606                 leaf_node* ln = (leaf_node*)&mr->data[1];
607                 uint16_t i;
608 
609                 for (i = 0; i < mr->data->num_items; i++) {
610                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
611                         EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
612 
613                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
614                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
615 
616                             if (ed2->address == dr->address)
617                                 ed2->address = dr->new_address;
618                         }
619                     }
620                 }
621 
622                 le2 = le2->Flink;
623             }
624         }
625 
626         if (mr->data->level > max_level)
627             max_level = mr->data->level;
628 
629         le2 = mr->refs.Flink;
630         while (le2 != &mr->refs) {
631             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
632 
633             if (ref->type == TYPE_TREE_BLOCK_REF) {
634                 KEY* firstitem;
635                 root* r = NULL;
636                 LIST_ENTRY* le3;
637                 tree* t;
638 
639                 firstitem = (KEY*)&mr->data[1];
640 
641                 le3 = Vcb->roots.Flink;
642                 while (le3 != &Vcb->roots) {
643                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
644 
645                     if (r2->id == ref->tbr.offset) {
646                         r = r2;
647                         break;
648                     }
649 
650                     le3 = le3->Flink;
651                 }
652 
653                 if (!r) {
654                     ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
655                     return STATUS_INTERNAL_ERROR;
656                 }
657 
658                 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
659                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
660                     ERR("find_item_to_level returned %08lx\n", Status);
661                     return Status;
662                 }
663 
664                 t = tp.tree;
665                 while (t && t->header.level < mr->data->level + 1) {
666                     t = t->parent;
667                 }
668 
669                 if (!t)
670                     ref->top = true;
671                 else {
672                     metadata_reloc* mr2;
673 
674                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
675                     if (!NT_SUCCESS(Status)) {
676                         ERR("add_metadata_reloc_parent returned %08lx\n", Status);
677                         return Status;
678                     }
679 
680                     ref->parent = mr2;
681                 }
682             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
683                 metadata_reloc* mr2;
684 
685                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
686                 if (!NT_SUCCESS(Status)) {
687                     ERR("add_metadata_reloc_parent returned %08lx\n", Status);
688                     return Status;
689                 }
690 
691                 ref->parent = mr2;
692             }
693 
694             le2 = le2->Flink;
695         }
696 
697         le = le->Flink;
698     }
699 
700     le = items->Flink;
701     while (le != items) {
702         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
703         LIST_ENTRY* le2;
704         uint32_t hash;
705 
706         mr->t = NULL;
707 
708         hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
709 
710         le2 = Vcb->trees_ptrs[hash >> 24];
711 
712         if (le2) {
713             while (le2 != &Vcb->trees_hash) {
714                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
715 
716                 if (t->header.address == mr->address) {
717                     mr->t = t;
718                     break;
719                 } else if (t->hash > hash)
720                     break;
721 
722                 le2 = le2->Flink;
723             }
724         }
725 
726         le = le->Flink;
727     }
728 
729     for (level = 0; level <= max_level; level++) {
730         le = items->Flink;
731         while (le != items) {
732             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
733 
734             if (mr->data->level == level) {
735                 bool done = false;
736                 LIST_ENTRY* le2;
737                 tree_write* tw;
738                 uint64_t flags;
739                 tree* t3;
740 
741                 if (mr->system)
742                     flags = Vcb->system_flags;
743                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
744                     flags = Vcb->data_flags;
745                 else
746                     flags = Vcb->metadata_flags;
747 
748                 if (newchunk) {
749                     acquire_chunk_lock(newchunk, Vcb);
750 
751                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
752                         newchunk->used += Vcb->superblock.node_size;
753                         space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
754                         done = true;
755                     }
756 
757                     release_chunk_lock(newchunk, Vcb);
758                 }
759 
760                 if (!done) {
761                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
762 
763                     le2 = Vcb->chunks.Flink;
764                     while (le2 != &Vcb->chunks) {
765                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
766 
767                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
768                             acquire_chunk_lock(c2, Vcb);
769 
770                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
771                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
772                                     c2->used += Vcb->superblock.node_size;
773                                     space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
774                                     release_chunk_lock(c2, Vcb);
775                                     newchunk = c2;
776                                     done = true;
777                                     break;
778                                 }
779                             }
780 
781                             release_chunk_lock(c2, Vcb);
782                         }
783 
784                         le2 = le2->Flink;
785                     }
786 
787                     // allocate new chunk if necessary
788                     if (!done) {
789                         Status = alloc_chunk(Vcb, flags, &newchunk, false);
790 
791                         if (!NT_SUCCESS(Status)) {
792                             ERR("alloc_chunk returned %08lx\n", Status);
793                             ExReleaseResourceLite(&Vcb->chunk_lock);
794                             goto end;
795                         }
796 
797                         acquire_chunk_lock(newchunk, Vcb);
798 
799                         newchunk->balance_num = Vcb->balance.balance_num;
800 
801                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
802                             release_chunk_lock(newchunk, Vcb);
803                             ExReleaseResourceLite(&Vcb->chunk_lock);
804                             ERR("could not find address in new chunk\n");
805                             Status = STATUS_DISK_FULL;
806                             goto end;
807                         } else {
808                             newchunk->used += Vcb->superblock.node_size;
809                             space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
810                         }
811 
812                         release_chunk_lock(newchunk, Vcb);
813                     }
814 
815                     ExReleaseResourceLite(&Vcb->chunk_lock);
816                 }
817 
818                 // update parents
819                 le2 = mr->refs.Flink;
820                 while (le2 != &mr->refs) {
821                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
822 
823                     if (ref->parent) {
824                         uint16_t i;
825                         internal_node* in = (internal_node*)&ref->parent->data[1];
826 
827                         for (i = 0; i < ref->parent->data->num_items; i++) {
828                             if (in[i].address == mr->address) {
829                                 in[i].address = mr->new_address;
830                                 break;
831                             }
832                         }
833 
834                         if (ref->parent->t) {
835                             LIST_ENTRY* le3;
836 
837                             le3 = ref->parent->t->itemlist.Flink;
838                             while (le3 != &ref->parent->t->itemlist) {
839                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
840 
841                                 if (!td->inserted && td->treeholder.address == mr->address)
842                                     td->treeholder.address = mr->new_address;
843 
844                                 le3 = le3->Flink;
845                             }
846                         }
847                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
848                         LIST_ENTRY* le3;
849                         root* r = NULL;
850 
851                         // alter ROOT_ITEM
852 
853                         le3 = Vcb->roots.Flink;
854                         while (le3 != &Vcb->roots) {
855                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
856 
857                             if (r2->id == ref->tbr.offset) {
858                                 r = r2;
859                                 break;
860                             }
861 
862                             le3 = le3->Flink;
863                         }
864 
865                         if (r) {
866                             r->treeholder.address = mr->new_address;
867 
868                             if (r == Vcb->root_root)
869                                 Vcb->superblock.root_tree_addr = mr->new_address;
870                             else if (r == Vcb->chunk_root)
871                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
872                             else if (r->root_item.block_number == mr->address) {
873                                 KEY searchkey;
874                                 ROOT_ITEM* ri;
875 
876                                 r->root_item.block_number = mr->new_address;
877 
878                                 searchkey.obj_id = r->id;
879                                 searchkey.obj_type = TYPE_ROOT_ITEM;
880                                 searchkey.offset = 0xffffffffffffffff;
881 
882                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
883                                 if (!NT_SUCCESS(Status)) {
884                                     ERR("find_item returned %08lx\n", Status);
885                                     goto end;
886                                 }
887 
888                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
889                                     ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
890                                     Status = STATUS_INTERNAL_ERROR;
891                                     goto end;
892                                 }
893 
894                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
895                                 if (!ri) {
896                                     ERR("out of memory\n");
897                                     Status = STATUS_INSUFFICIENT_RESOURCES;
898                                     goto end;
899                                 }
900 
901                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
902 
903                                 Status = delete_tree_item(Vcb, &tp);
904                                 if (!NT_SUCCESS(Status)) {
905                                     ERR("delete_tree_item returned %08lx\n", Status);
906                                     goto end;
907                                 }
908 
909                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
910                                 if (!NT_SUCCESS(Status)) {
911                                     ERR("insert_tree_item returned %08lx\n", Status);
912                                     goto end;
913                                 }
914                             }
915                         }
916                     }
917 
918                     le2 = le2->Flink;
919                 }
920 
921                 mr->data->address = mr->new_address;
922 
923                 t3 = mr->t;
924 
925                 while (t3) {
926                     uint8_t h;
927                     bool inserted;
928                     tree* t4 = NULL;
929 
930                     // check if tree loaded more than once
931                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
932                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
933 
934                         if (nt->header.address == t3->header.address)
935                             t4 = nt;
936                     }
937 
938                     t3->header.address = mr->new_address;
939 
940                     h = t3->hash >> 24;
941 
942                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
943                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
944                             Vcb->trees_ptrs[h] = NULL;
945                         else {
946                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
947 
948                             if (t2->hash >> 24 == h)
949                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
950                             else
951                                 Vcb->trees_ptrs[h] = NULL;
952                         }
953                     }
954 
955                     RemoveEntryList(&t3->list_entry_hash);
956 
957                     t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
958                     h = t3->hash >> 24;
959 
960                     if (!Vcb->trees_ptrs[h]) {
961                         uint8_t h2 = h;
962 
963                         le2 = Vcb->trees_hash.Flink;
964 
965                         if (h2 > 0) {
966                             h2--;
967                             do {
968                                 if (Vcb->trees_ptrs[h2]) {
969                                     le2 = Vcb->trees_ptrs[h2];
970                                     break;
971                                 }
972 
973                                 h2--;
974                             } while (h2 > 0);
975                         }
976                     } else
977                         le2 = Vcb->trees_ptrs[h];
978 
979                     inserted = false;
980                     while (le2 != &Vcb->trees_hash) {
981                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
982 
983                         if (t2->hash >= t3->hash) {
984                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
985                             inserted = true;
986                             break;
987                         }
988 
989                         le2 = le2->Flink;
990                     }
991 
992                     if (!inserted)
993                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
994 
995                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
996                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
997 
998                     if (data_items && level == 0) {
999                         le2 = data_items->Flink;
1000 
1001                         while (le2 != data_items) {
1002                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1003                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1004 
1005                             while (le3 != &t3->itemlist) {
1006                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1007 
1008                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1009                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1010 
1011                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1012                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1013 
1014                                         if (ed2->address == dr->address)
1015                                             ed2->address = dr->new_address;
1016                                     }
1017                                 }
1018 
1019                                 le3 = le3->Flink;
1020                             }
1021 
1022                             le2 = le2->Flink;
1023                         }
1024                     }
1025 
1026                     t3 = t4;
1027                 }
1028 
1029                 calc_tree_checksum(Vcb, mr->data);
1030 
1031                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1032                 if (!tw) {
1033                     ERR("out of memory\n");
1034                     Status = STATUS_INSUFFICIENT_RESOURCES;
1035                     goto end;
1036                 }
1037 
1038                 tw->address = mr->new_address;
1039                 tw->length = Vcb->superblock.node_size;
1040                 tw->data = (uint8_t*)mr->data;
1041                 tw->allocated = false;
1042 
1043                 if (IsListEmpty(&tree_writes))
1044                     InsertTailList(&tree_writes, &tw->list_entry);
1045                 else {
1046                     bool inserted = false;
1047 
1048                     le2 = tree_writes.Flink;
1049                     while (le2 != &tree_writes) {
1050                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1051 
1052                         if (tw2->address > tw->address) {
1053                             InsertHeadList(le2->Blink, &tw->list_entry);
1054                             inserted = true;
1055                             break;
1056                         }
1057 
1058                         le2 = le2->Flink;
1059                     }
1060 
1061                     if (!inserted)
1062                         InsertTailList(&tree_writes, &tw->list_entry);
1063                 }
1064             }
1065 
1066             le = le->Flink;
1067         }
1068     }
1069 
1070     Status = do_tree_writes(Vcb, &tree_writes, true);
1071     if (!NT_SUCCESS(Status)) {
1072         ERR("do_tree_writes returned %08lx\n", Status);
1073         goto end;
1074     }
1075 
1076     le = items->Flink;
1077     while (le != items) {
1078         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1079 
1080         Status = add_metadata_reloc_extent_item(Vcb, mr);
1081         if (!NT_SUCCESS(Status)) {
1082             ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1083             goto end;
1084         }
1085 
1086         le = le->Flink;
1087     }
1088 
1089     Status = STATUS_SUCCESS;
1090 
1091 end:
1092     while (!IsListEmpty(&tree_writes)) {
1093         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1094 
1095         if (tw->allocated)
1096             ExFreePool(tw->data);
1097 
1098         ExFreePool(tw);
1099     }
1100 
1101     return Status;
1102 }
1103 
1104 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1105     KEY searchkey;
1106     traverse_ptr tp;
1107     NTSTATUS Status;
1108     bool b;
1109     LIST_ENTRY items, rollback;
1110     uint32_t loaded = 0;
1111 
1112     TRACE("chunk %I64x\n", c->offset);
1113 
1114     InitializeListHead(&rollback);
1115     InitializeListHead(&items);
1116 
1117     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1118 
1119     searchkey.obj_id = c->offset;
1120     searchkey.obj_type = TYPE_METADATA_ITEM;
1121     searchkey.offset = 0xffffffffffffffff;
1122 
1123     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1124     if (!NT_SUCCESS(Status)) {
1125         ERR("find_item returned %08lx\n", Status);
1126         goto end;
1127     }
1128 
1129     do {
1130         traverse_ptr next_tp;
1131 
1132         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1133             break;
1134 
1135         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1136             bool tree = false, skinny = false;
1137 
1138             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1139                 tree = true;
1140                 skinny = true;
1141             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1142                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1143                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1144 
1145                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1146                     tree = true;
1147             }
1148 
1149             if (tree) {
1150                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1151 
1152                 if (!NT_SUCCESS(Status)) {
1153                     ERR("add_metadata_reloc returned %08lx\n", Status);
1154                     goto end;
1155                 }
1156 
1157                 loaded++;
1158 
1159                 if (loaded >= 64) // only do 64 at a time
1160                     break;
1161             }
1162         }
1163 
1164         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1165 
1166         if (b)
1167             tp = next_tp;
1168     } while (b);
1169 
1170     if (IsListEmpty(&items)) {
1171         *changed = false;
1172         Status = STATUS_SUCCESS;
1173         goto end;
1174     } else
1175         *changed = true;
1176 
1177     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1178     if (!NT_SUCCESS(Status)) {
1179         ERR("write_metadata_items returned %08lx\n", Status);
1180         goto end;
1181     }
1182 
1183     Status = STATUS_SUCCESS;
1184 
1185     Vcb->need_write = true;
1186 
1187 end:
1188     if (NT_SUCCESS(Status)) {
1189         Status = do_write(Vcb, NULL);
1190         if (!NT_SUCCESS(Status))
1191             ERR("do_write returned %08lx\n", Status);
1192     }
1193 
1194     if (NT_SUCCESS(Status))
1195         clear_rollback(&rollback);
1196     else
1197         do_rollback(Vcb, &rollback);
1198 
1199     free_trees(Vcb);
1200 
1201     ExReleaseResourceLite(&Vcb->tree_lock);
1202 
1203     while (!IsListEmpty(&items)) {
1204         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1205 
1206         while (!IsListEmpty(&mr->refs)) {
1207             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1208 
1209             ExFreePool(ref);
1210         }
1211 
1212         if (mr->data)
1213             ExFreePool(mr->data);
1214 
1215         ExFreePool(mr);
1216     }
1217 
1218     return Status;
1219 }
1220 
1221 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1222                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1223     NTSTATUS Status;
1224     LIST_ENTRY* le;
1225     KEY searchkey;
1226     traverse_ptr tp;
1227     root* r = NULL;
1228     metadata_reloc* mr;
1229     uint64_t last_tree = 0;
1230     data_reloc_ref* ref;
1231 
1232     le = Vcb->roots.Flink;
1233     while (le != &Vcb->roots) {
1234         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1235 
1236         if (r2->id == edr->root) {
1237             r = r2;
1238             break;
1239         }
1240 
1241         le = le->Flink;
1242     }
1243 
1244     if (!r) {
1245         ERR("could not find subvol %I64x\n", edr->root);
1246         return STATUS_INTERNAL_ERROR;
1247     }
1248 
1249     searchkey.obj_id = edr->objid;
1250     searchkey.obj_type = TYPE_EXTENT_DATA;
1251     searchkey.offset = 0;
1252 
1253     Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1254     if (!NT_SUCCESS(Status)) {
1255         ERR("find_item returned %08lx\n", Status);
1256         return Status;
1257     }
1258 
1259     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1260         traverse_ptr tp2;
1261 
1262         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1263             tp = tp2;
1264         else {
1265             ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1266             return STATUS_INTERNAL_ERROR;
1267         }
1268     }
1269 
1270     ref = NULL;
1271 
1272     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1273         traverse_ptr tp2;
1274 
1275         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1276             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1277 
1278             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1279                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1280 
1281                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1282                     if (ref && last_tree == tp.tree->header.address)
1283                         ref->edr.count++;
1284                     else {
1285                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1286                         if (!ref) {
1287                             ERR("out of memory\n");
1288                             return STATUS_INSUFFICIENT_RESOURCES;
1289                         }
1290 
1291                         ref->type = TYPE_EXTENT_DATA_REF;
1292                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1293                         ref->edr.count = 1;
1294 
1295                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1296                         if (!NT_SUCCESS(Status)) {
1297                             ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1298                             ExFreePool(ref);
1299                             return Status;
1300                         }
1301 
1302                         last_tree = tp.tree->header.address;
1303                         ref->parent = mr;
1304 
1305                         InsertTailList(&dr->refs, &ref->list_entry);
1306                     }
1307                 }
1308             }
1309         }
1310 
1311         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1312             tp = tp2;
1313         else
1314             break;
1315     }
1316 
1317     return STATUS_SUCCESS;
1318 }
1319 
1320 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1321                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1322     NTSTATUS Status;
1323     data_reloc* dr;
1324     EXTENT_ITEM* ei;
1325     uint16_t len;
1326     uint64_t inline_rc;
1327     uint8_t* ptr;
1328 
1329     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1330     if (!dr) {
1331         ERR("out of memory\n");
1332         return STATUS_INSUFFICIENT_RESOURCES;
1333     }
1334 
1335     dr->address = tp->item->key.obj_id;
1336     dr->size = tp->item->key.offset;
1337     dr->ei = (EXTENT_ITEM*)tp->item->data;
1338     InitializeListHead(&dr->refs);
1339 
1340     Status = delete_tree_item(Vcb, tp);
1341     if (!NT_SUCCESS(Status)) {
1342         ERR("delete_tree_item returned %08lx\n", Status);
1343         return Status;
1344     }
1345 
1346     if (!c)
1347         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1348 
1349     if (c) {
1350         acquire_chunk_lock(c, Vcb);
1351 
1352         c->used -= tp->item->key.offset;
1353 
1354         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1355 
1356         release_chunk_lock(c, Vcb);
1357     }
1358 
1359     ei = (EXTENT_ITEM*)tp->item->data;
1360     inline_rc = 0;
1361 
1362     len = tp->item->size - sizeof(EXTENT_ITEM);
1363     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1364 
1365     while (len > 0) {
1366         uint8_t secttype = *ptr;
1367         uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1368 
1369         len--;
1370 
1371         if (sectlen > len) {
1372             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1373             return STATUS_INTERNAL_ERROR;
1374         }
1375 
1376         if (sectlen == 0) {
1377             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1378             return STATUS_INTERNAL_ERROR;
1379         }
1380 
1381         if (secttype == TYPE_EXTENT_DATA_REF) {
1382             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1383 
1384             inline_rc += edr->count;
1385 
1386             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1387             if (!NT_SUCCESS(Status)) {
1388                 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1389                 return Status;
1390             }
1391         } else if (secttype == TYPE_SHARED_DATA_REF) {
1392             metadata_reloc* mr;
1393             data_reloc_ref* ref;
1394 
1395             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1396             if (!ref) {
1397                 ERR("out of memory\n");
1398                 return STATUS_INSUFFICIENT_RESOURCES;
1399             }
1400 
1401             ref->type = TYPE_SHARED_DATA_REF;
1402             RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1403             inline_rc += ref->sdr.count;
1404 
1405             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1406             if (!NT_SUCCESS(Status)) {
1407                 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1408                 ExFreePool(ref);
1409                 return Status;
1410             }
1411 
1412             ref->parent = mr;
1413 
1414             InsertTailList(&dr->refs, &ref->list_entry);
1415         } else {
1416             ERR("unexpected tree type %x\n", secttype);
1417             return STATUS_INTERNAL_ERROR;
1418         }
1419 
1420 
1421         len -= sectlen;
1422         ptr += sizeof(uint8_t) + sectlen;
1423     }
1424 
1425     if (inline_rc < ei->refcount) { // look for non-inline entries
1426         traverse_ptr tp2 = *tp, next_tp;
1427 
1428         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1429             tp2 = next_tp;
1430 
1431             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1432                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1433                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1434                     if (!NT_SUCCESS(Status)) {
1435                         ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1436                         return Status;
1437                     }
1438 
1439                     Status = delete_tree_item(Vcb, &tp2);
1440                     if (!NT_SUCCESS(Status)) {
1441                         ERR("delete_tree_item returned %08lx\n", Status);
1442                         return Status;
1443                     }
1444                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1445                     metadata_reloc* mr;
1446                     data_reloc_ref* ref;
1447 
1448                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1449                     if (!ref) {
1450                         ERR("out of memory\n");
1451                         return STATUS_INSUFFICIENT_RESOURCES;
1452                     }
1453 
1454                     ref->type = TYPE_SHARED_DATA_REF;
1455                     ref->sdr.offset = tp2.item->key.offset;
1456                     ref->sdr.count = *((uint32_t*)tp2.item->data);
1457 
1458                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1459                     if (!NT_SUCCESS(Status)) {
1460                         ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1461                         ExFreePool(ref);
1462                         return Status;
1463                     }
1464 
1465                     ref->parent = mr;
1466                     InsertTailList(&dr->refs, &ref->list_entry);
1467 
1468                     Status = delete_tree_item(Vcb, &tp2);
1469                     if (!NT_SUCCESS(Status)) {
1470                         ERR("delete_tree_item returned %08lx\n", Status);
1471                         return Status;
1472                     }
1473                 }
1474             } else
1475                 break;
1476         }
1477     }
1478 
1479     InsertTailList(items, &dr->list_entry);
1480 
1481     return STATUS_SUCCESS;
1482 }
1483 
1484 static void sort_data_reloc_refs(data_reloc* dr) {
1485     LIST_ENTRY newlist, *le;
1486 
1487     if (IsListEmpty(&dr->refs))
1488         return;
1489 
1490     // insertion sort
1491 
1492     InitializeListHead(&newlist);
1493 
1494     while (!IsListEmpty(&dr->refs)) {
1495         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1496         bool inserted = false;
1497 
1498         if (ref->type == TYPE_EXTENT_DATA_REF)
1499             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1500         else if (ref->type == TYPE_SHARED_DATA_REF)
1501             ref->hash = ref->parent->new_address;
1502 
1503         le = newlist.Flink;
1504         while (le != &newlist) {
1505             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1506 
1507             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1508                 InsertHeadList(le->Blink, &ref->list_entry);
1509                 inserted = true;
1510                 break;
1511             }
1512 
1513             le = le->Flink;
1514         }
1515 
1516         if (!inserted)
1517             InsertTailList(&newlist, &ref->list_entry);
1518     }
1519 
1520     le = newlist.Flink;
1521     while (le != &newlist) {
1522         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1523 
1524         if (le->Flink != &newlist) {
1525             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1526 
1527             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1528                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1529                 RemoveEntryList(&ref2->list_entry);
1530                 ref->edr.count += ref2->edr.count;
1531                 ExFreePool(ref2);
1532                 continue;
1533             }
1534         }
1535 
1536         le = le->Flink;
1537     }
1538 
1539     newlist.Flink->Blink = &dr->refs;
1540     newlist.Blink->Flink = &dr->refs;
1541     dr->refs.Flink = newlist.Flink;
1542     dr->refs.Blink = newlist.Blink;
1543 }
1544 
1545 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1546     NTSTATUS Status;
1547     LIST_ENTRY* le;
1548     uint64_t rc = 0;
1549     uint16_t inline_len;
1550     bool all_inline = true;
1551     data_reloc_ref* first_noninline = NULL;
1552     EXTENT_ITEM* ei;
1553     uint8_t* ptr;
1554 
1555     inline_len = sizeof(EXTENT_ITEM);
1556 
1557     sort_data_reloc_refs(dr);
1558 
1559     le = dr->refs.Flink;
1560     while (le != &dr->refs) {
1561         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1562         uint16_t extlen = 0;
1563 
1564         if (ref->type == TYPE_EXTENT_DATA_REF) {
1565             extlen += sizeof(EXTENT_DATA_REF);
1566             rc += ref->edr.count;
1567         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1568             extlen += sizeof(SHARED_DATA_REF);
1569             rc++;
1570         }
1571 
1572         if (all_inline) {
1573             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1574                 all_inline = false;
1575                 first_noninline = ref;
1576             } else
1577                 inline_len += extlen + 1;
1578         }
1579 
1580         le = le->Flink;
1581     }
1582 
1583     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1584     if (!ei) {
1585         ERR("out of memory\n");
1586         return STATUS_INSUFFICIENT_RESOURCES;
1587     }
1588 
1589     ei->refcount = rc;
1590     ei->generation = dr->ei->generation;
1591     ei->flags = dr->ei->flags;
1592     ptr = (uint8_t*)&ei[1];
1593 
1594     le = dr->refs.Flink;
1595     while (le != &dr->refs) {
1596         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1597 
1598         if (ref == first_noninline)
1599             break;
1600 
1601         *ptr = ref->type;
1602         ptr++;
1603 
1604         if (ref->type == TYPE_EXTENT_DATA_REF) {
1605             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1606 
1607             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1608 
1609             ptr += sizeof(EXTENT_DATA_REF);
1610         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1611             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1612 
1613             sdr->offset = ref->parent->new_address;
1614             sdr->count = ref->sdr.count;
1615 
1616             ptr += sizeof(SHARED_DATA_REF);
1617         }
1618 
1619         le = le->Flink;
1620     }
1621 
1622     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1623     if (!NT_SUCCESS(Status)) {
1624         ERR("insert_tree_item returned %08lx\n", Status);
1625         return Status;
1626     }
1627 
1628     if (!all_inline) {
1629         le = &first_noninline->list_entry;
1630 
1631         while (le != &dr->refs) {
1632             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1633 
1634             if (ref->type == TYPE_EXTENT_DATA_REF) {
1635                 EXTENT_DATA_REF* edr;
1636 
1637                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1638                 if (!edr) {
1639                     ERR("out of memory\n");
1640                     return STATUS_INSUFFICIENT_RESOURCES;
1641                 }
1642 
1643                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1644 
1645                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1646                 if (!NT_SUCCESS(Status)) {
1647                     ERR("insert_tree_item returned %08lx\n", Status);
1648                     return Status;
1649                 }
1650             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1651                 uint32_t* sdr;
1652 
1653                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1654                 if (!sdr) {
1655                     ERR("out of memory\n");
1656                     return STATUS_INSUFFICIENT_RESOURCES;
1657                 }
1658 
1659                 *sdr = ref->sdr.count;
1660 
1661                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1662                 if (!NT_SUCCESS(Status)) {
1663                     ERR("insert_tree_item returned %08lx\n", Status);
1664                     return Status;
1665                 }
1666             }
1667 
1668             le = le->Flink;
1669         }
1670     }
1671 
1672     return STATUS_SUCCESS;
1673 }
1674 
1675 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1676     KEY searchkey;
1677     traverse_ptr tp;
1678     NTSTATUS Status;
1679     bool b;
1680     LIST_ENTRY items, metadata_items, rollback, *le;
1681     uint64_t loaded = 0, num_loaded = 0;
1682     chunk* newchunk = NULL;
1683     uint8_t* data = NULL;
1684 
1685     TRACE("chunk %I64x\n", c->offset);
1686 
1687     InitializeListHead(&rollback);
1688     InitializeListHead(&items);
1689     InitializeListHead(&metadata_items);
1690 
1691     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1692 
1693     searchkey.obj_id = c->offset;
1694     searchkey.obj_type = TYPE_EXTENT_ITEM;
1695     searchkey.offset = 0xffffffffffffffff;
1696 
1697     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1698     if (!NT_SUCCESS(Status)) {
1699         ERR("find_item returned %08lx\n", Status);
1700         goto end;
1701     }
1702 
1703     do {
1704         traverse_ptr next_tp;
1705 
1706         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1707             break;
1708 
1709         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1710             bool tree = false;
1711 
1712             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1713                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1714 
1715                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1716                     tree = true;
1717             }
1718 
1719             if (!tree) {
1720                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1721 
1722                 if (!NT_SUCCESS(Status)) {
1723                     ERR("add_data_reloc returned %08lx\n", Status);
1724                     goto end;
1725                 }
1726 
1727                 loaded += tp.item->key.offset;
1728                 num_loaded++;
1729 
1730                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1731                     break;
1732             }
1733         }
1734 
1735         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1736 
1737         if (b)
1738             tp = next_tp;
1739     } while (b);
1740 
1741     if (IsListEmpty(&items)) {
1742         *changed = false;
1743         Status = STATUS_SUCCESS;
1744         goto end;
1745     } else
1746         *changed = true;
1747 
1748     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1749     if (!data) {
1750         ERR("out of memory\n");
1751         Status = STATUS_INSUFFICIENT_RESOURCES;
1752         goto end;
1753     }
1754 
1755     le = items.Flink;
1756     while (le != &items) {
1757         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1758         bool done = false;
1759         LIST_ENTRY* le2;
1760         void* csum;
1761         RTL_BITMAP bmp;
1762         ULONG* bmparr;
1763         ULONG bmplen, runlength, index, lastoff;
1764 
1765         if (newchunk) {
1766             acquire_chunk_lock(newchunk, Vcb);
1767 
1768             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1769                 newchunk->used += dr->size;
1770                 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1771                 done = true;
1772             }
1773 
1774             release_chunk_lock(newchunk, Vcb);
1775         }
1776 
1777         if (!done) {
1778             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1779 
1780             le2 = Vcb->chunks.Flink;
1781             while (le2 != &Vcb->chunks) {
1782                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1783 
1784                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1785                     acquire_chunk_lock(c2, Vcb);
1786 
1787                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1788                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1789                             c2->used += dr->size;
1790                             space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1791                             release_chunk_lock(c2, Vcb);
1792                             newchunk = c2;
1793                             done = true;
1794                             break;
1795                         }
1796                     }
1797 
1798                     release_chunk_lock(c2, Vcb);
1799                 }
1800 
1801                 le2 = le2->Flink;
1802             }
1803 
1804             // allocate new chunk if necessary
1805             if (!done) {
1806                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1807 
1808                 if (!NT_SUCCESS(Status)) {
1809                     ERR("alloc_chunk returned %08lx\n", Status);
1810                     ExReleaseResourceLite(&Vcb->chunk_lock);
1811                     goto end;
1812                 }
1813 
1814                 acquire_chunk_lock(newchunk, Vcb);
1815 
1816                 newchunk->balance_num = Vcb->balance.balance_num;
1817 
1818                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1819                     release_chunk_lock(newchunk, Vcb);
1820                     ExReleaseResourceLite(&Vcb->chunk_lock);
1821                     ERR("could not find address in new chunk\n");
1822                     Status = STATUS_DISK_FULL;
1823                     goto end;
1824                 } else {
1825                     newchunk->used += dr->size;
1826                     space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1827                 }
1828 
1829                 release_chunk_lock(newchunk, Vcb);
1830             }
1831 
1832             ExReleaseResourceLite(&Vcb->chunk_lock);
1833         }
1834 
1835         dr->newchunk = newchunk;
1836 
1837         bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1838 
1839         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1840         if (!bmparr) {
1841             ERR("out of memory\n");
1842             Status = STATUS_INSUFFICIENT_RESOURCES;
1843             goto end;
1844         }
1845 
1846         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * Vcb->csum_size / Vcb->superblock.sector_size), ALLOC_TAG);
1847         if (!csum) {
1848             ERR("out of memory\n");
1849             ExFreePool(bmparr);
1850             Status = STATUS_INSUFFICIENT_RESOURCES;
1851             goto end;
1852         }
1853 
1854         RtlInitializeBitMap(&bmp, bmparr, bmplen);
1855         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1856 
1857         searchkey.obj_id = EXTENT_CSUM_ID;
1858         searchkey.obj_type = TYPE_EXTENT_CSUM;
1859         searchkey.offset = dr->address;
1860 
1861         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1862         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1863             ERR("find_item returned %08lx\n", Status);
1864             ExFreePool(csum);
1865             ExFreePool(bmparr);
1866             goto end;
1867         }
1868 
1869         if (Status != STATUS_NOT_FOUND) {
1870             do {
1871                 traverse_ptr next_tp;
1872 
1873                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1874                     if (tp.item->key.offset >= dr->address + dr->size)
1875                         break;
1876                     else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size) >= dr->address) {
1877                         uint64_t cs = max(dr->address, tp.item->key.offset);
1878                         uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / Vcb->csum_size));
1879 
1880                         RtlCopyMemory((uint8_t*)csum + ((cs - dr->address) * Vcb->csum_size / Vcb->superblock.sector_size),
1881                                       tp.item->data + ((cs - tp.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size),
1882                                       (ULONG)((ce - cs) * Vcb->csum_size / Vcb->superblock.sector_size));
1883 
1884                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1885 
1886                         if (ce == dr->address + dr->size)
1887                             break;
1888                     }
1889                 }
1890 
1891                 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1892                     tp = next_tp;
1893                 else
1894                     break;
1895             } while (true);
1896         }
1897 
1898         lastoff = 0;
1899         runlength = RtlFindFirstRunClear(&bmp, &index);
1900 
1901         while (runlength != 0) {
1902             if (index >= bmplen)
1903                 break;
1904 
1905             if (index + runlength >= bmplen) {
1906                 runlength = bmplen - index;
1907 
1908                 if (runlength == 0)
1909                     break;
1910             }
1911 
1912             if (index > lastoff) {
1913                 ULONG off = lastoff;
1914                 ULONG size = index - lastoff;
1915 
1916                 // handle no csum run
1917                 do {
1918                     ULONG rl;
1919 
1920                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1921                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1922                     else
1923                         rl = size;
1924 
1925                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1926                                        c, NULL, NULL, 0, false, NormalPagePriority);
1927                     if (!NT_SUCCESS(Status)) {
1928                         ERR("read_data returned %08lx\n", Status);
1929                         ExFreePool(csum);
1930                         ExFreePool(bmparr);
1931                         goto end;
1932                     }
1933 
1934                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1935                                                  NULL, newchunk, false, 0, NormalPagePriority);
1936                     if (!NT_SUCCESS(Status)) {
1937                         ERR("write_data_complete returned %08lx\n", Status);
1938                         ExFreePool(csum);
1939                         ExFreePool(bmparr);
1940                         goto end;
1941                     }
1942 
1943                     size -= rl;
1944                     off += rl;
1945                 } while (size > 0);
1946             }
1947 
1948             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1949             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1950 
1951             // handle csum run
1952             do {
1953                 ULONG rl;
1954 
1955                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1956                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1957                 else
1958                     rl = runlength;
1959 
1960                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size,
1961                                    (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1962                 if (!NT_SUCCESS(Status)) {
1963                     ERR("read_data returned %08lx\n", Status);
1964                     ExFreePool(csum);
1965                     ExFreePool(bmparr);
1966                     goto end;
1967                 }
1968 
1969                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1970                                              NULL, newchunk, false, 0, NormalPagePriority);
1971                 if (!NT_SUCCESS(Status)) {
1972                     ERR("write_data_complete returned %08lx\n", Status);
1973                     ExFreePool(csum);
1974                     ExFreePool(bmparr);
1975                     goto end;
1976                 }
1977 
1978                 runlength -= rl;
1979                 index += rl;
1980             } while (runlength > 0);
1981 
1982             lastoff = index;
1983             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1984         }
1985 
1986         ExFreePool(csum);
1987         ExFreePool(bmparr);
1988 
1989         // handle final nocsum run
1990         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1991             ULONG off = lastoff;
1992             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1993 
1994             do {
1995                 ULONG rl;
1996 
1997                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1998                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1999                 else
2000                     rl = size;
2001 
2002                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2003                                    c, NULL, NULL, 0, false, NormalPagePriority);
2004                 if (!NT_SUCCESS(Status)) {
2005                     ERR("read_data returned %08lx\n", Status);
2006                     goto end;
2007                 }
2008 
2009                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2010                                              NULL, newchunk, false, 0, NormalPagePriority);
2011                 if (!NT_SUCCESS(Status)) {
2012                     ERR("write_data_complete returned %08lx\n", Status);
2013                     goto end;
2014                 }
2015 
2016                 size -= rl;
2017                 off += rl;
2018             } while (size > 0);
2019         }
2020 
2021         le = le->Flink;
2022     }
2023 
2024     ExFreePool(data);
2025     data = NULL;
2026 
2027     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2028     if (!NT_SUCCESS(Status)) {
2029         ERR("write_metadata_items returned %08lx\n", Status);
2030         goto end;
2031     }
2032 
2033     le = items.Flink;
2034     while (le != &items) {
2035         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2036 
2037         Status = add_data_reloc_extent_item(Vcb, dr);
2038         if (!NT_SUCCESS(Status)) {
2039             ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2040             goto end;
2041         }
2042 
2043         le = le->Flink;
2044     }
2045 
2046     le = c->changed_extents.Flink;
2047     while (le != &c->changed_extents) {
2048         LIST_ENTRY *le2, *le3;
2049         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2050 
2051         le3 = le->Flink;
2052 
2053         le2 = items.Flink;
2054         while (le2 != &items) {
2055             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2056 
2057             if (ce->address == dr->address) {
2058                 ce->address = dr->new_address;
2059                 RemoveEntryList(&ce->list_entry);
2060                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2061                 break;
2062             }
2063 
2064             le2 = le2->Flink;
2065         }
2066 
2067         le = le3;
2068     }
2069 
2070     Status = STATUS_SUCCESS;
2071 
2072     Vcb->need_write = true;
2073 
2074 end:
2075     if (NT_SUCCESS(Status)) {
2076         // update extents in cache inodes before we flush
2077         le = Vcb->chunks.Flink;
2078         while (le != &Vcb->chunks) {
2079             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2080 
2081             if (c2->cache) {
2082                 LIST_ENTRY* le2;
2083 
2084                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2085 
2086                 le2 = c2->cache->extents.Flink;
2087                 while (le2 != &c2->cache->extents) {
2088                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2089 
2090                     if (!ext->ignore) {
2091                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2092                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2093 
2094                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2095                                 LIST_ENTRY* le3 = items.Flink;
2096                                 while (le3 != &items) {
2097                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2098 
2099                                     if (ed2->address == dr->address) {
2100                                         ed2->address = dr->new_address;
2101                                         break;
2102                                     }
2103 
2104                                     le3 = le3->Flink;
2105                                 }
2106                             }
2107                         }
2108                     }
2109 
2110                     le2 = le2->Flink;
2111                 }
2112 
2113                 ExReleaseResourceLite(c2->cache->Header.Resource);
2114             }
2115 
2116             le = le->Flink;
2117         }
2118 
2119         Status = do_write(Vcb, NULL);
2120         if (!NT_SUCCESS(Status))
2121             ERR("do_write returned %08lx\n", Status);
2122     }
2123 
2124     if (NT_SUCCESS(Status)) {
2125         clear_rollback(&rollback);
2126 
2127         // update open FCBs
2128         // FIXME - speed this up(?)
2129 
2130         le = Vcb->all_fcbs.Flink;
2131         while (le != &Vcb->all_fcbs) {
2132             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2133             LIST_ENTRY* le2;
2134 
2135             ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2136 
2137             le2 = fcb->extents.Flink;
2138             while (le2 != &fcb->extents) {
2139                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2140 
2141                 if (!ext->ignore) {
2142                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2143                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2144 
2145                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2146                             LIST_ENTRY* le3 = items.Flink;
2147                             while (le3 != &items) {
2148                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2149 
2150                                 if (ed2->address == dr->address) {
2151                                     ed2->address = dr->new_address;
2152                                     break;
2153                                 }
2154 
2155                                 le3 = le3->Flink;
2156                             }
2157                         }
2158                     }
2159                 }
2160 
2161                 le2 = le2->Flink;
2162             }
2163 
2164             ExReleaseResourceLite(fcb->Header.Resource);
2165 
2166             le = le->Flink;
2167         }
2168     } else
2169         do_rollback(Vcb, &rollback);
2170 
2171     free_trees(Vcb);
2172 
2173     ExReleaseResourceLite(&Vcb->tree_lock);
2174 
2175     if (data)
2176         ExFreePool(data);
2177 
2178     while (!IsListEmpty(&items)) {
2179         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2180 
2181         while (!IsListEmpty(&dr->refs)) {
2182             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2183 
2184             ExFreePool(ref);
2185         }
2186 
2187         ExFreePool(dr);
2188     }
2189 
2190     while (!IsListEmpty(&metadata_items)) {
2191         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2192 
2193         while (!IsListEmpty(&mr->refs)) {
2194             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2195 
2196             ExFreePool(ref);
2197         }
2198 
2199         if (mr->data)
2200             ExFreePool(mr->data);
2201 
2202         ExFreePool(mr);
2203     }
2204 
2205     return Status;
2206 }
2207 
2208 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2209     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2210         return BLOCK_FLAG_RAID0;
2211     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2212         return BLOCK_FLAG_RAID1;
2213     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2214         return BLOCK_FLAG_DUPLICATE;
2215     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2216         return BLOCK_FLAG_RAID10;
2217     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2218         return BLOCK_FLAG_RAID5;
2219     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2220         return BLOCK_FLAG_RAID6;
2221     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2222         return BLOCK_FLAG_RAID1C3;
2223     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2224         return BLOCK_FLAG_RAID1C4;
2225     else
2226         return BLOCK_FLAG_SINGLE;
2227 }
2228 
2229 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2230     btrfs_balance_opts* opts;
2231 
2232     opts = &Vcb->balance.opts[sort];
2233 
2234     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2235         return false;
2236 
2237     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2238         uint64_t type = get_chunk_dup_type(c);
2239 
2240         if (!(type & opts->profiles))
2241             return false;
2242     }
2243 
2244     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2245         uint16_t i;
2246         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2247         bool b = false;
2248 
2249         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2250             if (cis[i].dev_id == opts->devid) {
2251                 b = true;
2252                 break;
2253             }
2254         }
2255 
2256         if (!b)
2257             return false;
2258     }
2259 
2260     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2261         uint16_t i, factor;
2262         uint64_t physsize;
2263         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2264         bool b = false;
2265 
2266         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2267             factor = c->chunk_item->num_stripes;
2268         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2269             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2270         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2271             factor = c->chunk_item->num_stripes - 1;
2272         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2273             factor = c->chunk_item->num_stripes - 2;
2274         else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
2275             factor = 1;
2276 
2277         physsize = c->chunk_item->size / factor;
2278 
2279         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2280             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2281                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2282                 b = true;
2283                 break;
2284             }
2285         }
2286 
2287         if (!b)
2288             return false;
2289     }
2290 
2291     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2292         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2293             return false;
2294     }
2295 
2296     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2297         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2298             return false;
2299     }
2300 
2301     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2302         uint64_t usage = c->used * 100 / c->chunk_item->size;
2303 
2304         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2305         if (c->used > 0 && usage == 0)
2306             usage = 1;
2307 
2308         if (usage < opts->usage_start || usage > opts->usage_end)
2309             return false;
2310     }
2311 
2312     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2313         uint64_t type = get_chunk_dup_type(c);
2314 
2315         if (type == opts->convert)
2316             return false;
2317     }
2318 
2319     return true;
2320 }
2321 
2322 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2323     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2324         args->profiles = opts->profiles;
2325         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2326     }
2327 
2328     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2329         if (args->usage_start == 0) {
2330             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2331             args->usage_start = opts->usage_start;
2332             args->usage_end = opts->usage_end;
2333         } else {
2334             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2335             args->usage = opts->usage_end;
2336         }
2337     }
2338 
2339     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2340         args->devid = opts->devid;
2341         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2342     }
2343 
2344     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2345         args->drange_start = opts->drange_start;
2346         args->drange_end = opts->drange_end;
2347         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2348     }
2349 
2350     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2351         args->vrange_start = opts->vrange_start;
2352         args->vrange_end = opts->vrange_end;
2353         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2354     }
2355 
2356     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2357         args->convert = opts->convert;
2358         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2359 
2360         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2361             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2362     }
2363 
2364     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2365         if (args->limit_start == 0) {
2366             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2367             args->limit_start = (uint32_t)opts->limit_start;
2368             args->limit_end = (uint32_t)opts->limit_end;
2369         } else {
2370             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2371             args->limit = opts->limit_end;
2372         }
2373     }
2374 
2375     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2376         args->stripes_start = opts->stripes_start;
2377         args->stripes_end = opts->stripes_end;
2378         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2379     }
2380 }
2381 
2382 static NTSTATUS add_balance_item(device_extension* Vcb) {
2383     KEY searchkey;
2384     traverse_ptr tp;
2385     NTSTATUS Status;
2386     BALANCE_ITEM* bi;
2387 
2388     searchkey.obj_id = BALANCE_ITEM_ID;
2389     searchkey.obj_type = TYPE_TEMP_ITEM;
2390     searchkey.offset = 0;
2391 
2392     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2393 
2394     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2395     if (!NT_SUCCESS(Status)) {
2396         ERR("find_item returned %08lx\n", Status);
2397         goto end;
2398     }
2399 
2400     if (!keycmp(tp.item->key, searchkey)) {
2401         Status = delete_tree_item(Vcb, &tp);
2402         if (!NT_SUCCESS(Status)) {
2403             ERR("delete_tree_item returned %08lx\n", Status);
2404             goto end;
2405         }
2406     }
2407 
2408     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2409     if (!bi) {
2410         ERR("out of memory\n");
2411         Status = STATUS_INSUFFICIENT_RESOURCES;
2412         goto end;
2413     }
2414 
2415     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2416 
2417     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2418         bi->flags |= BALANCE_FLAGS_DATA;
2419         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2420     }
2421 
2422     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2423         bi->flags |= BALANCE_FLAGS_METADATA;
2424         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2425     }
2426 
2427     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2428         bi->flags |= BALANCE_FLAGS_SYSTEM;
2429         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2430     }
2431 
2432     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2433     if (!NT_SUCCESS(Status)) {
2434         ERR("insert_tree_item returned %08lx\n", Status);
2435         ExFreePool(bi);
2436         goto end;
2437     }
2438 
2439     Status = STATUS_SUCCESS;
2440 
2441 end:
2442     if (NT_SUCCESS(Status)) {
2443         Status = do_write(Vcb, NULL);
2444         if (!NT_SUCCESS(Status))
2445             ERR("do_write returned %08lx\n", Status);
2446     }
2447 
2448     free_trees(Vcb);
2449 
2450     ExReleaseResourceLite(&Vcb->tree_lock);
2451 
2452     return Status;
2453 }
2454 
2455 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2456     KEY searchkey;
2457     traverse_ptr tp;
2458     NTSTATUS Status;
2459 
2460     searchkey.obj_id = BALANCE_ITEM_ID;
2461     searchkey.obj_type = TYPE_TEMP_ITEM;
2462     searchkey.offset = 0;
2463 
2464     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2465 
2466     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2467     if (!NT_SUCCESS(Status)) {
2468         ERR("find_item returned %08lx\n", Status);
2469         goto end;
2470     }
2471 
2472     if (!keycmp(tp.item->key, searchkey)) {
2473         Status = delete_tree_item(Vcb, &tp);
2474         if (!NT_SUCCESS(Status)) {
2475             ERR("delete_tree_item returned %08lx\n", Status);
2476             goto end;
2477         }
2478 
2479         Status = do_write(Vcb, NULL);
2480         if (!NT_SUCCESS(Status)) {
2481             ERR("do_write returned %08lx\n", Status);
2482             goto end;
2483         }
2484 
2485         free_trees(Vcb);
2486     }
2487 
2488     Status = STATUS_SUCCESS;
2489 
2490 end:
2491     ExReleaseResourceLite(&Vcb->tree_lock);
2492 
2493     return Status;
2494 }
2495 
2496 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2497     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2498 
2499     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2500         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2501         opts->profiles = args->profiles;
2502     }
2503 
2504     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2505         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2506 
2507         opts->usage_start = 0;
2508         opts->usage_end = (uint8_t)args->usage;
2509     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2510         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2511 
2512         opts->usage_start = (uint8_t)args->usage_start;
2513         opts->usage_end = (uint8_t)args->usage_end;
2514     }
2515 
2516     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2517         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2518         opts->devid = args->devid;
2519     }
2520 
2521     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2522         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2523         opts->drange_start = args->drange_start;
2524         opts->drange_end = args->drange_end;
2525     }
2526 
2527     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2528         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2529         opts->vrange_start = args->vrange_start;
2530         opts->vrange_end = args->vrange_end;
2531     }
2532 
2533     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2534         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2535 
2536         opts->limit_start = 0;
2537         opts->limit_end = args->limit;
2538     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2539         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2540 
2541         opts->limit_start = args->limit_start;
2542         opts->limit_end = args->limit_end;
2543     }
2544 
2545     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2546         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2547 
2548         opts->stripes_start = (uint16_t)args->stripes_start;
2549         opts->stripes_end = (uint16_t)args->stripes_end;
2550     }
2551 
2552     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2553         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2554         opts->convert = args->convert;
2555 
2556         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2557             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2558     }
2559 }
2560 
2561 static NTSTATUS remove_superblocks(device* dev) {
2562     NTSTATUS Status;
2563     superblock* sb;
2564     int i = 0;
2565 
2566     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2567     if (!sb) {
2568         ERR("out of memory\n");
2569         return STATUS_INSUFFICIENT_RESOURCES;
2570     }
2571 
2572     RtlZeroMemory(sb, sizeof(superblock));
2573 
2574     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2575         Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2576 
2577         if (!NT_SUCCESS(Status)) {
2578             ExFreePool(sb);
2579             return Status;
2580         }
2581 
2582         i++;
2583     }
2584 
2585     ExFreePool(sb);
2586 
2587     return STATUS_SUCCESS;
2588 }
2589 
2590 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2591     KEY searchkey;
2592     traverse_ptr tp;
2593     NTSTATUS Status;
2594     LIST_ENTRY* le;
2595     volume_device_extension* vde;
2596 
2597     if (Vcb->need_write) {
2598         Status = do_write(Vcb, NULL);
2599 
2600         if (!NT_SUCCESS(Status))
2601             ERR("do_write returned %08lx\n", Status);
2602     } else
2603         Status = STATUS_SUCCESS;
2604 
2605     free_trees(Vcb);
2606 
2607     if (!NT_SUCCESS(Status))
2608         return Status;
2609 
2610     // remove entry in chunk tree
2611 
2612     searchkey.obj_id = 1;
2613     searchkey.obj_type = TYPE_DEV_ITEM;
2614     searchkey.offset = dev->devitem.dev_id;
2615 
2616     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2617     if (!NT_SUCCESS(Status)) {
2618         ERR("find_item returned %08lx\n", Status);
2619         return Status;
2620     }
2621 
2622     if (!keycmp(searchkey, tp.item->key)) {
2623         Status = delete_tree_item(Vcb, &tp);
2624 
2625         if (!NT_SUCCESS(Status)) {
2626             ERR("delete_tree_item returned %08lx\n", Status);
2627             return Status;
2628         }
2629     }
2630 
2631     // remove stats entry in device tree
2632 
2633     searchkey.obj_id = 0;
2634     searchkey.obj_type = TYPE_DEV_STATS;
2635     searchkey.offset = dev->devitem.dev_id;
2636 
2637     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2638     if (!NT_SUCCESS(Status)) {
2639         ERR("find_item returned %08lx\n", Status);
2640         return Status;
2641     }
2642 
2643     if (!keycmp(searchkey, tp.item->key)) {
2644         Status = delete_tree_item(Vcb, &tp);
2645 
2646         if (!NT_SUCCESS(Status)) {
2647             ERR("delete_tree_item returned %08lx\n", Status);
2648             return Status;
2649         }
2650     }
2651 
2652     // update superblock
2653 
2654     Vcb->superblock.num_devices--;
2655     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2656     Vcb->devices_loaded--;
2657 
2658     RemoveEntryList(&dev->list_entry);
2659 
2660     // flush
2661 
2662     Status = do_write(Vcb, NULL);
2663     if (!NT_SUCCESS(Status))
2664         ERR("do_write returned %08lx\n", Status);
2665 
2666     free_trees(Vcb);
2667 
2668     if (!NT_SUCCESS(Status))
2669         return Status;
2670 
2671     if (!dev->readonly && dev->devobj) {
2672         Status = remove_superblocks(dev);
2673         if (!NT_SUCCESS(Status))
2674             WARN("remove_superblocks returned %08lx\n", Status);
2675     }
2676 
2677     // remove entry in volume list
2678 
2679     vde = Vcb->vde;
2680 
2681     if (dev->devobj) {
2682         pdo_device_extension* pdode = vde->pdode;
2683 
2684         ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2685 
2686         le = pdode->children.Flink;
2687         while (le != &pdode->children) {
2688             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2689 
2690             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2691                 PFILE_OBJECT FileObject;
2692                 PDEVICE_OBJECT mountmgr;
2693                 UNICODE_STRING mmdevpath;
2694 
2695                 pdode->children_loaded--;
2696 
2697                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2698                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2699                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2700                     if (!NT_SUCCESS(Status))
2701                         ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2702                     else {
2703                         MOUNTDEV_NAME mdn;
2704 
2705                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2706                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2707                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2708                         else {
2709                             MOUNTDEV_NAME* mdn2;
2710                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2711 
2712                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2713                             if (!mdn2)
2714                                 ERR("out of memory\n");
2715                             else {
2716                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2717                                 if (!NT_SUCCESS(Status))
2718                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2719                                 else {
2720                                     UNICODE_STRING name;
2721 
2722                                     name.Buffer = mdn2->Name;
2723                                     name.Length = name.MaximumLength = mdn2->NameLength;
2724 
2725                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2726                                     if (!NT_SUCCESS(Status))
2727                                         WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2728                                 }
2729 
2730                                 ExFreePool(mdn2);
2731                             }
2732                         }
2733 
2734                         ObDereferenceObject(FileObject);
2735                     }
2736                 }
2737 
2738                 ExFreePool(vc->pnp_name.Buffer);
2739                 RemoveEntryList(&vc->list_entry);
2740                 ExFreePool(vc);
2741 
2742                 ObDereferenceObject(vc->fileobj);
2743 
2744                 break;
2745             }
2746 
2747             le = le->Flink;
2748         }
2749 
2750         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2751             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2752 
2753             le = pdode->children.Flink;
2754             while (le != &pdode->children) {
2755                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2756 
2757                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2758                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2759                     break;
2760                 }
2761 
2762                 le = le->Flink;
2763             }
2764         }
2765 
2766         pdode->num_children = Vcb->superblock.num_devices;
2767 
2768         ExReleaseResourceLite(&pdode->child_lock);
2769 
2770         // free dev
2771 
2772         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2773             trim_whole_device(dev);
2774     }
2775 
2776     while (!IsListEmpty(&dev->space)) {
2777         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2778         space* s = CONTAINING_RECORD(le2, space, list_entry);
2779 
2780         ExFreePool(s);
2781     }
2782 
2783     ExFreePool(dev);
2784 
2785     if (Vcb->trim) {
2786         Vcb->trim = false;
2787 
2788         le = Vcb->devices.Flink;
2789         while (le != &Vcb->devices) {
2790             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2791 
2792             if (dev2->trim) {
2793                 Vcb->trim = true;
2794                 break;
2795             }
2796 
2797             le = le->Flink;
2798         }
2799     }
2800 
2801     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2802 
2803     return STATUS_SUCCESS;
2804 }
2805 
2806 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2807     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2808     DEVICE_DATA_SET_RANGE* ranges;
2809     ULONG datalen, i;
2810     KEY searchkey;
2811     traverse_ptr tp;
2812     NTSTATUS Status;
2813     bool b;
2814     uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2815     LIST_ENTRY* le;
2816 
2817     dev->num_trim_entries = 0;
2818 
2819     searchkey.obj_id = dev->devitem.dev_id;
2820     searchkey.obj_type = TYPE_DEV_EXTENT;
2821     searchkey.offset = 0;
2822 
2823     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2824     if (!NT_SUCCESS(Status)) {
2825         ERR("find_item returned %08lx\n", Status);
2826         return;
2827     }
2828 
2829     do {
2830         traverse_ptr next_tp;
2831 
2832         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2833             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2834                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2835 
2836                 if (tp.item->key.offset > lastoff)
2837                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2838 
2839                 lastoff = tp.item->key.offset + de->length;
2840             } else {
2841                 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2842                 return;
2843             }
2844         }
2845 
2846         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2847 
2848         if (b) {
2849             tp = next_tp;
2850             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2851                 break;
2852         }
2853     } while (b);
2854 
2855     if (lastoff < dev->devitem.num_bytes)
2856         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2857 
2858     if (dev->num_trim_entries == 0)
2859         return;
2860 
2861     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2862 
2863     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2864     if (!dmdsa) {
2865         ERR("out of memory\n");
2866         goto end;
2867     }
2868 
2869     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2870     dmdsa->Action = DeviceDsmAction_Trim;
2871     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2872     dmdsa->ParameterBlockOffset = 0;
2873     dmdsa->ParameterBlockLength = 0;
2874     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2875     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2876 
2877     ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2878 
2879     i = 0;
2880     le = dev->trim_list.Flink;
2881     while (le != &dev->trim_list) {
2882         space* s = CONTAINING_RECORD(le, space, list_entry);
2883 
2884         ranges[i].StartingOffset = s->address;
2885         ranges[i].LengthInBytes = s->size;
2886         i++;
2887 
2888         le = le->Flink;
2889     }
2890 
2891     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2892     if (!NT_SUCCESS(Status))
2893         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2894 
2895     ExFreePool(dmdsa);
2896 
2897 end:
2898     while (!IsListEmpty(&dev->trim_list)) {
2899         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2900         ExFreePool(s);
2901     }
2902 
2903     dev->num_trim_entries = 0;
2904 }
2905 
2906 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2907     NTSTATUS Status;
2908     bool changed;
2909     LIST_ENTRY* le;
2910     chunk* rc;
2911 
2912     // FIXME - allow with metadata chunks?
2913 
2914     while (true) {
2915         rc = NULL;
2916 
2917         ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2918 
2919         ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2920 
2921         // choose the least-used chunk we haven't looked at yet
2922         le = Vcb->chunks.Flink;
2923         while (le != &Vcb->chunks) {
2924             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2925 
2926             // FIXME - skip full-size chunks over e.g. 90% full?
2927             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2928                 rc = c;
2929 
2930             le = le->Flink;
2931         }
2932 
2933         ExReleaseResourceLite(&Vcb->chunk_lock);
2934 
2935         if (!rc) {
2936             ExReleaseResourceLite(&Vcb->tree_lock);
2937             break;
2938         }
2939 
2940         if (rc->list_entry_balance.Flink) {
2941             RemoveEntryList(&rc->list_entry_balance);
2942             Vcb->balance.chunks_left--;
2943         }
2944 
2945         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2946         rc->reloc = true;
2947 
2948         ExReleaseResourceLite(&Vcb->tree_lock);
2949 
2950         do {
2951             changed = false;
2952 
2953             Status = balance_data_chunk(Vcb, rc, &changed);
2954             if (!NT_SUCCESS(Status)) {
2955                 ERR("balance_data_chunk returned %08lx\n", Status);
2956                 Vcb->balance.status = Status;
2957                 rc->list_entry_balance.Flink = NULL;
2958                 rc->reloc = false;
2959                 return Status;
2960             }
2961 
2962             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2963 
2964             if (Vcb->readonly)
2965                 Vcb->balance.stopping = true;
2966 
2967             if (Vcb->balance.stopping)
2968                 return STATUS_SUCCESS;
2969         } while (changed);
2970 
2971         rc->list_entry_balance.Flink = NULL;
2972 
2973         rc->changed = true;
2974         rc->space_changed = true;
2975         rc->balance_num = Vcb->balance.balance_num;
2976 
2977         Status = do_write(Vcb, NULL);
2978         if (!NT_SUCCESS(Status)) {
2979             ERR("do_write returned %08lx\n", Status);
2980             return Status;
2981         }
2982 
2983         free_trees(Vcb);
2984     }
2985 
2986     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2987 
2988     Status = alloc_chunk(Vcb, flags, &rc, true);
2989 
2990     ExReleaseResourceLite(&Vcb->chunk_lock);
2991 
2992     if (NT_SUCCESS(Status)) {
2993         *newchunk = rc;
2994         return Status;
2995     } else {
2996         ERR("alloc_chunk returned %08lx\n", Status);
2997         return Status;
2998     }
2999 }
3000 
3001 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
3002     LIST_ENTRY* le;
3003 
3004     while (!IsListEmpty(&dev->space)) {
3005         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3006 
3007         ExFreePool(s);
3008     }
3009 
3010     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3011 
3012     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3013 
3014     le = Vcb->chunks.Flink;
3015     while (le != &Vcb->chunks) {
3016         uint16_t n;
3017         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3018         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3019 
3020         for (n = 0; n < c->chunk_item->num_stripes; n++) {
3021             uint64_t stripe_size = 0;
3022 
3023             if (cis[n].dev_id == dev->devitem.dev_id) {
3024                 if (stripe_size == 0) {
3025                     uint16_t factor;
3026 
3027                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3028                         factor = c->chunk_item->num_stripes;
3029                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3030                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3031                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3032                         factor = c->chunk_item->num_stripes - 1;
3033                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3034                         factor = c->chunk_item->num_stripes - 2;
3035                     else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3036                         factor = 1;
3037 
3038                     stripe_size = c->chunk_item->size / factor;
3039                 }
3040 
3041                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3042             }
3043         }
3044 
3045         le = le->Flink;
3046     }
3047 
3048     return STATUS_SUCCESS;
3049 }
3050 
3051 _Function_class_(KSTART_ROUTINE)
3052 void __stdcall balance_thread(void* context) {
3053     device_extension* Vcb = (device_extension*)context;
3054     LIST_ENTRY chunks;
3055     LIST_ENTRY* le;
3056     uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3057     uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3058     NTSTATUS Status;
3059 
3060     Vcb->balance.balance_num++;
3061 
3062     Vcb->balance.stopping = false;
3063     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3064 
3065     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3066         old_data_flags = Vcb->data_flags;
3067         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3068 
3069         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3070     }
3071 
3072     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073         old_metadata_flags = Vcb->metadata_flags;
3074         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3075     }
3076 
3077     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3078         old_system_flags = Vcb->system_flags;
3079         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3080     }
3081 
3082     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3083         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3084             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3085         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3086             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3087     }
3088 
3089     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3090     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3091 
3092     InitializeListHead(&chunks);
3093 
3094     // FIXME - what are we supposed to do with limit_start?
3095 
3096     if (!Vcb->readonly) {
3097         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3098             Status = add_balance_item(Vcb);
3099             if (!NT_SUCCESS(Status)) {
3100                 ERR("add_balance_item returned %08lx\n", Status);
3101                 Vcb->balance.status = Status;
3102                 goto end;
3103             }
3104         } else {
3105             if (Vcb->need_write) {
3106                 Status = do_write(Vcb, NULL);
3107 
3108                 free_trees(Vcb);
3109 
3110                 if (!NT_SUCCESS(Status)) {
3111                     ERR("do_write returned %08lx\n", Status);
3112                     Vcb->balance.status = Status;
3113                     goto end;
3114                 }
3115             }
3116         }
3117     }
3118 
3119     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3120 
3121     if (Vcb->balance.stopping)
3122         goto end;
3123 
3124     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3125 
3126     le = Vcb->chunks.Flink;
3127     while (le != &Vcb->chunks) {
3128         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3129         uint8_t sort;
3130 
3131         acquire_chunk_lock(c, Vcb);
3132 
3133         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3134             sort = BALANCE_OPTS_DATA;
3135         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3136             sort = BALANCE_OPTS_METADATA;
3137         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3138             sort = BALANCE_OPTS_SYSTEM;
3139         else {
3140             ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3141             release_chunk_lock(c, Vcb);
3142             break;
3143         }
3144 
3145         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3146             should_balance_chunk(Vcb, sort, c)) {
3147             InsertTailList(&chunks, &c->list_entry_balance);
3148 
3149             num_chunks[sort]++;
3150             Vcb->balance.total_chunks++;
3151             Vcb->balance.chunks_left++;
3152         } else if (sort == BALANCE_OPTS_METADATA)
3153             okay_metadata_chunks++;
3154         else if (sort == BALANCE_OPTS_DATA)
3155             okay_data_chunks++;
3156         else if (sort == BALANCE_OPTS_SYSTEM)
3157             okay_system_chunks++;
3158 
3159         if (!c->cache_loaded) {
3160             Status = load_cache_chunk(Vcb, c, NULL);
3161 
3162             if (!NT_SUCCESS(Status)) {
3163                 ERR("load_cache_chunk returned %08lx\n", Status);
3164                 Vcb->balance.status = Status;
3165                 release_chunk_lock(c, Vcb);
3166                 ExReleaseResourceLite(&Vcb->chunk_lock);
3167                 goto end;
3168             }
3169         }
3170 
3171         release_chunk_lock(c, Vcb);
3172 
3173         le = le->Flink;
3174     }
3175 
3176     ExReleaseResourceLite(&Vcb->chunk_lock);
3177 
3178     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3179     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3180         bool consolidated = false;
3181         chunk* c;
3182 
3183         if (okay_metadata_chunks == 0) {
3184             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3185 
3186             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3187             if (NT_SUCCESS(Status))
3188                 c->balance_num = Vcb->balance.balance_num;
3189             else if (Status != STATUS_DISK_FULL || consolidated) {
3190                 ERR("alloc_chunk returned %08lx\n", Status);
3191                 ExReleaseResourceLite(&Vcb->chunk_lock);
3192                 Vcb->balance.status = Status;
3193                 goto end;
3194             }
3195 
3196             ExReleaseResourceLite(&Vcb->chunk_lock);
3197 
3198             if (Status == STATUS_DISK_FULL) {
3199                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3200                 if (!NT_SUCCESS(Status)) {
3201                     ERR("try_consolidation returned %08lx\n", Status);
3202                     Vcb->balance.status = Status;
3203                     goto end;
3204                 } else
3205                     c->balance_num = Vcb->balance.balance_num;
3206 
3207                 consolidated = true;
3208 
3209                 if (Vcb->balance.stopping)
3210                     goto end;
3211             }
3212         }
3213 
3214         if (okay_data_chunks == 0) {
3215             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3216 
3217             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3218             if (NT_SUCCESS(Status))
3219                 c->balance_num = Vcb->balance.balance_num;
3220             else if (Status != STATUS_DISK_FULL || consolidated) {
3221                 ERR("alloc_chunk returned %08lx\n", Status);
3222                 ExReleaseResourceLite(&Vcb->chunk_lock);
3223                 Vcb->balance.status = Status;
3224                 goto end;
3225             }
3226 
3227             ExReleaseResourceLite(&Vcb->chunk_lock);
3228 
3229             if (Status == STATUS_DISK_FULL) {
3230                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3231                 if (!NT_SUCCESS(Status)) {
3232                     ERR("try_consolidation returned %08lx\n", Status);
3233                     Vcb->balance.status = Status;
3234                     goto end;
3235                 } else
3236                     c->balance_num = Vcb->balance.balance_num;
3237 
3238                 consolidated = true;
3239 
3240                 if (Vcb->balance.stopping)
3241                     goto end;
3242             }
3243         }
3244 
3245         if (okay_system_chunks == 0) {
3246             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3247 
3248             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3249             if (NT_SUCCESS(Status))
3250                 c->balance_num = Vcb->balance.balance_num;
3251             else if (Status != STATUS_DISK_FULL || consolidated) {
3252                 ERR("alloc_chunk returned %08lx\n", Status);
3253                 ExReleaseResourceLite(&Vcb->chunk_lock);
3254                 Vcb->balance.status = Status;
3255                 goto end;
3256             }
3257 
3258             ExReleaseResourceLite(&Vcb->chunk_lock);
3259 
3260             if (Status == STATUS_DISK_FULL) {
3261                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3262                 if (!NT_SUCCESS(Status)) {
3263                     ERR("try_consolidation returned %08lx\n", Status);
3264                     Vcb->balance.status = Status;
3265                     goto end;
3266                 } else
3267                     c->balance_num = Vcb->balance.balance_num;
3268 
3269                 consolidated = true;
3270 
3271                 if (Vcb->balance.stopping)
3272                     goto end;
3273             }
3274         }
3275     }
3276 
3277     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3278 
3279     le = chunks.Flink;
3280     while (le != &chunks) {
3281         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3282 
3283         c->reloc = true;
3284 
3285         le = le->Flink;
3286     }
3287 
3288     ExReleaseResourceLite(&Vcb->chunk_lock);
3289 
3290     // do data chunks before metadata
3291     le = chunks.Flink;
3292     while (le != &chunks) {
3293         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3294         LIST_ENTRY* le2 = le->Flink;
3295 
3296         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3297             bool changed;
3298 
3299             do {
3300                 changed = false;
3301 
3302                 Status = balance_data_chunk(Vcb, c, &changed);
3303                 if (!NT_SUCCESS(Status)) {
3304                     ERR("balance_data_chunk returned %08lx\n", Status);
3305                     Vcb->balance.status = Status;
3306                     goto end;
3307                 }
3308 
3309                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3310 
3311                 if (Vcb->readonly)
3312                     Vcb->balance.stopping = true;
3313 
3314                 if (Vcb->balance.stopping)
3315                     break;
3316             } while (changed);
3317 
3318             c->changed = true;
3319             c->space_changed = true;
3320         }
3321 
3322         if (Vcb->balance.stopping)
3323             goto end;
3324 
3325         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3326             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3327             RemoveEntryList(&c->list_entry_balance);
3328             c->list_entry_balance.Flink = NULL;
3329 
3330             Vcb->balance.chunks_left--;
3331         }
3332 
3333         le = le2;
3334     }
3335 
3336     // do metadata chunks
3337     while (!IsListEmpty(&chunks)) {
3338         chunk* c;
3339         bool changed;
3340 
3341         le = RemoveHeadList(&chunks);
3342         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3343 
3344         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3345             do {
3346                 Status = balance_metadata_chunk(Vcb, c, &changed);
3347                 if (!NT_SUCCESS(Status)) {
3348                     ERR("balance_metadata_chunk returned %08lx\n", Status);
3349                     Vcb->balance.status = Status;
3350                     goto end;
3351                 }
3352 
3353                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3354 
3355                 if (Vcb->readonly)
3356                     Vcb->balance.stopping = true;
3357 
3358                 if (Vcb->balance.stopping)
3359                     break;
3360             } while (changed);
3361 
3362             c->changed = true;
3363             c->space_changed = true;
3364         }
3365 
3366         if (Vcb->balance.stopping)
3367             break;
3368 
3369         c->list_entry_balance.Flink = NULL;
3370 
3371         Vcb->balance.chunks_left--;
3372     }
3373 
3374 end:
3375     if (!Vcb->readonly) {
3376         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3377             le = chunks.Flink;
3378             while (le != &chunks) {
3379                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3380                 c->reloc = false;
3381 
3382                 le = le->Flink;
3383                 c->list_entry_balance.Flink = NULL;
3384             }
3385 
3386             if (old_data_flags != 0)
3387                 Vcb->data_flags = old_data_flags;
3388 
3389             if (old_metadata_flags != 0)
3390                 Vcb->metadata_flags = old_metadata_flags;
3391 
3392             if (old_system_flags != 0)
3393                 Vcb->system_flags = old_system_flags;
3394         }
3395 
3396         if (Vcb->balance.removing) {
3397             device* dev = NULL;
3398 
3399             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3400 
3401             le = Vcb->devices.Flink;
3402             while (le != &Vcb->devices) {
3403                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3404 
3405                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3406                     dev = dev2;
3407                     break;
3408                 }
3409 
3410                 le = le->Flink;
3411             }
3412 
3413             if (dev) {
3414                 if (Vcb->balance.chunks_left == 0) {
3415                     Status = finish_removing_device(Vcb, dev);
3416 
3417                     if (!NT_SUCCESS(Status)) {
3418                         ERR("finish_removing_device returned %08lx\n", Status);
3419                         dev->reloc = false;
3420                     }
3421                 } else
3422                     dev->reloc = false;
3423             }
3424 
3425             ExReleaseResourceLite(&Vcb->tree_lock);
3426         } else if (Vcb->balance.shrinking) {
3427             device* dev = NULL;
3428 
3429             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3430 
3431             le = Vcb->devices.Flink;
3432             while (le != &Vcb->devices) {
3433                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3434 
3435                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3436                     dev = dev2;
3437                     break;
3438                 }
3439 
3440                 le = le->Flink;
3441             }
3442 
3443             if (!dev) {
3444                 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3445                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3446             }
3447 
3448             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3449                 if (dev) {
3450                     Status = regenerate_space_list(Vcb, dev);
3451                     if (!NT_SUCCESS(Status))
3452                         WARN("regenerate_space_list returned %08lx\n", Status);
3453                 }
3454             } else {
3455                 uint64_t old_size;
3456 
3457                 old_size = dev->devitem.num_bytes;
3458                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3459 
3460                 Status = update_dev_item(Vcb, dev, NULL);
3461                 if (!NT_SUCCESS(Status)) {
3462                     ERR("update_dev_item returned %08lx\n", Status);
3463                     dev->devitem.num_bytes = old_size;
3464                     Vcb->balance.status = Status;
3465 
3466                     Status = regenerate_space_list(Vcb, dev);
3467                     if (!NT_SUCCESS(Status))
3468                         WARN("regenerate_space_list returned %08lx\n", Status);
3469                 } else {
3470                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3471 
3472                     Status = do_write(Vcb, NULL);
3473                     if (!NT_SUCCESS(Status))
3474                         ERR("do_write returned %08lx\n", Status);
3475 
3476                     free_trees(Vcb);
3477                 }
3478             }
3479 
3480             ExReleaseResourceLite(&Vcb->tree_lock);
3481 
3482             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3483                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3484         } else {
3485             Status = remove_balance_item(Vcb);
3486             if (!NT_SUCCESS(Status)) {
3487                 ERR("remove_balance_item returned %08lx\n", Status);
3488                 goto end;
3489             }
3490         }
3491 
3492         if (Vcb->trim && !Vcb->options.no_trim) {
3493             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3494 
3495             le = Vcb->devices.Flink;
3496             while (le != &Vcb->devices) {
3497                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3498 
3499                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3500                     trim_unalloc_space(Vcb, dev2);
3501 
3502                 le = le->Flink;
3503             }
3504 
3505             ExReleaseResourceLite(&Vcb->tree_lock);
3506         }
3507     }
3508 
3509     ZwClose(Vcb->balance.thread);
3510     Vcb->balance.thread = NULL;
3511 
3512     KeSetEvent(&Vcb->balance.finished, 0, false);
3513 }
3514 
3515 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3516     NTSTATUS Status;
3517     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3518     OBJECT_ATTRIBUTES oa;
3519     uint8_t i;
3520 
3521     if (length < sizeof(btrfs_start_balance) || !data)
3522         return STATUS_INVALID_PARAMETER;
3523 
3524     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3525         return STATUS_PRIVILEGE_NOT_HELD;
3526 
3527     if (Vcb->locked) {
3528         WARN("cannot start balance while locked\n");
3529         return STATUS_DEVICE_NOT_READY;
3530     }
3531 
3532     if (Vcb->scrub.thread) {
3533         WARN("cannot start balance while scrub running\n");
3534         return STATUS_DEVICE_NOT_READY;
3535     }
3536 
3537     if (Vcb->balance.thread) {
3538         WARN("balance already running\n");
3539         return STATUS_DEVICE_NOT_READY;
3540     }
3541 
3542     if (Vcb->readonly)
3543         return STATUS_MEDIA_WRITE_PROTECTED;
3544 
3545     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3546         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3547         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3548         return STATUS_SUCCESS;
3549 
3550     for (i = 0; i < 3; i++) {
3551         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3552             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3553                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3554                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 |
3555                                          BLOCK_FLAG_RAID1C4;
3556 
3557                 if (bsb->opts[i].profiles == 0)
3558                     return STATUS_INVALID_PARAMETER;
3559             }
3560 
3561             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3562                 if (bsb->opts[i].devid == 0)
3563                     return STATUS_INVALID_PARAMETER;
3564             }
3565 
3566             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3567                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3568                     return STATUS_INVALID_PARAMETER;
3569             }
3570 
3571             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3572                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3573                     return STATUS_INVALID_PARAMETER;
3574             }
3575 
3576             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3577                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3578                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3579 
3580                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3581                     return STATUS_INVALID_PARAMETER;
3582             }
3583 
3584             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3585                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3586                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3587 
3588                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3589                     return STATUS_INVALID_PARAMETER;
3590             }
3591 
3592             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3593                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3594                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3595 
3596                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3597                     return STATUS_INVALID_PARAMETER;
3598             }
3599 
3600             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3601                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3602                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3603                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3604                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 &&
3605                     bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3606                     return STATUS_INVALID_PARAMETER;
3607             }
3608         }
3609     }
3610 
3611     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3612     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3613     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3614 
3615     Vcb->balance.paused = false;
3616     Vcb->balance.removing = false;
3617     Vcb->balance.shrinking = false;
3618     Vcb->balance.status = STATUS_SUCCESS;
3619     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3620 
3621     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3622 
3623     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3624     if (!NT_SUCCESS(Status)) {
3625         ERR("PsCreateSystemThread returned %08lx\n", Status);
3626         return Status;
3627     }
3628 
3629     return STATUS_SUCCESS;
3630 }
3631 
3632 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3633     KEY searchkey;
3634     traverse_ptr tp;
3635     NTSTATUS Status;
3636     BALANCE_ITEM* bi;
3637     OBJECT_ATTRIBUTES oa;
3638     int i;
3639 
3640     searchkey.obj_id = BALANCE_ITEM_ID;
3641     searchkey.obj_type = TYPE_TEMP_ITEM;
3642     searchkey.offset = 0;
3643 
3644     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3645     if (!NT_SUCCESS(Status)) {
3646         ERR("find_item returned %08lx\n", Status);
3647         return Status;
3648     }
3649 
3650     if (keycmp(tp.item->key, searchkey)) {
3651         TRACE("no balance item found\n");
3652         return STATUS_NOT_FOUND;
3653     }
3654 
3655     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3656         WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3657              tp.item->size, sizeof(BALANCE_ITEM));
3658         return STATUS_INTERNAL_ERROR;
3659     }
3660 
3661     bi = (BALANCE_ITEM*)tp.item->data;
3662 
3663     if (bi->flags & BALANCE_FLAGS_DATA)
3664         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3665 
3666     if (bi->flags & BALANCE_FLAGS_METADATA)
3667         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3668 
3669     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3670         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3671 
3672     // do the heuristics that Linux driver does
3673 
3674     for (i = 0; i < 3; i++) {
3675         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3676             // if converting, don't redo chunks already done
3677 
3678             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3680 
3681             // don't balance chunks more than 90% filled - presumably these
3682             // have already been done
3683 
3684             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3685                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3686             ) {
3687                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3688                 Vcb->balance.opts[i].usage_start = 0;
3689                 Vcb->balance.opts[i].usage_end = 90;
3690             }
3691         }
3692     }
3693 
3694     if (Vcb->readonly || Vcb->options.skip_balance)
3695         Vcb->balance.paused = true;
3696     else
3697         Vcb->balance.paused = false;
3698 
3699     Vcb->balance.removing = false;
3700     Vcb->balance.shrinking = false;
3701     Vcb->balance.status = STATUS_SUCCESS;
3702     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3703 
3704     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3705 
3706     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3707     if (!NT_SUCCESS(Status)) {
3708         ERR("PsCreateSystemThread returned %08lx\n", Status);
3709         return Status;
3710     }
3711 
3712     return STATUS_SUCCESS;
3713 }
3714 
3715 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3716     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3717 
3718     if (length < sizeof(btrfs_query_balance) || !data)
3719         return STATUS_INVALID_PARAMETER;
3720 
3721     if (!Vcb->balance.thread) {
3722         bqb->status = BTRFS_BALANCE_STOPPED;
3723 
3724         if (!NT_SUCCESS(Vcb->balance.status)) {
3725             bqb->status |= BTRFS_BALANCE_ERROR;
3726             bqb->error = Vcb->balance.status;
3727         }
3728 
3729         return STATUS_SUCCESS;
3730     }
3731 
3732     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3733 
3734     if (Vcb->balance.removing)
3735         bqb->status |= BTRFS_BALANCE_REMOVAL;
3736 
3737     if (Vcb->balance.shrinking)
3738         bqb->status |= BTRFS_BALANCE_SHRINKING;
3739 
3740     if (!NT_SUCCESS(Vcb->balance.status))
3741         bqb->status |= BTRFS_BALANCE_ERROR;
3742 
3743     bqb->chunks_left = Vcb->balance.chunks_left;
3744     bqb->total_chunks = Vcb->balance.total_chunks;
3745     bqb->error = Vcb->balance.status;
3746     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3747     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3748     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3749 
3750     return STATUS_SUCCESS;
3751 }
3752 
3753 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3754     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3755         return STATUS_PRIVILEGE_NOT_HELD;
3756 
3757     if (!Vcb->balance.thread)
3758         return STATUS_DEVICE_NOT_READY;
3759 
3760     if (Vcb->balance.paused)
3761         return STATUS_DEVICE_NOT_READY;
3762 
3763     Vcb->balance.paused = true;
3764     KeClearEvent(&Vcb->balance.event);
3765 
3766     return STATUS_SUCCESS;
3767 }
3768 
3769 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3770     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3771         return STATUS_PRIVILEGE_NOT_HELD;
3772 
3773     if (!Vcb->balance.thread)
3774         return STATUS_DEVICE_NOT_READY;
3775 
3776     if (!Vcb->balance.paused)
3777         return STATUS_DEVICE_NOT_READY;
3778 
3779     if (Vcb->readonly)
3780         return STATUS_MEDIA_WRITE_PROTECTED;
3781 
3782     Vcb->balance.paused = false;
3783     KeSetEvent(&Vcb->balance.event, 0, false);
3784 
3785     return STATUS_SUCCESS;
3786 }
3787 
3788 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3789     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3790         return STATUS_PRIVILEGE_NOT_HELD;
3791 
3792     if (!Vcb->balance.thread)
3793         return STATUS_DEVICE_NOT_READY;
3794 
3795     Vcb->balance.paused = false;
3796     Vcb->balance.stopping = true;
3797     Vcb->balance.status = STATUS_SUCCESS;
3798     KeSetEvent(&Vcb->balance.event, 0, false);
3799 
3800     return STATUS_SUCCESS;
3801 }
3802 
3803 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3804     uint64_t devid;
3805     LIST_ENTRY* le;
3806     device* dev = NULL;
3807     NTSTATUS Status;
3808     int i;
3809     uint64_t num_rw_devices;
3810     OBJECT_ATTRIBUTES oa;
3811 
3812     TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3813 
3814     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3815         return STATUS_PRIVILEGE_NOT_HELD;
3816 
3817     if (length < sizeof(uint64_t))
3818         return STATUS_INVALID_PARAMETER;
3819 
3820     devid = *(uint64_t*)data;
3821 
3822     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3823 
3824     if (Vcb->readonly) {
3825         ExReleaseResourceLite(&Vcb->tree_lock);
3826         return STATUS_MEDIA_WRITE_PROTECTED;
3827     }
3828 
3829     num_rw_devices = 0;
3830 
3831     le = Vcb->devices.Flink;
3832     while (le != &Vcb->devices) {
3833         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3834 
3835         if (dev2->devitem.dev_id == devid)
3836             dev = dev2;
3837 
3838         if (!dev2->readonly)
3839             num_rw_devices++;
3840 
3841         le = le->Flink;
3842     }
3843 
3844     if (!dev) {
3845         ExReleaseResourceLite(&Vcb->tree_lock);
3846         WARN("device %I64x not found\n", devid);
3847         return STATUS_NOT_FOUND;
3848     }
3849 
3850     if (!dev->readonly) {
3851         if (num_rw_devices == 1) {
3852             ExReleaseResourceLite(&Vcb->tree_lock);
3853             WARN("not removing last non-readonly device\n");
3854             return STATUS_INVALID_PARAMETER;
3855         }
3856 
3857         if (num_rw_devices == 4 &&
3858             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3859              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3860              (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3861             )
3862         ) {
3863             ExReleaseResourceLite(&Vcb->tree_lock);
3864             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3865             return STATUS_CANNOT_DELETE;
3866         }
3867 
3868         if (num_rw_devices == 3 &&
3869             ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) ||
3870             (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3))
3871             ) {
3872             ExReleaseResourceLite(&Vcb->tree_lock);
3873             ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n");
3874             return STATUS_CANNOT_DELETE;
3875         }
3876 
3877         if (num_rw_devices == 2 &&
3878             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3879              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3880         ) {
3881             ExReleaseResourceLite(&Vcb->tree_lock);
3882             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3883             return STATUS_CANNOT_DELETE;
3884         }
3885     }
3886 
3887     ExReleaseResourceLite(&Vcb->tree_lock);
3888 
3889     if (Vcb->balance.thread) {
3890         WARN("balance already running\n");
3891         return STATUS_DEVICE_NOT_READY;
3892     }
3893 
3894     dev->reloc = true;
3895 
3896     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3897 
3898     for (i = 0; i < 3; i++) {
3899         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3900         Vcb->balance.opts[i].devid = devid;
3901     }
3902 
3903     Vcb->balance.paused = false;
3904     Vcb->balance.removing = true;
3905     Vcb->balance.shrinking = false;
3906     Vcb->balance.status = STATUS_SUCCESS;
3907     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3908 
3909     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3910 
3911     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3912     if (!NT_SUCCESS(Status)) {
3913         ERR("PsCreateSystemThread returned %08lx\n", Status);
3914         dev->reloc = false;
3915         return Status;
3916     }
3917 
3918     return STATUS_SUCCESS;
3919 }
3920