xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 318da0c1)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
23     uint64_t address;
24     uint64_t new_address;
25     tree_header* data;
26     EXTENT_ITEM* ei;
27     tree* t;
28     bool system;
29     LIST_ENTRY refs;
30     LIST_ENTRY list_entry;
31 } metadata_reloc;
32 
33 typedef struct {
34     uint8_t type;
35     uint64_t hash;
36 
37     union {
38         TREE_BLOCK_REF tbr;
39         SHARED_BLOCK_REF sbr;
40     };
41 
42     metadata_reloc* parent;
43     bool top;
44     LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46 
47 typedef struct {
48     uint64_t address;
49     uint64_t size;
50     uint64_t new_address;
51     chunk* newchunk;
52     EXTENT_ITEM* ei;
53     LIST_ENTRY refs;
54     LIST_ENTRY list_entry;
55 } data_reloc;
56 
57 typedef struct {
58     uint8_t type;
59     uint64_t hash;
60 
61     union {
62         EXTENT_DATA_REF edr;
63         SHARED_DATA_REF sdr;
64     };
65 
66     metadata_reloc* parent;
67     LIST_ENTRY list_entry;
68 } data_reloc_ref;
69 
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73 
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75 
76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
77                                    bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
78     NTSTATUS Status;
79     metadata_reloc* mr;
80     EXTENT_ITEM* ei;
81     uint16_t len;
82     uint64_t inline_rc;
83     uint8_t* ptr;
84 
85     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
86     if (!mr) {
87         ERR("out of memory\n");
88         return STATUS_INSUFFICIENT_RESOURCES;
89     }
90 
91     mr->address = tp->item->key.obj_id;
92     mr->data = NULL;
93     mr->ei = (EXTENT_ITEM*)tp->item->data;
94     mr->system = false;
95     InitializeListHead(&mr->refs);
96 
97     Status = delete_tree_item(Vcb, tp);
98     if (!NT_SUCCESS(Status)) {
99         ERR("delete_tree_item returned %08x\n", Status);
100         ExFreePool(mr);
101         return Status;
102     }
103 
104     if (!c)
105         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
106 
107     if (c) {
108         acquire_chunk_lock(c, Vcb);
109 
110         c->used -= Vcb->superblock.node_size;
111 
112         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113 
114         release_chunk_lock(c, Vcb);
115     }
116 
117     ei = (EXTENT_ITEM*)tp->item->data;
118     inline_rc = 0;
119 
120     len = tp->item->size - sizeof(EXTENT_ITEM);
121     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
122     if (!skinny) {
123         len -= sizeof(EXTENT_ITEM2);
124         ptr += sizeof(EXTENT_ITEM2);
125     }
126 
127     while (len > 0) {
128         uint8_t secttype = *ptr;
129         uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
130         metadata_reloc_ref* ref;
131 
132         len--;
133 
134         if (sectlen > len) {
135             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136             return STATUS_INTERNAL_ERROR;
137         }
138 
139         if (sectlen == 0) {
140             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141             return STATUS_INTERNAL_ERROR;
142         }
143 
144         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
145         if (!ref) {
146             ERR("out of memory\n");
147             return STATUS_INSUFFICIENT_RESOURCES;
148         }
149 
150         if (secttype == TYPE_TREE_BLOCK_REF) {
151             ref->type = TYPE_TREE_BLOCK_REF;
152             RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
153             inline_rc++;
154         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155             ref->type = TYPE_SHARED_BLOCK_REF;
156             RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
157             inline_rc++;
158         } else {
159             ERR("unexpected tree type %x\n", secttype);
160             ExFreePool(ref);
161             return STATUS_INTERNAL_ERROR;
162         }
163 
164         ref->parent = NULL;
165         ref->top = false;
166         InsertTailList(&mr->refs, &ref->list_entry);
167 
168         len -= sectlen;
169         ptr += sizeof(uint8_t) + sectlen;
170     }
171 
172     if (inline_rc < ei->refcount) { // look for non-inline entries
173         traverse_ptr tp2 = *tp, next_tp;
174 
175         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
176             tp2 = next_tp;
177 
178             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
180                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
181                     if (!ref) {
182                         ERR("out of memory\n");
183                         return STATUS_INSUFFICIENT_RESOURCES;
184                     }
185 
186                     ref->type = TYPE_TREE_BLOCK_REF;
187                     ref->tbr.offset = tp2.item->key.offset;
188                     ref->parent = NULL;
189                     ref->top = false;
190                     InsertTailList(&mr->refs, &ref->list_entry);
191 
192                     Status = delete_tree_item(Vcb, &tp2);
193                     if (!NT_SUCCESS(Status)) {
194                         ERR("delete_tree_item returned %08x\n", Status);
195                         return Status;
196                     }
197                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
198                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
199                     if (!ref) {
200                         ERR("out of memory\n");
201                         return STATUS_INSUFFICIENT_RESOURCES;
202                     }
203 
204                     ref->type = TYPE_SHARED_BLOCK_REF;
205                     ref->sbr.offset = tp2.item->key.offset;
206                     ref->parent = NULL;
207                     ref->top = false;
208                     InsertTailList(&mr->refs, &ref->list_entry);
209 
210                     Status = delete_tree_item(Vcb, &tp2);
211                     if (!NT_SUCCESS(Status)) {
212                         ERR("delete_tree_item returned %08x\n", Status);
213                         return Status;
214                     }
215                 }
216             } else
217                 break;
218         }
219     }
220 
221     InsertTailList(items, &mr->list_entry);
222 
223     if (mr2)
224         *mr2 = mr;
225 
226     return STATUS_SUCCESS;
227 }
228 
229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
230                                           uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
231     LIST_ENTRY* le;
232     KEY searchkey;
233     traverse_ptr tp;
234     bool skinny = false;
235     NTSTATUS Status;
236 
237     le = items->Flink;
238     while (le != items) {
239         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
240 
241         if (mr->address == address) {
242             *mr2 = mr;
243             return STATUS_SUCCESS;
244         }
245 
246         le = le->Flink;
247     }
248 
249     searchkey.obj_id = address;
250     searchkey.obj_type = TYPE_METADATA_ITEM;
251     searchkey.offset = 0xffffffffffffffff;
252 
253     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
254     if (!NT_SUCCESS(Status)) {
255         ERR("find_item returned %08x\n", Status);
256         return Status;
257     }
258 
259     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
260         skinny = true;
261     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262              tp.item->size >= sizeof(EXTENT_ITEM)) {
263         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
264 
265         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266             ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
267             return STATUS_INTERNAL_ERROR;
268         }
269     } else {
270         ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
271         return STATUS_INTERNAL_ERROR;
272     }
273 
274     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275     if (!NT_SUCCESS(Status)) {
276         ERR("add_metadata_reloc returned %08x\n", Status);
277         return Status;
278     }
279 
280     return STATUS_SUCCESS;
281 }
282 
283 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
284     LIST_ENTRY newlist, *le;
285 
286     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287         return;
288 
289     // insertion sort
290 
291     InitializeListHead(&newlist);
292 
293     while (!IsListEmpty(&mr->refs)) {
294         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
295         bool inserted = false;
296 
297         if (ref->type == TYPE_TREE_BLOCK_REF)
298             ref->hash = ref->tbr.offset;
299         else if (ref->type == TYPE_SHARED_BLOCK_REF)
300             ref->hash = ref->parent->new_address;
301 
302         le = newlist.Flink;
303         while (le != &newlist) {
304             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
305 
306             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
307                 InsertHeadList(le->Blink, &ref->list_entry);
308                 inserted = true;
309                 break;
310             }
311 
312             le = le->Flink;
313         }
314 
315         if (!inserted)
316             InsertTailList(&newlist, &ref->list_entry);
317     }
318 
319     newlist.Flink->Blink = &mr->refs;
320     newlist.Blink->Flink = &mr->refs;
321     mr->refs.Flink = newlist.Flink;
322     mr->refs.Blink = newlist.Blink;
323 }
324 
325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
326     NTSTATUS Status;
327     LIST_ENTRY* le;
328     uint64_t rc = 0;
329     uint16_t inline_len;
330     bool all_inline = true;
331     metadata_reloc_ref* first_noninline = NULL;
332     EXTENT_ITEM* ei;
333     uint8_t* ptr;
334 
335     inline_len = sizeof(EXTENT_ITEM);
336     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337         inline_len += sizeof(EXTENT_ITEM2);
338 
339     sort_metadata_reloc_refs(mr);
340 
341     le = mr->refs.Flink;
342     while (le != &mr->refs) {
343         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
344         uint16_t extlen = 0;
345 
346         rc++;
347 
348         if (ref->type == TYPE_TREE_BLOCK_REF)
349             extlen += sizeof(TREE_BLOCK_REF);
350         else if (ref->type == TYPE_SHARED_BLOCK_REF)
351             extlen += sizeof(SHARED_BLOCK_REF);
352 
353         if (all_inline) {
354             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355                 all_inline = false;
356                 first_noninline = ref;
357             } else
358                 inline_len += extlen + 1;
359         }
360 
361         le = le->Flink;
362     }
363 
364     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365     if (!ei) {
366         ERR("out of memory\n");
367         return STATUS_INSUFFICIENT_RESOURCES;
368     }
369 
370     ei->refcount = rc;
371     ei->generation = mr->ei->generation;
372     ei->flags = mr->ei->flags;
373     ptr = (uint8_t*)&ei[1];
374 
375     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377 
378         ei2->firstitem = *(KEY*)&mr->data[1];
379         ei2->level = mr->data->level;
380 
381         ptr += sizeof(EXTENT_ITEM2);
382     }
383 
384     le = mr->refs.Flink;
385     while (le != &mr->refs) {
386         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
387 
388         if (ref == first_noninline)
389             break;
390 
391         *ptr = ref->type;
392         ptr++;
393 
394         if (ref->type == TYPE_TREE_BLOCK_REF) {
395             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
396 
397             tbr->offset = ref->tbr.offset;
398 
399             ptr += sizeof(TREE_BLOCK_REF);
400         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
401             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
402 
403             sbr->offset = ref->parent->new_address;
404 
405             ptr += sizeof(SHARED_BLOCK_REF);
406         }
407 
408         le = le->Flink;
409     }
410 
411     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413     else
414         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415 
416     if (!NT_SUCCESS(Status)) {
417         ERR("insert_tree_item returned %08x\n", Status);
418         ExFreePool(ei);
419         return Status;
420     }
421 
422     if (!all_inline) {
423         le = &first_noninline->list_entry;
424 
425         while (le != &mr->refs) {
426             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
427 
428             if (ref->type == TYPE_TREE_BLOCK_REF) {
429                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430                 if (!NT_SUCCESS(Status)) {
431                     ERR("insert_tree_item returned %08x\n", Status);
432                     return Status;
433                 }
434             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436                 if (!NT_SUCCESS(Status)) {
437                     ERR("insert_tree_item returned %08x\n", Status);
438                     return Status;
439                 }
440             }
441 
442             le = le->Flink;
443         }
444     }
445 
446     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
447         if (mr->data->level > 0) {
448             uint16_t i;
449             internal_node* in = (internal_node*)&mr->data[1];
450 
451             for (i = 0; i < mr->data->num_items; i++) {
452                 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
453 
454                 if (sbrrc > 0) {
455                     SHARED_BLOCK_REF sbr;
456 
457                     sbr.offset = mr->new_address;
458 
459                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460                     if (!NT_SUCCESS(Status)) {
461                         ERR("increase_extent_refcount returned %08x\n", Status);
462                         return Status;
463                     }
464 
465                     sbr.offset = mr->address;
466 
467                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468                                                       sbr.offset, false, NULL);
469                     if (!NT_SUCCESS(Status)) {
470                         ERR("decrease_extent_refcount returned %08x\n", Status);
471                         return Status;
472                     }
473                 }
474             }
475         } else {
476             uint16_t i;
477             leaf_node* ln = (leaf_node*)&mr->data[1];
478 
479             for (i = 0; i < mr->data->num_items; i++) {
480                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
481                     EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
482 
483                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
484                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
485 
486                         if (ed2->size > 0) { // not sparse
487                             uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
488 
489                             if (sdrrc > 0) {
490                                 SHARED_DATA_REF sdr;
491                                 chunk* c;
492 
493                                 sdr.offset = mr->new_address;
494                                 sdr.count = sdrrc;
495 
496                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
497                                 if (!NT_SUCCESS(Status)) {
498                                     ERR("increase_extent_refcount returned %08x\n", Status);
499                                     return Status;
500                                 }
501 
502                                 sdr.offset = mr->address;
503 
504                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
505                                                                   sdr.offset, false, NULL);
506                                 if (!NT_SUCCESS(Status)) {
507                                     ERR("decrease_extent_refcount returned %08x\n", Status);
508                                     return Status;
509                                 }
510 
511                                 c = get_chunk_from_address(Vcb, ed2->address);
512 
513                                 if (c) {
514                                     // check changed_extents
515 
516                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
517 
518                                     le = c->changed_extents.Flink;
519 
520                                     while (le != &c->changed_extents) {
521                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
522 
523                                         if (ce->address == ed2->address) {
524                                             LIST_ENTRY* le2;
525 
526                                             le2 = ce->refs.Flink;
527                                             while (le2 != &ce->refs) {
528                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
529 
530                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531                                                     cer->sdr.offset = mr->new_address;
532                                                     break;
533                                                 }
534 
535                                                 le2 = le2->Flink;
536                                             }
537 
538                                             le2 = ce->old_refs.Flink;
539                                             while (le2 != &ce->old_refs) {
540                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
541 
542                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543                                                     cer->sdr.offset = mr->new_address;
544                                                     break;
545                                                 }
546 
547                                                 le2 = le2->Flink;
548                                             }
549 
550                                             break;
551                                         }
552 
553                                         le = le->Flink;
554                                     }
555 
556                                     ExReleaseResourceLite(&c->changed_extents_lock);
557                                 }
558                             }
559                         }
560                     }
561                 }
562             }
563         }
564     }
565 
566     return STATUS_SUCCESS;
567 }
568 
569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
570                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571     LIST_ENTRY tree_writes, *le;
572     NTSTATUS Status;
573     traverse_ptr tp;
574     uint8_t level, max_level = 0;
575     chunk* newchunk = NULL;
576 
577     InitializeListHead(&tree_writes);
578 
579     le = items->Flink;
580     while (le != items) {
581         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
582         LIST_ENTRY* le2;
583         chunk* pc;
584 
585         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586         if (!mr->data) {
587             ERR("out of memory\n");
588             return STATUS_INSUFFICIENT_RESOURCES;
589         }
590 
591         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
592                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
593         if (!NT_SUCCESS(Status)) {
594             ERR("read_data returned %08x\n", Status);
595             return Status;
596         }
597 
598         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599             mr->system = true;
600 
601         if (data_items && mr->data->level == 0) {
602             le2 = data_items->Flink;
603             while (le2 != data_items) {
604                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
605                 leaf_node* ln = (leaf_node*)&mr->data[1];
606                 uint16_t i;
607 
608                 for (i = 0; i < mr->data->num_items; i++) {
609                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610                         EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
611 
612                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
613                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
614 
615                             if (ed2->address == dr->address)
616                                 ed2->address = dr->new_address;
617                         }
618                     }
619                 }
620 
621                 le2 = le2->Flink;
622             }
623         }
624 
625         if (mr->data->level > max_level)
626             max_level = mr->data->level;
627 
628         le2 = mr->refs.Flink;
629         while (le2 != &mr->refs) {
630             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
631 
632             if (ref->type == TYPE_TREE_BLOCK_REF) {
633                 KEY* firstitem;
634                 root* r = NULL;
635                 LIST_ENTRY* le3;
636                 tree* t;
637 
638                 firstitem = (KEY*)&mr->data[1];
639 
640                 le3 = Vcb->roots.Flink;
641                 while (le3 != &Vcb->roots) {
642                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
643 
644                     if (r2->id == ref->tbr.offset) {
645                         r = r2;
646                         break;
647                     }
648 
649                     le3 = le3->Flink;
650                 }
651 
652                 if (!r) {
653                     ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
654                     return STATUS_INTERNAL_ERROR;
655                 }
656 
657                 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
658                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
659                     ERR("find_item_to_level returned %08x\n", Status);
660                     return Status;
661                 }
662 
663                 t = tp.tree;
664                 while (t && t->header.level < mr->data->level + 1) {
665                     t = t->parent;
666                 }
667 
668                 if (!t)
669                     ref->top = true;
670                 else {
671                     metadata_reloc* mr2;
672 
673                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674                     if (!NT_SUCCESS(Status)) {
675                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
676                         return Status;
677                     }
678 
679                     ref->parent = mr2;
680                 }
681             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
682                 metadata_reloc* mr2;
683 
684                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685                 if (!NT_SUCCESS(Status)) {
686                     ERR("add_metadata_reloc_parent returned %08x\n", Status);
687                     return Status;
688                 }
689 
690                 ref->parent = mr2;
691             }
692 
693             le2 = le2->Flink;
694         }
695 
696         le = le->Flink;
697     }
698 
699     le = items->Flink;
700     while (le != items) {
701         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
702         LIST_ENTRY* le2;
703         uint32_t hash;
704 
705         mr->t = NULL;
706 
707         hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
708 
709         le2 = Vcb->trees_ptrs[hash >> 24];
710 
711         if (le2) {
712             while (le2 != &Vcb->trees_hash) {
713                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714 
715                 if (t->header.address == mr->address) {
716                     mr->t = t;
717                     break;
718                 } else if (t->hash > hash)
719                     break;
720 
721                 le2 = le2->Flink;
722             }
723         }
724 
725         le = le->Flink;
726     }
727 
728     for (level = 0; level <= max_level; level++) {
729         le = items->Flink;
730         while (le != items) {
731             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
732 
733             if (mr->data->level == level) {
734                 bool done = false;
735                 LIST_ENTRY* le2;
736                 tree_write* tw;
737                 uint64_t flags;
738                 tree* t3;
739 
740                 if (mr->system)
741                     flags = Vcb->system_flags;
742                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743                     flags = Vcb->data_flags;
744                 else
745                     flags = Vcb->metadata_flags;
746 
747                 if (newchunk) {
748                     acquire_chunk_lock(newchunk, Vcb);
749 
750                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751                         newchunk->used += Vcb->superblock.node_size;
752                         space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
753                         done = true;
754                     }
755 
756                     release_chunk_lock(newchunk, Vcb);
757                 }
758 
759                 if (!done) {
760                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
761 
762                     le2 = Vcb->chunks.Flink;
763                     while (le2 != &Vcb->chunks) {
764                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
765 
766                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767                             acquire_chunk_lock(c2, Vcb);
768 
769                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
770                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
771                                     c2->used += Vcb->superblock.node_size;
772                                     space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
773                                     release_chunk_lock(c2, Vcb);
774                                     newchunk = c2;
775                                     done = true;
776                                     break;
777                                 }
778                             }
779 
780                             release_chunk_lock(c2, Vcb);
781                         }
782 
783                         le2 = le2->Flink;
784                     }
785 
786                     // allocate new chunk if necessary
787                     if (!done) {
788                         Status = alloc_chunk(Vcb, flags, &newchunk, false);
789 
790                         if (!NT_SUCCESS(Status)) {
791                             ERR("alloc_chunk returned %08x\n", Status);
792                             ExReleaseResourceLite(&Vcb->chunk_lock);
793                             goto end;
794                         }
795 
796                         acquire_chunk_lock(newchunk, Vcb);
797 
798                         newchunk->balance_num = Vcb->balance.balance_num;
799 
800                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801                             release_chunk_lock(newchunk, Vcb);
802                             ExReleaseResourceLite(&Vcb->chunk_lock);
803                             ERR("could not find address in new chunk\n");
804                             Status = STATUS_DISK_FULL;
805                             goto end;
806                         } else {
807                             newchunk->used += Vcb->superblock.node_size;
808                             space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
809                         }
810 
811                         release_chunk_lock(newchunk, Vcb);
812                     }
813 
814                     ExReleaseResourceLite(&Vcb->chunk_lock);
815                 }
816 
817                 // update parents
818                 le2 = mr->refs.Flink;
819                 while (le2 != &mr->refs) {
820                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
821 
822                     if (ref->parent) {
823                         uint16_t i;
824                         internal_node* in = (internal_node*)&ref->parent->data[1];
825 
826                         for (i = 0; i < ref->parent->data->num_items; i++) {
827                             if (in[i].address == mr->address) {
828                                 in[i].address = mr->new_address;
829                                 break;
830                             }
831                         }
832 
833                         if (ref->parent->t) {
834                             LIST_ENTRY* le3;
835 
836                             le3 = ref->parent->t->itemlist.Flink;
837                             while (le3 != &ref->parent->t->itemlist) {
838                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
839 
840                                 if (!td->inserted && td->treeholder.address == mr->address)
841                                     td->treeholder.address = mr->new_address;
842 
843                                 le3 = le3->Flink;
844                             }
845                         }
846                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847                         LIST_ENTRY* le3;
848                         root* r = NULL;
849 
850                         // alter ROOT_ITEM
851 
852                         le3 = Vcb->roots.Flink;
853                         while (le3 != &Vcb->roots) {
854                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
855 
856                             if (r2->id == ref->tbr.offset) {
857                                 r = r2;
858                                 break;
859                             }
860 
861                             le3 = le3->Flink;
862                         }
863 
864                         if (r) {
865                             r->treeholder.address = mr->new_address;
866 
867                             if (r == Vcb->root_root)
868                                 Vcb->superblock.root_tree_addr = mr->new_address;
869                             else if (r == Vcb->chunk_root)
870                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
871                             else if (r->root_item.block_number == mr->address) {
872                                 KEY searchkey;
873                                 ROOT_ITEM* ri;
874 
875                                 r->root_item.block_number = mr->new_address;
876 
877                                 searchkey.obj_id = r->id;
878                                 searchkey.obj_type = TYPE_ROOT_ITEM;
879                                 searchkey.offset = 0xffffffffffffffff;
880 
881                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
882                                 if (!NT_SUCCESS(Status)) {
883                                     ERR("find_item returned %08x\n", Status);
884                                     goto end;
885                                 }
886 
887                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888                                     ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
889                                     Status = STATUS_INTERNAL_ERROR;
890                                     goto end;
891                                 }
892 
893                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
894                                 if (!ri) {
895                                     ERR("out of memory\n");
896                                     Status = STATUS_INSUFFICIENT_RESOURCES;
897                                     goto end;
898                                 }
899 
900                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901 
902                                 Status = delete_tree_item(Vcb, &tp);
903                                 if (!NT_SUCCESS(Status)) {
904                                     ERR("delete_tree_item returned %08x\n", Status);
905                                     goto end;
906                                 }
907 
908                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909                                 if (!NT_SUCCESS(Status)) {
910                                     ERR("insert_tree_item returned %08x\n", Status);
911                                     goto end;
912                                 }
913                             }
914                         }
915                     }
916 
917                     le2 = le2->Flink;
918                 }
919 
920                 mr->data->address = mr->new_address;
921 
922                 t3 = mr->t;
923 
924                 while (t3) {
925                     uint8_t h;
926                     bool inserted;
927                     tree* t4 = NULL;
928 
929                     // check if tree loaded more than once
930                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
931                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932 
933                         if (nt->header.address == t3->header.address)
934                             t4 = nt;
935                     }
936 
937                     t3->header.address = mr->new_address;
938 
939                     h = t3->hash >> 24;
940 
941                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943                             Vcb->trees_ptrs[h] = NULL;
944                         else {
945                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946 
947                             if (t2->hash >> 24 == h)
948                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949                             else
950                                 Vcb->trees_ptrs[h] = NULL;
951                         }
952                     }
953 
954                     RemoveEntryList(&t3->list_entry_hash);
955 
956                     t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
957                     h = t3->hash >> 24;
958 
959                     if (!Vcb->trees_ptrs[h]) {
960                         uint8_t h2 = h;
961 
962                         le2 = Vcb->trees_hash.Flink;
963 
964                         if (h2 > 0) {
965                             h2--;
966                             do {
967                                 if (Vcb->trees_ptrs[h2]) {
968                                     le2 = Vcb->trees_ptrs[h2];
969                                     break;
970                                 }
971 
972                                 h2--;
973                             } while (h2 > 0);
974                         }
975                     } else
976                         le2 = Vcb->trees_ptrs[h];
977 
978                     inserted = false;
979                     while (le2 != &Vcb->trees_hash) {
980                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981 
982                         if (t2->hash >= t3->hash) {
983                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
984                             inserted = true;
985                             break;
986                         }
987 
988                         le2 = le2->Flink;
989                     }
990 
991                     if (!inserted)
992                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993 
994                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996 
997                     if (data_items && level == 0) {
998                         le2 = data_items->Flink;
999 
1000                         while (le2 != data_items) {
1001                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1002                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1003 
1004                             while (le3 != &t3->itemlist) {
1005                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1006 
1007                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009 
1010                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1011                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1012 
1013                                         if (ed2->address == dr->address)
1014                                             ed2->address = dr->new_address;
1015                                     }
1016                                 }
1017 
1018                                 le3 = le3->Flink;
1019                             }
1020 
1021                             le2 = le2->Flink;
1022                         }
1023                     }
1024 
1025                     t3 = t4;
1026                 }
1027 
1028                 *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029 
1030                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1031                 if (!tw) {
1032                     ERR("out of memory\n");
1033                     Status = STATUS_INSUFFICIENT_RESOURCES;
1034                     goto end;
1035                 }
1036 
1037                 tw->address = mr->new_address;
1038                 tw->length = Vcb->superblock.node_size;
1039                 tw->data = (uint8_t*)mr->data;
1040 
1041                 if (IsListEmpty(&tree_writes))
1042                     InsertTailList(&tree_writes, &tw->list_entry);
1043                 else {
1044                     bool inserted = false;
1045 
1046                     le2 = tree_writes.Flink;
1047                     while (le2 != &tree_writes) {
1048                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1049 
1050                         if (tw2->address > tw->address) {
1051                             InsertHeadList(le2->Blink, &tw->list_entry);
1052                             inserted = true;
1053                             break;
1054                         }
1055 
1056                         le2 = le2->Flink;
1057                     }
1058 
1059                     if (!inserted)
1060                         InsertTailList(&tree_writes, &tw->list_entry);
1061                 }
1062             }
1063 
1064             le = le->Flink;
1065         }
1066     }
1067 
1068     Status = do_tree_writes(Vcb, &tree_writes, true);
1069     if (!NT_SUCCESS(Status)) {
1070         ERR("do_tree_writes returned %08x\n", Status);
1071         goto end;
1072     }
1073 
1074     le = items->Flink;
1075     while (le != items) {
1076         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1077 
1078         Status = add_metadata_reloc_extent_item(Vcb, mr);
1079         if (!NT_SUCCESS(Status)) {
1080             ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1081             goto end;
1082         }
1083 
1084         le = le->Flink;
1085     }
1086 
1087     Status = STATUS_SUCCESS;
1088 
1089 end:
1090     while (!IsListEmpty(&tree_writes)) {
1091         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1092         ExFreePool(tw);
1093     }
1094 
1095     return Status;
1096 }
1097 
1098 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1099     KEY searchkey;
1100     traverse_ptr tp;
1101     NTSTATUS Status;
1102     bool b;
1103     LIST_ENTRY items, rollback;
1104     uint32_t loaded = 0;
1105 
1106     TRACE("chunk %I64x\n", c->offset);
1107 
1108     InitializeListHead(&rollback);
1109     InitializeListHead(&items);
1110 
1111     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1112 
1113     searchkey.obj_id = c->offset;
1114     searchkey.obj_type = TYPE_METADATA_ITEM;
1115     searchkey.offset = 0xffffffffffffffff;
1116 
1117     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1118     if (!NT_SUCCESS(Status)) {
1119         ERR("find_item returned %08x\n", Status);
1120         goto end;
1121     }
1122 
1123     do {
1124         traverse_ptr next_tp;
1125 
1126         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1127             break;
1128 
1129         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1130             bool tree = false, skinny = false;
1131 
1132             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1133                 tree = true;
1134                 skinny = true;
1135             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1136                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1137                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1138 
1139                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1140                     tree = true;
1141             }
1142 
1143             if (tree) {
1144                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1145 
1146                 if (!NT_SUCCESS(Status)) {
1147                     ERR("add_metadata_reloc returned %08x\n", Status);
1148                     goto end;
1149                 }
1150 
1151                 loaded++;
1152 
1153                 if (loaded >= 64) // only do 64 at a time
1154                     break;
1155             }
1156         }
1157 
1158         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1159 
1160         if (b)
1161             tp = next_tp;
1162     } while (b);
1163 
1164     if (IsListEmpty(&items)) {
1165         *changed = false;
1166         Status = STATUS_SUCCESS;
1167         goto end;
1168     } else
1169         *changed = true;
1170 
1171     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1172     if (!NT_SUCCESS(Status)) {
1173         ERR("write_metadata_items returned %08x\n", Status);
1174         goto end;
1175     }
1176 
1177     Status = STATUS_SUCCESS;
1178 
1179     Vcb->need_write = true;
1180 
1181 end:
1182     if (NT_SUCCESS(Status)) {
1183         Status = do_write(Vcb, NULL);
1184         if (!NT_SUCCESS(Status))
1185             ERR("do_write returned %08x\n", Status);
1186     }
1187 
1188     if (NT_SUCCESS(Status))
1189         clear_rollback(&rollback);
1190     else
1191         do_rollback(Vcb, &rollback);
1192 
1193     free_trees(Vcb);
1194 
1195     ExReleaseResourceLite(&Vcb->tree_lock);
1196 
1197     while (!IsListEmpty(&items)) {
1198         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1199 
1200         while (!IsListEmpty(&mr->refs)) {
1201             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1202 
1203             ExFreePool(ref);
1204         }
1205 
1206         ExFreePool(mr);
1207     }
1208 
1209     return Status;
1210 }
1211 
1212 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1213                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1214     NTSTATUS Status;
1215     LIST_ENTRY* le;
1216     KEY searchkey;
1217     traverse_ptr tp;
1218     root* r = NULL;
1219     metadata_reloc* mr;
1220     uint64_t last_tree = 0;
1221     data_reloc_ref* ref;
1222 
1223     le = Vcb->roots.Flink;
1224     while (le != &Vcb->roots) {
1225         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1226 
1227         if (r2->id == edr->root) {
1228             r = r2;
1229             break;
1230         }
1231 
1232         le = le->Flink;
1233     }
1234 
1235     if (!r) {
1236         ERR("could not find subvol %I64x\n", edr->count);
1237         return STATUS_INTERNAL_ERROR;
1238     }
1239 
1240     searchkey.obj_id = edr->objid;
1241     searchkey.obj_type = TYPE_EXTENT_DATA;
1242     searchkey.offset = 0;
1243 
1244     Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1245     if (!NT_SUCCESS(Status)) {
1246         ERR("find_item returned %08x\n", Status);
1247         return Status;
1248     }
1249 
1250     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1251         traverse_ptr tp2;
1252 
1253         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1254             tp = tp2;
1255         else {
1256             ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1257             return STATUS_INTERNAL_ERROR;
1258         }
1259     }
1260 
1261     ref = NULL;
1262 
1263     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1264         traverse_ptr tp2;
1265 
1266         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1267             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1268 
1269             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1270                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1271 
1272                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1273                     if (ref && last_tree == tp.tree->header.address)
1274                         ref->edr.count++;
1275                     else {
1276                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1277                         if (!ref) {
1278                             ERR("out of memory\n");
1279                             return STATUS_INSUFFICIENT_RESOURCES;
1280                         }
1281 
1282                         ref->type = TYPE_EXTENT_DATA_REF;
1283                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1284                         ref->edr.count = 1;
1285 
1286                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1287                         if (!NT_SUCCESS(Status)) {
1288                             ERR("add_metadata_reloc_parent returned %08x\n", Status);
1289                             ExFreePool(ref);
1290                             return Status;
1291                         }
1292 
1293                         last_tree = tp.tree->header.address;
1294                         ref->parent = mr;
1295 
1296                         InsertTailList(&dr->refs, &ref->list_entry);
1297                     }
1298                 }
1299             }
1300         }
1301 
1302         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1303             tp = tp2;
1304         else
1305             break;
1306     }
1307 
1308     return STATUS_SUCCESS;
1309 }
1310 
1311 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1312                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1313     NTSTATUS Status;
1314     data_reloc* dr;
1315     EXTENT_ITEM* ei;
1316     uint16_t len;
1317     uint64_t inline_rc;
1318     uint8_t* ptr;
1319 
1320     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1321     if (!dr) {
1322         ERR("out of memory\n");
1323         return STATUS_INSUFFICIENT_RESOURCES;
1324     }
1325 
1326     dr->address = tp->item->key.obj_id;
1327     dr->size = tp->item->key.offset;
1328     dr->ei = (EXTENT_ITEM*)tp->item->data;
1329     InitializeListHead(&dr->refs);
1330 
1331     Status = delete_tree_item(Vcb, tp);
1332     if (!NT_SUCCESS(Status)) {
1333         ERR("delete_tree_item returned %08x\n", Status);
1334         return Status;
1335     }
1336 
1337     if (!c)
1338         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1339 
1340     if (c) {
1341         acquire_chunk_lock(c, Vcb);
1342 
1343         c->used -= tp->item->key.offset;
1344 
1345         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1346 
1347         release_chunk_lock(c, Vcb);
1348     }
1349 
1350     ei = (EXTENT_ITEM*)tp->item->data;
1351     inline_rc = 0;
1352 
1353     len = tp->item->size - sizeof(EXTENT_ITEM);
1354     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1355 
1356     while (len > 0) {
1357         uint8_t secttype = *ptr;
1358         uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1359 
1360         len--;
1361 
1362         if (sectlen > len) {
1363             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1364             return STATUS_INTERNAL_ERROR;
1365         }
1366 
1367         if (sectlen == 0) {
1368             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1369             return STATUS_INTERNAL_ERROR;
1370         }
1371 
1372         if (secttype == TYPE_EXTENT_DATA_REF) {
1373             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1374 
1375             inline_rc += edr->count;
1376 
1377             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1378             if (!NT_SUCCESS(Status)) {
1379                 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1380                 return Status;
1381             }
1382         } else if (secttype == TYPE_SHARED_DATA_REF) {
1383             metadata_reloc* mr;
1384             data_reloc_ref* ref;
1385 
1386             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1387             if (!ref) {
1388                 ERR("out of memory\n");
1389                 return STATUS_INSUFFICIENT_RESOURCES;
1390             }
1391 
1392             ref->type = TYPE_SHARED_DATA_REF;
1393             RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1394             inline_rc += ref->sdr.count;
1395 
1396             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1397             if (!NT_SUCCESS(Status)) {
1398                 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1399                 ExFreePool(ref);
1400                 return Status;
1401             }
1402 
1403             ref->parent = mr;
1404 
1405             InsertTailList(&dr->refs, &ref->list_entry);
1406         } else {
1407             ERR("unexpected tree type %x\n", secttype);
1408             return STATUS_INTERNAL_ERROR;
1409         }
1410 
1411 
1412         len -= sectlen;
1413         ptr += sizeof(uint8_t) + sectlen;
1414     }
1415 
1416     if (inline_rc < ei->refcount) { // look for non-inline entries
1417         traverse_ptr tp2 = *tp, next_tp;
1418 
1419         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1420             tp2 = next_tp;
1421 
1422             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1423                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1424                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1425                     if (!NT_SUCCESS(Status)) {
1426                         ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1427                         return Status;
1428                     }
1429 
1430                     Status = delete_tree_item(Vcb, &tp2);
1431                     if (!NT_SUCCESS(Status)) {
1432                         ERR("delete_tree_item returned %08x\n", Status);
1433                         return Status;
1434                     }
1435                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1436                     metadata_reloc* mr;
1437                     data_reloc_ref* ref;
1438 
1439                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1440                     if (!ref) {
1441                         ERR("out of memory\n");
1442                         return STATUS_INSUFFICIENT_RESOURCES;
1443                     }
1444 
1445                     ref->type = TYPE_SHARED_DATA_REF;
1446                     ref->sdr.offset = tp2.item->key.offset;
1447                     ref->sdr.count = *((uint32_t*)tp2.item->data);
1448 
1449                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1450                     if (!NT_SUCCESS(Status)) {
1451                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
1452                         ExFreePool(ref);
1453                         return Status;
1454                     }
1455 
1456                     ref->parent = mr;
1457                     InsertTailList(&dr->refs, &ref->list_entry);
1458 
1459                     Status = delete_tree_item(Vcb, &tp2);
1460                     if (!NT_SUCCESS(Status)) {
1461                         ERR("delete_tree_item returned %08x\n", Status);
1462                         return Status;
1463                     }
1464                 }
1465             } else
1466                 break;
1467         }
1468     }
1469 
1470     InsertTailList(items, &dr->list_entry);
1471 
1472     return STATUS_SUCCESS;
1473 }
1474 
1475 static void sort_data_reloc_refs(data_reloc* dr) {
1476     LIST_ENTRY newlist, *le;
1477 
1478     if (IsListEmpty(&dr->refs))
1479         return;
1480 
1481     // insertion sort
1482 
1483     InitializeListHead(&newlist);
1484 
1485     while (!IsListEmpty(&dr->refs)) {
1486         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1487         bool inserted = false;
1488 
1489         if (ref->type == TYPE_EXTENT_DATA_REF)
1490             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1491         else if (ref->type == TYPE_SHARED_DATA_REF)
1492             ref->hash = ref->parent->new_address;
1493 
1494         le = newlist.Flink;
1495         while (le != &newlist) {
1496             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1497 
1498             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1499                 InsertHeadList(le->Blink, &ref->list_entry);
1500                 inserted = true;
1501                 break;
1502             }
1503 
1504             le = le->Flink;
1505         }
1506 
1507         if (!inserted)
1508             InsertTailList(&newlist, &ref->list_entry);
1509     }
1510 
1511     le = newlist.Flink;
1512     while (le != &newlist) {
1513         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1514 
1515         if (le->Flink != &newlist) {
1516             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1517 
1518             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1519                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1520                 RemoveEntryList(&ref2->list_entry);
1521                 ref->edr.count += ref2->edr.count;
1522                 ExFreePool(ref2);
1523                 continue;
1524             }
1525         }
1526 
1527         le = le->Flink;
1528     }
1529 
1530     newlist.Flink->Blink = &dr->refs;
1531     newlist.Blink->Flink = &dr->refs;
1532     dr->refs.Flink = newlist.Flink;
1533     dr->refs.Blink = newlist.Blink;
1534 }
1535 
1536 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1537     NTSTATUS Status;
1538     LIST_ENTRY* le;
1539     uint64_t rc = 0;
1540     uint16_t inline_len;
1541     bool all_inline = true;
1542     data_reloc_ref* first_noninline = NULL;
1543     EXTENT_ITEM* ei;
1544     uint8_t* ptr;
1545 
1546     inline_len = sizeof(EXTENT_ITEM);
1547 
1548     sort_data_reloc_refs(dr);
1549 
1550     le = dr->refs.Flink;
1551     while (le != &dr->refs) {
1552         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1553         uint16_t extlen = 0;
1554 
1555         if (ref->type == TYPE_EXTENT_DATA_REF) {
1556             extlen += sizeof(EXTENT_DATA_REF);
1557             rc += ref->edr.count;
1558         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1559             extlen += sizeof(SHARED_DATA_REF);
1560             rc++;
1561         }
1562 
1563         if (all_inline) {
1564             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1565                 all_inline = false;
1566                 first_noninline = ref;
1567             } else
1568                 inline_len += extlen + 1;
1569         }
1570 
1571         le = le->Flink;
1572     }
1573 
1574     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1575     if (!ei) {
1576         ERR("out of memory\n");
1577         return STATUS_INSUFFICIENT_RESOURCES;
1578     }
1579 
1580     ei->refcount = rc;
1581     ei->generation = dr->ei->generation;
1582     ei->flags = dr->ei->flags;
1583     ptr = (uint8_t*)&ei[1];
1584 
1585     le = dr->refs.Flink;
1586     while (le != &dr->refs) {
1587         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1588 
1589         if (ref == first_noninline)
1590             break;
1591 
1592         *ptr = ref->type;
1593         ptr++;
1594 
1595         if (ref->type == TYPE_EXTENT_DATA_REF) {
1596             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1597 
1598             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1599 
1600             ptr += sizeof(EXTENT_DATA_REF);
1601         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1602             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1603 
1604             sdr->offset = ref->parent->new_address;
1605             sdr->count = ref->sdr.count;
1606 
1607             ptr += sizeof(SHARED_DATA_REF);
1608         }
1609 
1610         le = le->Flink;
1611     }
1612 
1613     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1614     if (!NT_SUCCESS(Status)) {
1615         ERR("insert_tree_item returned %08x\n", Status);
1616         return Status;
1617     }
1618 
1619     if (!all_inline) {
1620         le = &first_noninline->list_entry;
1621 
1622         while (le != &dr->refs) {
1623             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1624 
1625             if (ref->type == TYPE_EXTENT_DATA_REF) {
1626                 EXTENT_DATA_REF* edr;
1627 
1628                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1629                 if (!edr) {
1630                     ERR("out of memory\n");
1631                     return STATUS_INSUFFICIENT_RESOURCES;
1632                 }
1633 
1634                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1635 
1636                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1637                 if (!NT_SUCCESS(Status)) {
1638                     ERR("insert_tree_item returned %08x\n", Status);
1639                     return Status;
1640                 }
1641             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1642                 uint32_t* sdr;
1643 
1644                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1645                 if (!sdr) {
1646                     ERR("out of memory\n");
1647                     return STATUS_INSUFFICIENT_RESOURCES;
1648                 }
1649 
1650                 *sdr = ref->sdr.count;
1651 
1652                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1653                 if (!NT_SUCCESS(Status)) {
1654                     ERR("insert_tree_item returned %08x\n", Status);
1655                     return Status;
1656                 }
1657             }
1658 
1659             le = le->Flink;
1660         }
1661     }
1662 
1663     return STATUS_SUCCESS;
1664 }
1665 
1666 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1667     KEY searchkey;
1668     traverse_ptr tp;
1669     NTSTATUS Status;
1670     bool b;
1671     LIST_ENTRY items, metadata_items, rollback, *le;
1672     uint64_t loaded = 0, num_loaded = 0;
1673     chunk* newchunk = NULL;
1674     uint8_t* data = NULL;
1675 
1676     TRACE("chunk %I64x\n", c->offset);
1677 
1678     InitializeListHead(&rollback);
1679     InitializeListHead(&items);
1680     InitializeListHead(&metadata_items);
1681 
1682     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1683 
1684     searchkey.obj_id = c->offset;
1685     searchkey.obj_type = TYPE_EXTENT_ITEM;
1686     searchkey.offset = 0xffffffffffffffff;
1687 
1688     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1689     if (!NT_SUCCESS(Status)) {
1690         ERR("find_item returned %08x\n", Status);
1691         goto end;
1692     }
1693 
1694     do {
1695         traverse_ptr next_tp;
1696 
1697         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1698             break;
1699 
1700         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1701             bool tree = false;
1702 
1703             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1704                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1705 
1706                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1707                     tree = true;
1708             }
1709 
1710             if (!tree) {
1711                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1712 
1713                 if (!NT_SUCCESS(Status)) {
1714                     ERR("add_data_reloc returned %08x\n", Status);
1715                     goto end;
1716                 }
1717 
1718                 loaded += tp.item->key.offset;
1719                 num_loaded++;
1720 
1721                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1722                     break;
1723             }
1724         }
1725 
1726         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1727 
1728         if (b)
1729             tp = next_tp;
1730     } while (b);
1731 
1732     if (IsListEmpty(&items)) {
1733         *changed = false;
1734         Status = STATUS_SUCCESS;
1735         goto end;
1736     } else
1737         *changed = true;
1738 
1739     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1740     if (!data) {
1741         ERR("out of memory\n");
1742         Status = STATUS_INSUFFICIENT_RESOURCES;
1743         goto end;
1744     }
1745 
1746     le = items.Flink;
1747     while (le != &items) {
1748         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1749         bool done = false;
1750         LIST_ENTRY* le2;
1751         uint32_t* csum;
1752         RTL_BITMAP bmp;
1753         ULONG* bmparr;
1754         ULONG bmplen, runlength, index, lastoff;
1755 
1756         if (newchunk) {
1757             acquire_chunk_lock(newchunk, Vcb);
1758 
1759             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1760                 newchunk->used += dr->size;
1761                 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1762                 done = true;
1763             }
1764 
1765             release_chunk_lock(newchunk, Vcb);
1766         }
1767 
1768         if (!done) {
1769             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1770 
1771             le2 = Vcb->chunks.Flink;
1772             while (le2 != &Vcb->chunks) {
1773                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1774 
1775                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1776                     acquire_chunk_lock(c2, Vcb);
1777 
1778                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1779                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1780                             c2->used += dr->size;
1781                             space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1782                             release_chunk_lock(c2, Vcb);
1783                             newchunk = c2;
1784                             done = true;
1785                             break;
1786                         }
1787                     }
1788 
1789                     release_chunk_lock(c2, Vcb);
1790                 }
1791 
1792                 le2 = le2->Flink;
1793             }
1794 
1795             // allocate new chunk if necessary
1796             if (!done) {
1797                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1798 
1799                 if (!NT_SUCCESS(Status)) {
1800                     ERR("alloc_chunk returned %08x\n", Status);
1801                     ExReleaseResourceLite(&Vcb->chunk_lock);
1802                     goto end;
1803                 }
1804 
1805                 acquire_chunk_lock(newchunk, Vcb);
1806 
1807                 newchunk->balance_num = Vcb->balance.balance_num;
1808 
1809                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1810                     release_chunk_lock(newchunk, Vcb);
1811                     ExReleaseResourceLite(&Vcb->chunk_lock);
1812                     ERR("could not find address in new chunk\n");
1813                     Status = STATUS_DISK_FULL;
1814                     goto end;
1815                 } else {
1816                     newchunk->used += dr->size;
1817                     space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1818                 }
1819 
1820                 release_chunk_lock(newchunk, Vcb);
1821             }
1822 
1823             ExReleaseResourceLite(&Vcb->chunk_lock);
1824         }
1825 
1826         dr->newchunk = newchunk;
1827 
1828         bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1829 
1830         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1831         if (!bmparr) {
1832             ERR("out of memory\n");
1833             Status = STATUS_INSUFFICIENT_RESOURCES;
1834             goto end;
1835         }
1836 
1837         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG);
1838         if (!csum) {
1839             ERR("out of memory\n");
1840             ExFreePool(bmparr);
1841             Status = STATUS_INSUFFICIENT_RESOURCES;
1842             goto end;
1843         }
1844 
1845         RtlInitializeBitMap(&bmp, bmparr, bmplen);
1846         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1847 
1848         searchkey.obj_id = EXTENT_CSUM_ID;
1849         searchkey.obj_type = TYPE_EXTENT_CSUM;
1850         searchkey.offset = dr->address;
1851 
1852         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1853         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1854             ERR("find_item returned %08x\n", Status);
1855             ExFreePool(csum);
1856             ExFreePool(bmparr);
1857             goto end;
1858         }
1859 
1860         if (Status != STATUS_NOT_FOUND) {
1861             do {
1862                 traverse_ptr next_tp;
1863 
1864                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1865                     if (tp.item->key.offset >= dr->address + dr->size)
1866                         break;
1867                     else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) {
1868                         uint64_t cs = max(dr->address, tp.item->key.offset);
1869                         uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
1870 
1871                         RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1872                                       tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
1873                                       (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
1874 
1875                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1876 
1877                         if (ce == dr->address + dr->size)
1878                             break;
1879                     }
1880                 }
1881 
1882                 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1883                     tp = next_tp;
1884                 else
1885                     break;
1886             } while (true);
1887         }
1888 
1889         lastoff = 0;
1890         runlength = RtlFindFirstRunClear(&bmp, &index);
1891 
1892         while (runlength != 0) {
1893             if (index >= bmplen)
1894                 break;
1895 
1896             if (index + runlength >= bmplen) {
1897                 runlength = bmplen - index;
1898 
1899                 if (runlength == 0)
1900                     break;
1901             }
1902 
1903             if (index > lastoff) {
1904                 ULONG off = lastoff;
1905                 ULONG size = index - lastoff;
1906 
1907                 // handle no csum run
1908                 do {
1909                     ULONG rl;
1910 
1911                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1912                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1913                     else
1914                         rl = size;
1915 
1916                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1917                                        c, NULL, NULL, 0, false, NormalPagePriority);
1918                     if (!NT_SUCCESS(Status)) {
1919                         ERR("read_data returned %08x\n", Status);
1920                         ExFreePool(csum);
1921                         ExFreePool(bmparr);
1922                         goto end;
1923                     }
1924 
1925                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1926                                                  NULL, newchunk, false, 0, NormalPagePriority);
1927                     if (!NT_SUCCESS(Status)) {
1928                         ERR("write_data_complete returned %08x\n", Status);
1929                         ExFreePool(csum);
1930                         ExFreePool(bmparr);
1931                         goto end;
1932                     }
1933 
1934                     size -= rl;
1935                     off += rl;
1936                 } while (size > 0);
1937             }
1938 
1939             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1940             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1941 
1942             // handle csum run
1943             do {
1944                 ULONG rl;
1945 
1946                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1947                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1948                 else
1949                     rl = runlength;
1950 
1951                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data,
1952                                    c, NULL, NULL, 0, false, NormalPagePriority);
1953                 if (!NT_SUCCESS(Status)) {
1954                     ERR("read_data returned %08x\n", Status);
1955                     ExFreePool(csum);
1956                     ExFreePool(bmparr);
1957                     goto end;
1958                 }
1959 
1960                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1961                                              NULL, newchunk, false, 0, NormalPagePriority);
1962                 if (!NT_SUCCESS(Status)) {
1963                     ERR("write_data_complete returned %08x\n", Status);
1964                     ExFreePool(csum);
1965                     ExFreePool(bmparr);
1966                     goto end;
1967                 }
1968 
1969                 runlength -= rl;
1970                 index += rl;
1971             } while (runlength > 0);
1972 
1973             lastoff = index;
1974             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1975         }
1976 
1977         ExFreePool(csum);
1978         ExFreePool(bmparr);
1979 
1980         // handle final nocsum run
1981         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1982             ULONG off = lastoff;
1983             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1984 
1985             do {
1986                 ULONG rl;
1987 
1988                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1989                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1990                 else
1991                     rl = size;
1992 
1993                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1994                                    c, NULL, NULL, 0, false, NormalPagePriority);
1995                 if (!NT_SUCCESS(Status)) {
1996                     ERR("read_data returned %08x\n", Status);
1997                     goto end;
1998                 }
1999 
2000                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2001                                              NULL, newchunk, false, 0, NormalPagePriority);
2002                 if (!NT_SUCCESS(Status)) {
2003                     ERR("write_data_complete returned %08x\n", Status);
2004                     goto end;
2005                 }
2006 
2007                 size -= rl;
2008                 off += rl;
2009             } while (size > 0);
2010         }
2011 
2012         le = le->Flink;
2013     }
2014 
2015     ExFreePool(data);
2016     data = NULL;
2017 
2018     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2019     if (!NT_SUCCESS(Status)) {
2020         ERR("write_metadata_items returned %08x\n", Status);
2021         goto end;
2022     }
2023 
2024     le = items.Flink;
2025     while (le != &items) {
2026         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2027 
2028         Status = add_data_reloc_extent_item(Vcb, dr);
2029         if (!NT_SUCCESS(Status)) {
2030             ERR("add_data_reloc_extent_item returned %08x\n", Status);
2031             goto end;
2032         }
2033 
2034         le = le->Flink;
2035     }
2036 
2037     le = c->changed_extents.Flink;
2038     while (le != &c->changed_extents) {
2039         LIST_ENTRY *le2, *le3;
2040         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2041 
2042         le3 = le->Flink;
2043 
2044         le2 = items.Flink;
2045         while (le2 != &items) {
2046             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2047 
2048             if (ce->address == dr->address) {
2049                 ce->address = dr->new_address;
2050                 RemoveEntryList(&ce->list_entry);
2051                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2052                 break;
2053             }
2054 
2055             le2 = le2->Flink;
2056         }
2057 
2058         le = le3;
2059     }
2060 
2061     Status = STATUS_SUCCESS;
2062 
2063     Vcb->need_write = true;
2064 
2065 end:
2066     if (NT_SUCCESS(Status)) {
2067         // update extents in cache inodes before we flush
2068         le = Vcb->chunks.Flink;
2069         while (le != &Vcb->chunks) {
2070             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2071 
2072             if (c2->cache) {
2073                 LIST_ENTRY* le2;
2074 
2075                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2076 
2077                 le2 = c2->cache->extents.Flink;
2078                 while (le2 != &c2->cache->extents) {
2079                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2080 
2081                     if (!ext->ignore) {
2082                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2083                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2084 
2085                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2086                                 LIST_ENTRY* le3 = items.Flink;
2087                                 while (le3 != &items) {
2088                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2089 
2090                                     if (ed2->address == dr->address) {
2091                                         ed2->address = dr->new_address;
2092                                         break;
2093                                     }
2094 
2095                                     le3 = le3->Flink;
2096                                 }
2097                             }
2098                         }
2099                     }
2100 
2101                     le2 = le2->Flink;
2102                 }
2103 
2104                 ExReleaseResourceLite(c2->cache->Header.Resource);
2105             }
2106 
2107             le = le->Flink;
2108         }
2109 
2110         Status = do_write(Vcb, NULL);
2111         if (!NT_SUCCESS(Status))
2112             ERR("do_write returned %08x\n", Status);
2113     }
2114 
2115     if (NT_SUCCESS(Status)) {
2116         clear_rollback(&rollback);
2117 
2118         // update open FCBs
2119         // FIXME - speed this up(?)
2120 
2121         le = Vcb->all_fcbs.Flink;
2122         while (le != &Vcb->all_fcbs) {
2123             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2124             LIST_ENTRY* le2;
2125 
2126             ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2127 
2128             le2 = fcb->extents.Flink;
2129             while (le2 != &fcb->extents) {
2130                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2131 
2132                 if (!ext->ignore) {
2133                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2134                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2135 
2136                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2137                             LIST_ENTRY* le3 = items.Flink;
2138                             while (le3 != &items) {
2139                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2140 
2141                                 if (ed2->address == dr->address) {
2142                                     ed2->address = dr->new_address;
2143                                     break;
2144                                 }
2145 
2146                                 le3 = le3->Flink;
2147                             }
2148                         }
2149                     }
2150                 }
2151 
2152                 le2 = le2->Flink;
2153             }
2154 
2155             ExReleaseResourceLite(fcb->Header.Resource);
2156 
2157             le = le->Flink;
2158         }
2159     } else
2160         do_rollback(Vcb, &rollback);
2161 
2162     free_trees(Vcb);
2163 
2164     ExReleaseResourceLite(&Vcb->tree_lock);
2165 
2166     if (data)
2167         ExFreePool(data);
2168 
2169     while (!IsListEmpty(&items)) {
2170         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2171 
2172         while (!IsListEmpty(&dr->refs)) {
2173             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2174 
2175             ExFreePool(ref);
2176         }
2177 
2178         ExFreePool(dr);
2179     }
2180 
2181     while (!IsListEmpty(&metadata_items)) {
2182         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2183 
2184         while (!IsListEmpty(&mr->refs)) {
2185             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2186 
2187             ExFreePool(ref);
2188         }
2189 
2190         ExFreePool(mr);
2191     }
2192 
2193     return Status;
2194 }
2195 
2196 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2197     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2198         return BLOCK_FLAG_RAID0;
2199     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2200         return BLOCK_FLAG_RAID1;
2201     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2202         return BLOCK_FLAG_DUPLICATE;
2203     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2204         return BLOCK_FLAG_RAID10;
2205     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2206         return BLOCK_FLAG_RAID5;
2207     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2208         return BLOCK_FLAG_RAID6;
2209     else
2210         return BLOCK_FLAG_SINGLE;
2211 }
2212 
2213 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2214     btrfs_balance_opts* opts;
2215 
2216     opts = &Vcb->balance.opts[sort];
2217 
2218     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2219         return false;
2220 
2221     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2222         uint64_t type = get_chunk_dup_type(c);
2223 
2224         if (!(type & opts->profiles))
2225             return false;
2226     }
2227 
2228     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2229         uint16_t i;
2230         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2231         bool b = false;
2232 
2233         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2234             if (cis[i].dev_id == opts->devid) {
2235                 b = true;
2236                 break;
2237             }
2238         }
2239 
2240         if (!b)
2241             return false;
2242     }
2243 
2244     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2245         uint16_t i, factor;
2246         uint64_t physsize;
2247         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2248         bool b = false;
2249 
2250         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2251             factor = c->chunk_item->num_stripes;
2252         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2253             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2254         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2255             factor = c->chunk_item->num_stripes - 1;
2256         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2257             factor = c->chunk_item->num_stripes - 2;
2258         else // SINGLE, DUPLICATE, RAID1
2259             factor = 1;
2260 
2261         physsize = c->chunk_item->size / factor;
2262 
2263         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2264             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2265                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2266                 b = true;
2267                 break;
2268             }
2269         }
2270 
2271         if (!b)
2272             return false;
2273     }
2274 
2275     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2276         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2277             return false;
2278     }
2279 
2280     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2281         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2282             return false;
2283     }
2284 
2285     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2286         uint64_t usage = c->used * 100 / c->chunk_item->size;
2287 
2288         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2289         if (c->used > 0 && usage == 0)
2290             usage = 1;
2291 
2292         if (usage < opts->usage_start || usage > opts->usage_end)
2293             return false;
2294     }
2295 
2296     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2297         uint64_t type = get_chunk_dup_type(c);
2298 
2299         if (type == opts->convert)
2300             return false;
2301     }
2302 
2303     return true;
2304 }
2305 
2306 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2307     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2308         args->profiles = opts->profiles;
2309         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2310     }
2311 
2312     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2313         if (args->usage_start == 0) {
2314             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2315             args->usage_start = opts->usage_start;
2316             args->usage_end = opts->usage_end;
2317         } else {
2318             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2319             args->usage = opts->usage_end;
2320         }
2321     }
2322 
2323     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2324         args->devid = opts->devid;
2325         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2326     }
2327 
2328     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2329         args->drange_start = opts->drange_start;
2330         args->drange_end = opts->drange_end;
2331         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2332     }
2333 
2334     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2335         args->vrange_start = opts->vrange_start;
2336         args->vrange_end = opts->vrange_end;
2337         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2338     }
2339 
2340     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2341         args->convert = opts->convert;
2342         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2343 
2344         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2345             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2346     }
2347 
2348     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2349         if (args->limit_start == 0) {
2350             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2351             args->limit_start = (uint32_t)opts->limit_start;
2352             args->limit_end = (uint32_t)opts->limit_end;
2353         } else {
2354             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2355             args->limit = opts->limit_end;
2356         }
2357     }
2358 
2359     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2360         args->stripes_start = opts->stripes_start;
2361         args->stripes_end = opts->stripes_end;
2362         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2363     }
2364 }
2365 
2366 static NTSTATUS add_balance_item(device_extension* Vcb) {
2367     KEY searchkey;
2368     traverse_ptr tp;
2369     NTSTATUS Status;
2370     BALANCE_ITEM* bi;
2371 
2372     searchkey.obj_id = BALANCE_ITEM_ID;
2373     searchkey.obj_type = TYPE_TEMP_ITEM;
2374     searchkey.offset = 0;
2375 
2376     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2377 
2378     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2379     if (!NT_SUCCESS(Status)) {
2380         ERR("find_item returned %08x\n", Status);
2381         goto end;
2382     }
2383 
2384     if (!keycmp(tp.item->key, searchkey)) {
2385         Status = delete_tree_item(Vcb, &tp);
2386         if (!NT_SUCCESS(Status)) {
2387             ERR("delete_tree_item returned %08x\n", Status);
2388             goto end;
2389         }
2390     }
2391 
2392     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2393     if (!bi) {
2394         ERR("out of memory\n");
2395         Status = STATUS_INSUFFICIENT_RESOURCES;
2396         goto end;
2397     }
2398 
2399     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2400 
2401     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2402         bi->flags |= BALANCE_FLAGS_DATA;
2403         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2404     }
2405 
2406     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2407         bi->flags |= BALANCE_FLAGS_METADATA;
2408         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2409     }
2410 
2411     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2412         bi->flags |= BALANCE_FLAGS_SYSTEM;
2413         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2414     }
2415 
2416     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2417     if (!NT_SUCCESS(Status)) {
2418         ERR("insert_tree_item returned %08x\n", Status);
2419         ExFreePool(bi);
2420         goto end;
2421     }
2422 
2423     Status = STATUS_SUCCESS;
2424 
2425 end:
2426     if (NT_SUCCESS(Status)) {
2427         Status = do_write(Vcb, NULL);
2428         if (!NT_SUCCESS(Status))
2429             ERR("do_write returned %08x\n", Status);
2430     }
2431 
2432     free_trees(Vcb);
2433 
2434     ExReleaseResourceLite(&Vcb->tree_lock);
2435 
2436     return Status;
2437 }
2438 
2439 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2440     KEY searchkey;
2441     traverse_ptr tp;
2442     NTSTATUS Status;
2443 
2444     searchkey.obj_id = BALANCE_ITEM_ID;
2445     searchkey.obj_type = TYPE_TEMP_ITEM;
2446     searchkey.offset = 0;
2447 
2448     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2449 
2450     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2451     if (!NT_SUCCESS(Status)) {
2452         ERR("find_item returned %08x\n", Status);
2453         goto end;
2454     }
2455 
2456     if (!keycmp(tp.item->key, searchkey)) {
2457         Status = delete_tree_item(Vcb, &tp);
2458         if (!NT_SUCCESS(Status)) {
2459             ERR("delete_tree_item returned %08x\n", Status);
2460             goto end;
2461         }
2462 
2463         Status = do_write(Vcb, NULL);
2464         if (!NT_SUCCESS(Status)) {
2465             ERR("do_write returned %08x\n", Status);
2466             goto end;
2467         }
2468 
2469         free_trees(Vcb);
2470     }
2471 
2472     Status = STATUS_SUCCESS;
2473 
2474 end:
2475     ExReleaseResourceLite(&Vcb->tree_lock);
2476 
2477     return Status;
2478 }
2479 
2480 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2481     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2482 
2483     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2484         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2485         opts->profiles = args->profiles;
2486     }
2487 
2488     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2489         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2490 
2491         opts->usage_start = 0;
2492         opts->usage_end = (uint8_t)args->usage;
2493     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2494         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2495 
2496         opts->usage_start = (uint8_t)args->usage_start;
2497         opts->usage_end = (uint8_t)args->usage_end;
2498     }
2499 
2500     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2501         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2502         opts->devid = args->devid;
2503     }
2504 
2505     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2506         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2507         opts->drange_start = args->drange_start;
2508         opts->drange_end = args->drange_end;
2509     }
2510 
2511     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2512         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2513         opts->vrange_start = args->vrange_start;
2514         opts->vrange_end = args->vrange_end;
2515     }
2516 
2517     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2518         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2519 
2520         opts->limit_start = 0;
2521         opts->limit_end = args->limit;
2522     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2523         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2524 
2525         opts->limit_start = args->limit_start;
2526         opts->limit_end = args->limit_end;
2527     }
2528 
2529     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2530         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2531 
2532         opts->stripes_start = (uint16_t)args->stripes_start;
2533         opts->stripes_end = (uint16_t)args->stripes_end;
2534     }
2535 
2536     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2537         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2538         opts->convert = args->convert;
2539 
2540         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2541             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2542     }
2543 }
2544 
2545 static NTSTATUS remove_superblocks(device* dev) {
2546     NTSTATUS Status;
2547     superblock* sb;
2548     int i = 0;
2549 
2550     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2551     if (!sb) {
2552         ERR("out of memory\n");
2553         return STATUS_INSUFFICIENT_RESOURCES;
2554     }
2555 
2556     RtlZeroMemory(sb, sizeof(superblock));
2557 
2558     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2559         Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2560 
2561         if (!NT_SUCCESS(Status)) {
2562             ExFreePool(sb);
2563             return Status;
2564         }
2565 
2566         i++;
2567     }
2568 
2569     ExFreePool(sb);
2570 
2571     return STATUS_SUCCESS;
2572 }
2573 
2574 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2575     KEY searchkey;
2576     traverse_ptr tp;
2577     NTSTATUS Status;
2578     LIST_ENTRY* le;
2579     volume_device_extension* vde;
2580 
2581     if (Vcb->need_write) {
2582         Status = do_write(Vcb, NULL);
2583 
2584         if (!NT_SUCCESS(Status))
2585             ERR("do_write returned %08x\n", Status);
2586     } else
2587         Status = STATUS_SUCCESS;
2588 
2589     free_trees(Vcb);
2590 
2591     if (!NT_SUCCESS(Status))
2592         return Status;
2593 
2594     // remove entry in chunk tree
2595 
2596     searchkey.obj_id = 1;
2597     searchkey.obj_type = TYPE_DEV_ITEM;
2598     searchkey.offset = dev->devitem.dev_id;
2599 
2600     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2601     if (!NT_SUCCESS(Status)) {
2602         ERR("find_item returned %08x\n", Status);
2603         return Status;
2604     }
2605 
2606     if (!keycmp(searchkey, tp.item->key)) {
2607         Status = delete_tree_item(Vcb, &tp);
2608 
2609         if (!NT_SUCCESS(Status)) {
2610             ERR("delete_tree_item returned %08x\n", Status);
2611             return Status;
2612         }
2613     }
2614 
2615     // remove stats entry in device tree
2616 
2617     searchkey.obj_id = 0;
2618     searchkey.obj_type = TYPE_DEV_STATS;
2619     searchkey.offset = dev->devitem.dev_id;
2620 
2621     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2622     if (!NT_SUCCESS(Status)) {
2623         ERR("find_item returned %08x\n", Status);
2624         return Status;
2625     }
2626 
2627     if (!keycmp(searchkey, tp.item->key)) {
2628         Status = delete_tree_item(Vcb, &tp);
2629 
2630         if (!NT_SUCCESS(Status)) {
2631             ERR("delete_tree_item returned %08x\n", Status);
2632             return Status;
2633         }
2634     }
2635 
2636     // update superblock
2637 
2638     Vcb->superblock.num_devices--;
2639     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2640     Vcb->devices_loaded--;
2641 
2642     RemoveEntryList(&dev->list_entry);
2643 
2644     // flush
2645 
2646     Status = do_write(Vcb, NULL);
2647     if (!NT_SUCCESS(Status))
2648         ERR("do_write returned %08x\n", Status);
2649 
2650     free_trees(Vcb);
2651 
2652     if (!NT_SUCCESS(Status))
2653         return Status;
2654 
2655     if (!dev->readonly && dev->devobj) {
2656         Status = remove_superblocks(dev);
2657         if (!NT_SUCCESS(Status))
2658             WARN("remove_superblocks returned %08x\n", Status);
2659     }
2660 
2661     // remove entry in volume list
2662 
2663     vde = Vcb->vde;
2664 
2665     if (dev->devobj) {
2666         pdo_device_extension* pdode = vde->pdode;
2667 
2668         ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2669 
2670         le = pdode->children.Flink;
2671         while (le != &pdode->children) {
2672             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2673 
2674             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2675                 PFILE_OBJECT FileObject;
2676                 PDEVICE_OBJECT mountmgr;
2677                 UNICODE_STRING mmdevpath;
2678 
2679                 pdode->children_loaded--;
2680 
2681                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2682                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2683                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2684                     if (!NT_SUCCESS(Status))
2685                         ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2686                     else {
2687                         MOUNTDEV_NAME mdn;
2688 
2689                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2690                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2691                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2692                         else {
2693                             MOUNTDEV_NAME* mdn2;
2694                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2695 
2696                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2697                             if (!mdn2)
2698                                 ERR("out of memory\n");
2699                             else {
2700                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2701                                 if (!NT_SUCCESS(Status))
2702                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2703                                 else {
2704                                     UNICODE_STRING name;
2705 
2706                                     name.Buffer = mdn2->Name;
2707                                     name.Length = name.MaximumLength = mdn2->NameLength;
2708 
2709                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2710                                     if (!NT_SUCCESS(Status))
2711                                         WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2712                                 }
2713 
2714                                 ExFreePool(mdn2);
2715                             }
2716                         }
2717 
2718                         ObDereferenceObject(FileObject);
2719                     }
2720                 }
2721 
2722                 ExFreePool(vc->pnp_name.Buffer);
2723                 RemoveEntryList(&vc->list_entry);
2724                 ExFreePool(vc);
2725 
2726                 ObDereferenceObject(vc->fileobj);
2727 
2728                 break;
2729             }
2730 
2731             le = le->Flink;
2732         }
2733 
2734         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2735             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2736 
2737             le = pdode->children.Flink;
2738             while (le != &pdode->children) {
2739                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2740 
2741                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2742                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2743                     break;
2744                 }
2745 
2746                 le = le->Flink;
2747             }
2748         }
2749 
2750         pdode->num_children = Vcb->superblock.num_devices;
2751 
2752         ExReleaseResourceLite(&pdode->child_lock);
2753 
2754         // free dev
2755 
2756         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2757             trim_whole_device(dev);
2758     }
2759 
2760     while (!IsListEmpty(&dev->space)) {
2761         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2762         space* s = CONTAINING_RECORD(le2, space, list_entry);
2763 
2764         ExFreePool(s);
2765     }
2766 
2767     ExFreePool(dev);
2768 
2769     if (Vcb->trim) {
2770         Vcb->trim = false;
2771 
2772         le = Vcb->devices.Flink;
2773         while (le != &Vcb->devices) {
2774             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2775 
2776             if (dev2->trim) {
2777                 Vcb->trim = true;
2778                 break;
2779             }
2780 
2781             le = le->Flink;
2782         }
2783     }
2784 
2785     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2786 
2787     return STATUS_SUCCESS;
2788 }
2789 
2790 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2791     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2792     DEVICE_DATA_SET_RANGE* ranges;
2793     ULONG datalen, i;
2794     KEY searchkey;
2795     traverse_ptr tp;
2796     NTSTATUS Status;
2797     bool b;
2798     uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2799     LIST_ENTRY* le;
2800 
2801     dev->num_trim_entries = 0;
2802 
2803     searchkey.obj_id = dev->devitem.dev_id;
2804     searchkey.obj_type = TYPE_DEV_EXTENT;
2805     searchkey.offset = 0;
2806 
2807     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2808     if (!NT_SUCCESS(Status)) {
2809         ERR("find_item returned %08x\n", Status);
2810         return;
2811     }
2812 
2813     do {
2814         traverse_ptr next_tp;
2815 
2816         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2817             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2818                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2819 
2820                 if (tp.item->key.offset > lastoff)
2821                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2822 
2823                 lastoff = tp.item->key.offset + de->length;
2824             } else {
2825                 ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2826                 return;
2827             }
2828         }
2829 
2830         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2831 
2832         if (b) {
2833             tp = next_tp;
2834             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2835                 break;
2836         }
2837     } while (b);
2838 
2839     if (lastoff < dev->devitem.num_bytes)
2840         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2841 
2842     if (dev->num_trim_entries == 0)
2843         return;
2844 
2845     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2846 
2847     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2848     if (!dmdsa) {
2849         ERR("out of memory\n");
2850         goto end;
2851     }
2852 
2853     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2854     dmdsa->Action = DeviceDsmAction_Trim;
2855     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2856     dmdsa->ParameterBlockOffset = 0;
2857     dmdsa->ParameterBlockLength = 0;
2858     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2859     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2860 
2861     ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2862 
2863     i = 0;
2864     le = dev->trim_list.Flink;
2865     while (le != &dev->trim_list) {
2866         space* s = CONTAINING_RECORD(le, space, list_entry);
2867 
2868         ranges[i].StartingOffset = s->address;
2869         ranges[i].LengthInBytes = s->size;
2870         i++;
2871 
2872         le = le->Flink;
2873     }
2874 
2875     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2876     if (!NT_SUCCESS(Status))
2877         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2878 
2879     ExFreePool(dmdsa);
2880 
2881 end:
2882     while (!IsListEmpty(&dev->trim_list)) {
2883         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2884         ExFreePool(s);
2885     }
2886 
2887     dev->num_trim_entries = 0;
2888 }
2889 
2890 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2891     NTSTATUS Status;
2892     bool changed;
2893     LIST_ENTRY* le;
2894     chunk* rc;
2895 
2896     // FIXME - allow with metadata chunks?
2897 
2898     while (true) {
2899         rc = NULL;
2900 
2901         ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2902 
2903         ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2904 
2905         // choose the least-used chunk we haven't looked at yet
2906         le = Vcb->chunks.Flink;
2907         while (le != &Vcb->chunks) {
2908             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2909 
2910             // FIXME - skip full-size chunks over e.g. 90% full?
2911             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2912                 rc = c;
2913 
2914             le = le->Flink;
2915         }
2916 
2917         ExReleaseResourceLite(&Vcb->chunk_lock);
2918 
2919         if (!rc) {
2920             ExReleaseResourceLite(&Vcb->tree_lock);
2921             break;
2922         }
2923 
2924         if (rc->list_entry_balance.Flink) {
2925             RemoveEntryList(&rc->list_entry_balance);
2926             Vcb->balance.chunks_left--;
2927         }
2928 
2929         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2930         rc->reloc = true;
2931 
2932         ExReleaseResourceLite(&Vcb->tree_lock);
2933 
2934         do {
2935             changed = false;
2936 
2937             Status = balance_data_chunk(Vcb, rc, &changed);
2938             if (!NT_SUCCESS(Status)) {
2939                 ERR("balance_data_chunk returned %08x\n", Status);
2940                 Vcb->balance.status = Status;
2941                 rc->list_entry_balance.Flink = NULL;
2942                 rc->reloc = false;
2943                 return Status;
2944             }
2945 
2946             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2947 
2948             if (Vcb->readonly)
2949                 Vcb->balance.stopping = true;
2950 
2951             if (Vcb->balance.stopping)
2952                 return STATUS_SUCCESS;
2953         } while (changed);
2954 
2955         rc->list_entry_balance.Flink = NULL;
2956 
2957         rc->changed = true;
2958         rc->space_changed = true;
2959         rc->balance_num = Vcb->balance.balance_num;
2960 
2961         Status = do_write(Vcb, NULL);
2962         if (!NT_SUCCESS(Status)) {
2963             ERR("do_write returned %08x\n", Status);
2964             return Status;
2965         }
2966 
2967         free_trees(Vcb);
2968     }
2969 
2970     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2971 
2972     Status = alloc_chunk(Vcb, flags, &rc, true);
2973 
2974     ExReleaseResourceLite(&Vcb->chunk_lock);
2975 
2976     if (NT_SUCCESS(Status)) {
2977         *newchunk = rc;
2978         return Status;
2979     } else {
2980         ERR("alloc_chunk returned %08x\n", Status);
2981         return Status;
2982     }
2983 }
2984 
2985 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2986     LIST_ENTRY* le;
2987 
2988     while (!IsListEmpty(&dev->space)) {
2989         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
2990 
2991         ExFreePool(s);
2992     }
2993 
2994     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
2995 
2996     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
2997 
2998     le = Vcb->chunks.Flink;
2999     while (le != &Vcb->chunks) {
3000         uint16_t n;
3001         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3002         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3003 
3004         for (n = 0; n < c->chunk_item->num_stripes; n++) {
3005             uint64_t stripe_size = 0;
3006 
3007             if (cis[n].dev_id == dev->devitem.dev_id) {
3008                 if (stripe_size == 0) {
3009                     uint16_t factor;
3010 
3011                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3012                         factor = c->chunk_item->num_stripes;
3013                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3014                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3015                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3016                         factor = c->chunk_item->num_stripes - 1;
3017                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3018                         factor = c->chunk_item->num_stripes - 2;
3019                     else // SINGLE, DUP, RAID1
3020                         factor = 1;
3021 
3022                     stripe_size = c->chunk_item->size / factor;
3023                 }
3024 
3025                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3026             }
3027         }
3028 
3029         le = le->Flink;
3030     }
3031 
3032     return STATUS_SUCCESS;
3033 }
3034 
3035 _Function_class_(KSTART_ROUTINE)
3036 void __stdcall balance_thread(void* context) {
3037     device_extension* Vcb = (device_extension*)context;
3038     LIST_ENTRY chunks;
3039     LIST_ENTRY* le;
3040     uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3041     uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3042     NTSTATUS Status;
3043 
3044     Vcb->balance.balance_num++;
3045 
3046     Vcb->balance.stopping = false;
3047     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3048 
3049     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3050         old_data_flags = Vcb->data_flags;
3051         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3052 
3053         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3054     }
3055 
3056     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3057         old_metadata_flags = Vcb->metadata_flags;
3058         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3059     }
3060 
3061     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062         old_system_flags = Vcb->system_flags;
3063         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3064     }
3065 
3066     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3067         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3068             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3069         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3070             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3071     }
3072 
3073     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3074     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3075 
3076     InitializeListHead(&chunks);
3077 
3078     // FIXME - what are we supposed to do with limit_start?
3079 
3080     if (!Vcb->readonly) {
3081         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3082             Status = add_balance_item(Vcb);
3083             if (!NT_SUCCESS(Status)) {
3084                 ERR("add_balance_item returned %08x\n", Status);
3085                 Vcb->balance.status = Status;
3086                 goto end;
3087             }
3088         } else {
3089             if (Vcb->need_write) {
3090                 Status = do_write(Vcb, NULL);
3091 
3092                 free_trees(Vcb);
3093 
3094                 if (!NT_SUCCESS(Status)) {
3095                     ERR("do_write returned %08x\n", Status);
3096                     Vcb->balance.status = Status;
3097                     goto end;
3098                 }
3099             }
3100         }
3101     }
3102 
3103     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3104 
3105     if (Vcb->balance.stopping)
3106         goto end;
3107 
3108     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3109 
3110     le = Vcb->chunks.Flink;
3111     while (le != &Vcb->chunks) {
3112         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3113         uint8_t sort;
3114 
3115         acquire_chunk_lock(c, Vcb);
3116 
3117         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3118             sort = BALANCE_OPTS_DATA;
3119         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3120             sort = BALANCE_OPTS_METADATA;
3121         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3122             sort = BALANCE_OPTS_SYSTEM;
3123         else {
3124             ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3125             release_chunk_lock(c, Vcb);
3126             break;
3127         }
3128 
3129         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3130             should_balance_chunk(Vcb, sort, c)) {
3131             InsertTailList(&chunks, &c->list_entry_balance);
3132 
3133             num_chunks[sort]++;
3134             Vcb->balance.total_chunks++;
3135             Vcb->balance.chunks_left++;
3136         } else if (sort == BALANCE_OPTS_METADATA)
3137             okay_metadata_chunks++;
3138         else if (sort == BALANCE_OPTS_DATA)
3139             okay_data_chunks++;
3140         else if (sort == BALANCE_OPTS_SYSTEM)
3141             okay_system_chunks++;
3142 
3143         if (!c->cache_loaded) {
3144             Status = load_cache_chunk(Vcb, c, NULL);
3145 
3146             if (!NT_SUCCESS(Status)) {
3147                 ERR("load_cache_chunk returned %08x\n", Status);
3148                 Vcb->balance.status = Status;
3149                 release_chunk_lock(c, Vcb);
3150                 ExReleaseResourceLite(&Vcb->chunk_lock);
3151                 goto end;
3152             }
3153         }
3154 
3155         release_chunk_lock(c, Vcb);
3156 
3157         le = le->Flink;
3158     }
3159 
3160     ExReleaseResourceLite(&Vcb->chunk_lock);
3161 
3162     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3163     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3164         bool consolidated = false;
3165         chunk* c;
3166 
3167         if (okay_metadata_chunks == 0) {
3168             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3169 
3170             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3171             if (NT_SUCCESS(Status))
3172                 c->balance_num = Vcb->balance.balance_num;
3173             else if (Status != STATUS_DISK_FULL || consolidated) {
3174                 ERR("alloc_chunk returned %08x\n", Status);
3175                 ExReleaseResourceLite(&Vcb->chunk_lock);
3176                 Vcb->balance.status = Status;
3177                 goto end;
3178             }
3179 
3180             ExReleaseResourceLite(&Vcb->chunk_lock);
3181 
3182             if (Status == STATUS_DISK_FULL) {
3183                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3184                 if (!NT_SUCCESS(Status)) {
3185                     ERR("try_consolidation returned %08x\n", Status);
3186                     Vcb->balance.status = Status;
3187                     goto end;
3188                 } else
3189                     c->balance_num = Vcb->balance.balance_num;
3190 
3191                 consolidated = true;
3192 
3193                 if (Vcb->balance.stopping)
3194                     goto end;
3195             }
3196         }
3197 
3198         if (okay_data_chunks == 0) {
3199             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3200 
3201             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3202             if (NT_SUCCESS(Status))
3203                 c->balance_num = Vcb->balance.balance_num;
3204             else if (Status != STATUS_DISK_FULL || consolidated) {
3205                 ERR("alloc_chunk returned %08x\n", Status);
3206                 ExReleaseResourceLite(&Vcb->chunk_lock);
3207                 Vcb->balance.status = Status;
3208                 goto end;
3209             }
3210 
3211             ExReleaseResourceLite(&Vcb->chunk_lock);
3212 
3213             if (Status == STATUS_DISK_FULL) {
3214                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3215                 if (!NT_SUCCESS(Status)) {
3216                     ERR("try_consolidation returned %08x\n", Status);
3217                     Vcb->balance.status = Status;
3218                     goto end;
3219                 } else
3220                     c->balance_num = Vcb->balance.balance_num;
3221 
3222                 consolidated = true;
3223 
3224                 if (Vcb->balance.stopping)
3225                     goto end;
3226             }
3227         }
3228 
3229         if (okay_system_chunks == 0) {
3230             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3231 
3232             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3233             if (NT_SUCCESS(Status))
3234                 c->balance_num = Vcb->balance.balance_num;
3235             else if (Status != STATUS_DISK_FULL || consolidated) {
3236                 ERR("alloc_chunk returned %08x\n", Status);
3237                 ExReleaseResourceLite(&Vcb->chunk_lock);
3238                 Vcb->balance.status = Status;
3239                 goto end;
3240             }
3241 
3242             ExReleaseResourceLite(&Vcb->chunk_lock);
3243 
3244             if (Status == STATUS_DISK_FULL) {
3245                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3246                 if (!NT_SUCCESS(Status)) {
3247                     ERR("try_consolidation returned %08x\n", Status);
3248                     Vcb->balance.status = Status;
3249                     goto end;
3250                 } else
3251                     c->balance_num = Vcb->balance.balance_num;
3252 
3253                 consolidated = true;
3254 
3255                 if (Vcb->balance.stopping)
3256                     goto end;
3257             }
3258         }
3259     }
3260 
3261     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3262 
3263     le = chunks.Flink;
3264     while (le != &chunks) {
3265         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3266 
3267         c->reloc = true;
3268 
3269         le = le->Flink;
3270     }
3271 
3272     ExReleaseResourceLite(&Vcb->chunk_lock);
3273 
3274     // do data chunks before metadata
3275     le = chunks.Flink;
3276     while (le != &chunks) {
3277         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278         LIST_ENTRY* le2 = le->Flink;
3279 
3280         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3281             bool changed;
3282 
3283             do {
3284                 changed = false;
3285 
3286                 Status = balance_data_chunk(Vcb, c, &changed);
3287                 if (!NT_SUCCESS(Status)) {
3288                     ERR("balance_data_chunk returned %08x\n", Status);
3289                     Vcb->balance.status = Status;
3290                     goto end;
3291                 }
3292 
3293                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3294 
3295                 if (Vcb->readonly)
3296                     Vcb->balance.stopping = true;
3297 
3298                 if (Vcb->balance.stopping)
3299                     break;
3300             } while (changed);
3301 
3302             c->changed = true;
3303             c->space_changed = true;
3304         }
3305 
3306         if (Vcb->balance.stopping)
3307             goto end;
3308 
3309         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3310             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3311             RemoveEntryList(&c->list_entry_balance);
3312             c->list_entry_balance.Flink = NULL;
3313 
3314             Vcb->balance.chunks_left--;
3315         }
3316 
3317         le = le2;
3318     }
3319 
3320     // do metadata chunks
3321     while (!IsListEmpty(&chunks)) {
3322         chunk* c;
3323         bool changed;
3324 
3325         le = RemoveHeadList(&chunks);
3326         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3327 
3328         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3329             do {
3330                 Status = balance_metadata_chunk(Vcb, c, &changed);
3331                 if (!NT_SUCCESS(Status)) {
3332                     ERR("balance_metadata_chunk returned %08x\n", Status);
3333                     Vcb->balance.status = Status;
3334                     goto end;
3335                 }
3336 
3337                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3338 
3339                 if (Vcb->readonly)
3340                     Vcb->balance.stopping = true;
3341 
3342                 if (Vcb->balance.stopping)
3343                     break;
3344             } while (changed);
3345 
3346             c->changed = true;
3347             c->space_changed = true;
3348         }
3349 
3350         if (Vcb->balance.stopping)
3351             break;
3352 
3353         c->list_entry_balance.Flink = NULL;
3354 
3355         Vcb->balance.chunks_left--;
3356     }
3357 
3358 end:
3359     if (!Vcb->readonly) {
3360         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3361             le = chunks.Flink;
3362             while (le != &chunks) {
3363                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3364                 c->reloc = false;
3365 
3366                 le = le->Flink;
3367                 c->list_entry_balance.Flink = NULL;
3368             }
3369 
3370             if (old_data_flags != 0)
3371                 Vcb->data_flags = old_data_flags;
3372 
3373             if (old_metadata_flags != 0)
3374                 Vcb->metadata_flags = old_metadata_flags;
3375 
3376             if (old_system_flags != 0)
3377                 Vcb->system_flags = old_system_flags;
3378         }
3379 
3380         if (Vcb->balance.removing) {
3381             device* dev = NULL;
3382 
3383             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3384 
3385             le = Vcb->devices.Flink;
3386             while (le != &Vcb->devices) {
3387                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3388 
3389                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3390                     dev = dev2;
3391                     break;
3392                 }
3393 
3394                 le = le->Flink;
3395             }
3396 
3397             if (dev) {
3398                 if (Vcb->balance.chunks_left == 0) {
3399                     Status = finish_removing_device(Vcb, dev);
3400 
3401                     if (!NT_SUCCESS(Status)) {
3402                         ERR("finish_removing_device returned %08x\n", Status);
3403                         dev->reloc = false;
3404                     }
3405                 } else
3406                     dev->reloc = false;
3407             }
3408 
3409             ExReleaseResourceLite(&Vcb->tree_lock);
3410         } else if (Vcb->balance.shrinking) {
3411             device* dev = NULL;
3412 
3413             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3414 
3415             le = Vcb->devices.Flink;
3416             while (le != &Vcb->devices) {
3417                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3418 
3419                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3420                     dev = dev2;
3421                     break;
3422                 }
3423 
3424                 le = le->Flink;
3425             }
3426 
3427             if (!dev) {
3428                 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3429                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3430             }
3431 
3432             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3433                 if (dev) {
3434                     Status = regenerate_space_list(Vcb, dev);
3435                     if (!NT_SUCCESS(Status))
3436                         WARN("regenerate_space_list returned %08x\n", Status);
3437                 }
3438             } else {
3439                 uint64_t old_size;
3440 
3441                 old_size = dev->devitem.num_bytes;
3442                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3443 
3444                 Status = update_dev_item(Vcb, dev, NULL);
3445                 if (!NT_SUCCESS(Status)) {
3446                     ERR("update_dev_item returned %08x\n", Status);
3447                     dev->devitem.num_bytes = old_size;
3448                     Vcb->balance.status = Status;
3449 
3450                     Status = regenerate_space_list(Vcb, dev);
3451                     if (!NT_SUCCESS(Status))
3452                         WARN("regenerate_space_list returned %08x\n", Status);
3453                 } else {
3454                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3455 
3456                     Status = do_write(Vcb, NULL);
3457                     if (!NT_SUCCESS(Status))
3458                         ERR("do_write returned %08x\n", Status);
3459 
3460                     free_trees(Vcb);
3461                 }
3462             }
3463 
3464             ExReleaseResourceLite(&Vcb->tree_lock);
3465 
3466             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3467                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3468         } else {
3469             Status = remove_balance_item(Vcb);
3470             if (!NT_SUCCESS(Status)) {
3471                 ERR("remove_balance_item returned %08x\n", Status);
3472                 goto end;
3473             }
3474         }
3475 
3476         if (Vcb->trim && !Vcb->options.no_trim) {
3477             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3478 
3479             le = Vcb->devices.Flink;
3480             while (le != &Vcb->devices) {
3481                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3482 
3483                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3484                     trim_unalloc_space(Vcb, dev2);
3485 
3486                 le = le->Flink;
3487             }
3488 
3489             ExReleaseResourceLite(&Vcb->tree_lock);
3490         }
3491     }
3492 
3493     ZwClose(Vcb->balance.thread);
3494     Vcb->balance.thread = NULL;
3495 
3496     KeSetEvent(&Vcb->balance.finished, 0, false);
3497 }
3498 
3499 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3500     NTSTATUS Status;
3501     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3502     uint8_t i;
3503 
3504     if (length < sizeof(btrfs_start_balance) || !data)
3505         return STATUS_INVALID_PARAMETER;
3506 
3507     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3508         return STATUS_PRIVILEGE_NOT_HELD;
3509 
3510     if (Vcb->locked) {
3511         WARN("cannot start balance while locked\n");
3512         return STATUS_DEVICE_NOT_READY;
3513     }
3514 
3515     if (Vcb->scrub.thread) {
3516         WARN("cannot start balance while scrub running\n");
3517         return STATUS_DEVICE_NOT_READY;
3518     }
3519 
3520     if (Vcb->balance.thread) {
3521         WARN("balance already running\n");
3522         return STATUS_DEVICE_NOT_READY;
3523     }
3524 
3525     if (Vcb->readonly)
3526         return STATUS_MEDIA_WRITE_PROTECTED;
3527 
3528     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3529         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3530         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3531         return STATUS_SUCCESS;
3532 
3533     for (i = 0; i < 3; i++) {
3534         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3535             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3536                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3537                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3538 
3539                 if (bsb->opts[i].profiles == 0)
3540                     return STATUS_INVALID_PARAMETER;
3541             }
3542 
3543             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3544                 if (bsb->opts[i].devid == 0)
3545                     return STATUS_INVALID_PARAMETER;
3546             }
3547 
3548             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3549                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3550                     return STATUS_INVALID_PARAMETER;
3551             }
3552 
3553             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3554                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3555                     return STATUS_INVALID_PARAMETER;
3556             }
3557 
3558             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3559                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3560                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3561 
3562                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3563                     return STATUS_INVALID_PARAMETER;
3564             }
3565 
3566             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3567                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3568                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3569 
3570                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3571                     return STATUS_INVALID_PARAMETER;
3572             }
3573 
3574             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3575                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3576                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3577 
3578                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3579                     return STATUS_INVALID_PARAMETER;
3580             }
3581 
3582             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3583                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3584                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3585                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3586                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3587                     return STATUS_INVALID_PARAMETER;
3588             }
3589         }
3590     }
3591 
3592     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3593     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3594     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3595 
3596     Vcb->balance.paused = false;
3597     Vcb->balance.removing = false;
3598     Vcb->balance.shrinking = false;
3599     Vcb->balance.status = STATUS_SUCCESS;
3600     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3601 
3602     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3603     if (!NT_SUCCESS(Status)) {
3604         ERR("PsCreateSystemThread returned %08x\n", Status);
3605         return Status;
3606     }
3607 
3608     return STATUS_SUCCESS;
3609 }
3610 
3611 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3612     KEY searchkey;
3613     traverse_ptr tp;
3614     NTSTATUS Status;
3615     BALANCE_ITEM* bi;
3616     int i;
3617 
3618     searchkey.obj_id = BALANCE_ITEM_ID;
3619     searchkey.obj_type = TYPE_TEMP_ITEM;
3620     searchkey.offset = 0;
3621 
3622     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3623     if (!NT_SUCCESS(Status)) {
3624         ERR("find_item returned %08x\n", Status);
3625         return Status;
3626     }
3627 
3628     if (keycmp(tp.item->key, searchkey)) {
3629         TRACE("no balance item found\n");
3630         return STATUS_NOT_FOUND;
3631     }
3632 
3633     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3634         WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3635              tp.item->size, sizeof(BALANCE_ITEM));
3636         return STATUS_INTERNAL_ERROR;
3637     }
3638 
3639     bi = (BALANCE_ITEM*)tp.item->data;
3640 
3641     if (bi->flags & BALANCE_FLAGS_DATA)
3642         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3643 
3644     if (bi->flags & BALANCE_FLAGS_METADATA)
3645         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3646 
3647     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3648         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3649 
3650     // do the heuristics that Linux driver does
3651 
3652     for (i = 0; i < 3; i++) {
3653         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3654             // if converting, don't redo chunks already done
3655 
3656             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3657                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3658 
3659             // don't balance chunks more than 90% filled - presumably these
3660             // have already been done
3661 
3662             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3663                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3664             ) {
3665                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3666                 Vcb->balance.opts[i].usage_start = 0;
3667                 Vcb->balance.opts[i].usage_end = 90;
3668             }
3669         }
3670     }
3671 
3672     if (Vcb->readonly || Vcb->options.skip_balance)
3673         Vcb->balance.paused = true;
3674     else
3675         Vcb->balance.paused = false;
3676 
3677     Vcb->balance.removing = false;
3678     Vcb->balance.shrinking = false;
3679     Vcb->balance.status = STATUS_SUCCESS;
3680     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3681 
3682     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3683     if (!NT_SUCCESS(Status)) {
3684         ERR("PsCreateSystemThread returned %08x\n", Status);
3685         return Status;
3686     }
3687 
3688     return STATUS_SUCCESS;
3689 }
3690 
3691 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3692     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3693 
3694     if (length < sizeof(btrfs_query_balance) || !data)
3695         return STATUS_INVALID_PARAMETER;
3696 
3697     if (!Vcb->balance.thread) {
3698         bqb->status = BTRFS_BALANCE_STOPPED;
3699 
3700         if (!NT_SUCCESS(Vcb->balance.status)) {
3701             bqb->status |= BTRFS_BALANCE_ERROR;
3702             bqb->error = Vcb->balance.status;
3703         }
3704 
3705         return STATUS_SUCCESS;
3706     }
3707 
3708     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3709 
3710     if (Vcb->balance.removing)
3711         bqb->status |= BTRFS_BALANCE_REMOVAL;
3712 
3713     if (Vcb->balance.shrinking)
3714         bqb->status |= BTRFS_BALANCE_SHRINKING;
3715 
3716     if (!NT_SUCCESS(Vcb->balance.status))
3717         bqb->status |= BTRFS_BALANCE_ERROR;
3718 
3719     bqb->chunks_left = Vcb->balance.chunks_left;
3720     bqb->total_chunks = Vcb->balance.total_chunks;
3721     bqb->error = Vcb->balance.status;
3722     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3723     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3724     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3725 
3726     return STATUS_SUCCESS;
3727 }
3728 
3729 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3730     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3731         return STATUS_PRIVILEGE_NOT_HELD;
3732 
3733     if (!Vcb->balance.thread)
3734         return STATUS_DEVICE_NOT_READY;
3735 
3736     if (Vcb->balance.paused)
3737         return STATUS_DEVICE_NOT_READY;
3738 
3739     Vcb->balance.paused = true;
3740     KeClearEvent(&Vcb->balance.event);
3741 
3742     return STATUS_SUCCESS;
3743 }
3744 
3745 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3746     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3747         return STATUS_PRIVILEGE_NOT_HELD;
3748 
3749     if (!Vcb->balance.thread)
3750         return STATUS_DEVICE_NOT_READY;
3751 
3752     if (!Vcb->balance.paused)
3753         return STATUS_DEVICE_NOT_READY;
3754 
3755     if (Vcb->readonly)
3756         return STATUS_MEDIA_WRITE_PROTECTED;
3757 
3758     Vcb->balance.paused = false;
3759     KeSetEvent(&Vcb->balance.event, 0, false);
3760 
3761     return STATUS_SUCCESS;
3762 }
3763 
3764 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3765     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3766         return STATUS_PRIVILEGE_NOT_HELD;
3767 
3768     if (!Vcb->balance.thread)
3769         return STATUS_DEVICE_NOT_READY;
3770 
3771     Vcb->balance.paused = false;
3772     Vcb->balance.stopping = true;
3773     Vcb->balance.status = STATUS_SUCCESS;
3774     KeSetEvent(&Vcb->balance.event, 0, false);
3775 
3776     return STATUS_SUCCESS;
3777 }
3778 
3779 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3780     uint64_t devid;
3781     LIST_ENTRY* le;
3782     device* dev = NULL;
3783     NTSTATUS Status;
3784     int i;
3785     uint64_t num_rw_devices;
3786 
3787     TRACE("(%p, %p, %x)\n", Vcb, data, length);
3788 
3789     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3790         return STATUS_PRIVILEGE_NOT_HELD;
3791 
3792     if (length < sizeof(uint64_t))
3793         return STATUS_INVALID_PARAMETER;
3794 
3795     devid = *(uint64_t*)data;
3796 
3797     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3798 
3799     if (Vcb->readonly) {
3800         ExReleaseResourceLite(&Vcb->tree_lock);
3801         return STATUS_MEDIA_WRITE_PROTECTED;
3802     }
3803 
3804     num_rw_devices = 0;
3805 
3806     le = Vcb->devices.Flink;
3807     while (le != &Vcb->devices) {
3808         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3809 
3810         if (dev2->devitem.dev_id == devid)
3811             dev = dev2;
3812 
3813         if (!dev2->readonly)
3814             num_rw_devices++;
3815 
3816         le = le->Flink;
3817     }
3818 
3819     if (!dev) {
3820         ExReleaseResourceLite(&Vcb->tree_lock);
3821         WARN("device %I64x not found\n", devid);
3822         return STATUS_NOT_FOUND;
3823     }
3824 
3825     if (!dev->readonly) {
3826         if (num_rw_devices == 1) {
3827             ExReleaseResourceLite(&Vcb->tree_lock);
3828             WARN("not removing last non-readonly device\n");
3829             return STATUS_INVALID_PARAMETER;
3830         }
3831 
3832         if (num_rw_devices == 4 &&
3833             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3834              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3835         ) {
3836             ExReleaseResourceLite(&Vcb->tree_lock);
3837             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3838             return STATUS_CANNOT_DELETE;
3839         }
3840 
3841         if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3842             ExReleaseResourceLite(&Vcb->tree_lock);
3843             ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3844             return STATUS_CANNOT_DELETE;
3845         }
3846 
3847         if (num_rw_devices == 2 &&
3848             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3849              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3850         ) {
3851             ExReleaseResourceLite(&Vcb->tree_lock);
3852             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3853             return STATUS_CANNOT_DELETE;
3854         }
3855     }
3856 
3857     ExReleaseResourceLite(&Vcb->tree_lock);
3858 
3859     if (Vcb->balance.thread) {
3860         WARN("balance already running\n");
3861         return STATUS_DEVICE_NOT_READY;
3862     }
3863 
3864     dev->reloc = true;
3865 
3866     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3867 
3868     for (i = 0; i < 3; i++) {
3869         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3870         Vcb->balance.opts[i].devid = devid;
3871     }
3872 
3873     Vcb->balance.paused = false;
3874     Vcb->balance.removing = true;
3875     Vcb->balance.shrinking = false;
3876     Vcb->balance.status = STATUS_SUCCESS;
3877     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3878 
3879     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3880     if (!NT_SUCCESS(Status)) {
3881         ERR("PsCreateSystemThread returned %08x\n", Status);
3882         dev->reloc = false;
3883         return Status;
3884     }
3885 
3886     return STATUS_SUCCESS;
3887 }
3888