xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 1a83762c)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
23     uint64_t address;
24     uint64_t new_address;
25     tree_header* data;
26     EXTENT_ITEM* ei;
27     tree* t;
28     bool system;
29     LIST_ENTRY refs;
30     LIST_ENTRY list_entry;
31 } metadata_reloc;
32 
33 typedef struct {
34     uint8_t type;
35     uint64_t hash;
36 
37     union {
38         TREE_BLOCK_REF tbr;
39         SHARED_BLOCK_REF sbr;
40     };
41 
42     metadata_reloc* parent;
43     bool top;
44     LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46 
47 typedef struct {
48     uint64_t address;
49     uint64_t size;
50     uint64_t new_address;
51     chunk* newchunk;
52     EXTENT_ITEM* ei;
53     LIST_ENTRY refs;
54     LIST_ENTRY list_entry;
55 } data_reloc;
56 
57 typedef struct {
58     uint8_t type;
59     uint64_t hash;
60 
61     union {
62         EXTENT_DATA_REF edr;
63         SHARED_DATA_REF sdr;
64     };
65 
66     metadata_reloc* parent;
67     LIST_ENTRY list_entry;
68 } data_reloc_ref;
69 
70 #ifndef _MSC_VER // not in mingw yet
71 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
72 #endif
73 
74 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
75 
76 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
77                                    bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
78     NTSTATUS Status;
79     metadata_reloc* mr;
80     EXTENT_ITEM* ei;
81     uint16_t len;
82     uint64_t inline_rc;
83     uint8_t* ptr;
84 
85     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
86     if (!mr) {
87         ERR("out of memory\n");
88         return STATUS_INSUFFICIENT_RESOURCES;
89     }
90 
91     mr->address = tp->item->key.obj_id;
92     mr->data = NULL;
93     mr->ei = (EXTENT_ITEM*)tp->item->data;
94     mr->system = false;
95     InitializeListHead(&mr->refs);
96 
97     Status = delete_tree_item(Vcb, tp);
98     if (!NT_SUCCESS(Status)) {
99         ERR("delete_tree_item returned %08x\n", Status);
100         ExFreePool(mr);
101         return Status;
102     }
103 
104     if (!c)
105         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
106 
107     if (c) {
108         acquire_chunk_lock(c, Vcb);
109 
110         c->used -= Vcb->superblock.node_size;
111 
112         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
113 
114         release_chunk_lock(c, Vcb);
115     }
116 
117     ei = (EXTENT_ITEM*)tp->item->data;
118     inline_rc = 0;
119 
120     len = tp->item->size - sizeof(EXTENT_ITEM);
121     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
122     if (!skinny) {
123         len -= sizeof(EXTENT_ITEM2);
124         ptr += sizeof(EXTENT_ITEM2);
125     }
126 
127     while (len > 0) {
128         uint8_t secttype = *ptr;
129         uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
130         metadata_reloc_ref* ref;
131 
132         len--;
133 
134         if (sectlen > len) {
135             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
136             return STATUS_INTERNAL_ERROR;
137         }
138 
139         if (sectlen == 0) {
140             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
141             return STATUS_INTERNAL_ERROR;
142         }
143 
144         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
145         if (!ref) {
146             ERR("out of memory\n");
147             return STATUS_INSUFFICIENT_RESOURCES;
148         }
149 
150         if (secttype == TYPE_TREE_BLOCK_REF) {
151             ref->type = TYPE_TREE_BLOCK_REF;
152             RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
153             inline_rc++;
154         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
155             ref->type = TYPE_SHARED_BLOCK_REF;
156             RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
157             inline_rc++;
158         } else {
159             ERR("unexpected tree type %x\n", secttype);
160             ExFreePool(ref);
161             return STATUS_INTERNAL_ERROR;
162         }
163 
164         ref->parent = NULL;
165         ref->top = false;
166         InsertTailList(&mr->refs, &ref->list_entry);
167 
168         len -= sectlen;
169         ptr += sizeof(uint8_t) + sectlen;
170     }
171 
172     if (inline_rc < ei->refcount) { // look for non-inline entries
173         traverse_ptr tp2 = *tp, next_tp;
174 
175         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
176             tp2 = next_tp;
177 
178             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
179                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
180                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
181                     if (!ref) {
182                         ERR("out of memory\n");
183                         return STATUS_INSUFFICIENT_RESOURCES;
184                     }
185 
186                     ref->type = TYPE_TREE_BLOCK_REF;
187                     ref->tbr.offset = tp2.item->key.offset;
188                     ref->parent = NULL;
189                     ref->top = false;
190                     InsertTailList(&mr->refs, &ref->list_entry);
191 
192                     Status = delete_tree_item(Vcb, &tp2);
193                     if (!NT_SUCCESS(Status)) {
194                         ERR("delete_tree_item returned %08x\n", Status);
195                         return Status;
196                     }
197                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
198                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
199                     if (!ref) {
200                         ERR("out of memory\n");
201                         return STATUS_INSUFFICIENT_RESOURCES;
202                     }
203 
204                     ref->type = TYPE_SHARED_BLOCK_REF;
205                     ref->sbr.offset = tp2.item->key.offset;
206                     ref->parent = NULL;
207                     ref->top = false;
208                     InsertTailList(&mr->refs, &ref->list_entry);
209 
210                     Status = delete_tree_item(Vcb, &tp2);
211                     if (!NT_SUCCESS(Status)) {
212                         ERR("delete_tree_item returned %08x\n", Status);
213                         return Status;
214                     }
215                 }
216             } else
217                 break;
218         }
219     }
220 
221     InsertTailList(items, &mr->list_entry);
222 
223     if (mr2)
224         *mr2 = mr;
225 
226     return STATUS_SUCCESS;
227 }
228 
229 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
230                                           uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
231     LIST_ENTRY* le;
232     KEY searchkey;
233     traverse_ptr tp;
234     bool skinny = false;
235     NTSTATUS Status;
236 
237     le = items->Flink;
238     while (le != items) {
239         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
240 
241         if (mr->address == address) {
242             *mr2 = mr;
243             return STATUS_SUCCESS;
244         }
245 
246         le = le->Flink;
247     }
248 
249     searchkey.obj_id = address;
250     searchkey.obj_type = TYPE_METADATA_ITEM;
251     searchkey.offset = 0xffffffffffffffff;
252 
253     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
254     if (!NT_SUCCESS(Status)) {
255         ERR("find_item returned %08x\n", Status);
256         return Status;
257     }
258 
259     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
260         skinny = true;
261     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
262              tp.item->size >= sizeof(EXTENT_ITEM)) {
263         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
264 
265         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
266             ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
267             return STATUS_INTERNAL_ERROR;
268         }
269     } else {
270         ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
271         return STATUS_INTERNAL_ERROR;
272     }
273 
274     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
275     if (!NT_SUCCESS(Status)) {
276         ERR("add_metadata_reloc returned %08x\n", Status);
277         return Status;
278     }
279 
280     return STATUS_SUCCESS;
281 }
282 
283 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
284     LIST_ENTRY newlist, *le;
285 
286     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
287         return;
288 
289     // insertion sort
290 
291     InitializeListHead(&newlist);
292 
293     while (!IsListEmpty(&mr->refs)) {
294         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
295         bool inserted = false;
296 
297         if (ref->type == TYPE_TREE_BLOCK_REF)
298             ref->hash = ref->tbr.offset;
299         else if (ref->type == TYPE_SHARED_BLOCK_REF)
300             ref->hash = ref->parent->new_address;
301 
302         le = newlist.Flink;
303         while (le != &newlist) {
304             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
305 
306             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
307                 InsertHeadList(le->Blink, &ref->list_entry);
308                 inserted = true;
309                 break;
310             }
311 
312             le = le->Flink;
313         }
314 
315         if (!inserted)
316             InsertTailList(&newlist, &ref->list_entry);
317     }
318 
319     newlist.Flink->Blink = &mr->refs;
320     newlist.Blink->Flink = &mr->refs;
321     mr->refs.Flink = newlist.Flink;
322     mr->refs.Blink = newlist.Blink;
323 }
324 
325 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
326     NTSTATUS Status;
327     LIST_ENTRY* le;
328     uint64_t rc = 0;
329     uint16_t inline_len;
330     bool all_inline = true;
331     metadata_reloc_ref* first_noninline = NULL;
332     EXTENT_ITEM* ei;
333     uint8_t* ptr;
334 
335     inline_len = sizeof(EXTENT_ITEM);
336     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
337         inline_len += sizeof(EXTENT_ITEM2);
338 
339     sort_metadata_reloc_refs(mr);
340 
341     le = mr->refs.Flink;
342     while (le != &mr->refs) {
343         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
344         uint16_t extlen = 0;
345 
346         rc++;
347 
348         if (ref->type == TYPE_TREE_BLOCK_REF)
349             extlen += sizeof(TREE_BLOCK_REF);
350         else if (ref->type == TYPE_SHARED_BLOCK_REF)
351             extlen += sizeof(SHARED_BLOCK_REF);
352 
353         if (all_inline) {
354             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
355                 all_inline = false;
356                 first_noninline = ref;
357             } else
358                 inline_len += extlen + 1;
359         }
360 
361         le = le->Flink;
362     }
363 
364     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
365     if (!ei) {
366         ERR("out of memory\n");
367         return STATUS_INSUFFICIENT_RESOURCES;
368     }
369 
370     ei->refcount = rc;
371     ei->generation = mr->ei->generation;
372     ei->flags = mr->ei->flags;
373     ptr = (uint8_t*)&ei[1];
374 
375     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
376         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
377 
378         ei2->firstitem = *(KEY*)&mr->data[1];
379         ei2->level = mr->data->level;
380 
381         ptr += sizeof(EXTENT_ITEM2);
382     }
383 
384     le = mr->refs.Flink;
385     while (le != &mr->refs) {
386         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
387 
388         if (ref == first_noninline)
389             break;
390 
391         *ptr = ref->type;
392         ptr++;
393 
394         if (ref->type == TYPE_TREE_BLOCK_REF) {
395             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
396 
397             tbr->offset = ref->tbr.offset;
398 
399             ptr += sizeof(TREE_BLOCK_REF);
400         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
401             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
402 
403             sbr->offset = ref->parent->new_address;
404 
405             ptr += sizeof(SHARED_BLOCK_REF);
406         }
407 
408         le = le->Flink;
409     }
410 
411     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
412         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
413     else
414         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
415 
416     if (!NT_SUCCESS(Status)) {
417         ERR("insert_tree_item returned %08x\n", Status);
418         ExFreePool(ei);
419         return Status;
420     }
421 
422     if (!all_inline) {
423         le = &first_noninline->list_entry;
424 
425         while (le != &mr->refs) {
426             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
427 
428             if (ref->type == TYPE_TREE_BLOCK_REF) {
429                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
430                 if (!NT_SUCCESS(Status)) {
431                     ERR("insert_tree_item returned %08x\n", Status);
432                     return Status;
433                 }
434             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
435                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
436                 if (!NT_SUCCESS(Status)) {
437                     ERR("insert_tree_item returned %08x\n", Status);
438                     return Status;
439                 }
440             }
441 
442             le = le->Flink;
443         }
444     }
445 
446     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
447         if (mr->data->level > 0) {
448             uint16_t i;
449             internal_node* in = (internal_node*)&mr->data[1];
450 
451             for (i = 0; i < mr->data->num_items; i++) {
452                 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
453 
454                 if (sbrrc > 0) {
455                     SHARED_BLOCK_REF sbr;
456 
457                     sbr.offset = mr->new_address;
458 
459                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
460                     if (!NT_SUCCESS(Status)) {
461                         ERR("increase_extent_refcount returned %08x\n", Status);
462                         return Status;
463                     }
464 
465                     sbr.offset = mr->address;
466 
467                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
468                                                       sbr.offset, false, NULL);
469                     if (!NT_SUCCESS(Status)) {
470                         ERR("decrease_extent_refcount returned %08x\n", Status);
471                         return Status;
472                     }
473                 }
474             }
475         } else {
476             uint16_t i;
477             leaf_node* ln = (leaf_node*)&mr->data[1];
478 
479             for (i = 0; i < mr->data->num_items; i++) {
480                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
481                     EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
482 
483                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
484                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
485 
486                         if (ed2->size > 0) { // not sparse
487                             uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
488 
489                             if (sdrrc > 0) {
490                                 SHARED_DATA_REF sdr;
491                                 chunk* c;
492 
493                                 sdr.offset = mr->new_address;
494                                 sdr.count = sdrrc;
495 
496                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
497                                 if (!NT_SUCCESS(Status)) {
498                                     ERR("increase_extent_refcount returned %08x\n", Status);
499                                     return Status;
500                                 }
501 
502                                 sdr.offset = mr->address;
503 
504                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
505                                                                   sdr.offset, false, NULL);
506                                 if (!NT_SUCCESS(Status)) {
507                                     ERR("decrease_extent_refcount returned %08x\n", Status);
508                                     return Status;
509                                 }
510 
511                                 c = get_chunk_from_address(Vcb, ed2->address);
512 
513                                 if (c) {
514                                     // check changed_extents
515 
516                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
517 
518                                     le = c->changed_extents.Flink;
519 
520                                     while (le != &c->changed_extents) {
521                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
522 
523                                         if (ce->address == ed2->address) {
524                                             LIST_ENTRY* le2;
525 
526                                             le2 = ce->refs.Flink;
527                                             while (le2 != &ce->refs) {
528                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
529 
530                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
531                                                     cer->sdr.offset = mr->new_address;
532                                                     break;
533                                                 }
534 
535                                                 le2 = le2->Flink;
536                                             }
537 
538                                             le2 = ce->old_refs.Flink;
539                                             while (le2 != &ce->old_refs) {
540                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
541 
542                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
543                                                     cer->sdr.offset = mr->new_address;
544                                                     break;
545                                                 }
546 
547                                                 le2 = le2->Flink;
548                                             }
549 
550                                             break;
551                                         }
552 
553                                         le = le->Flink;
554                                     }
555 
556                                     ExReleaseResourceLite(&c->changed_extents_lock);
557                                 }
558                             }
559                         }
560                     }
561                 }
562             }
563         }
564     }
565 
566     return STATUS_SUCCESS;
567 }
568 
569 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
570                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
571     LIST_ENTRY tree_writes, *le;
572     NTSTATUS Status;
573     traverse_ptr tp;
574     uint8_t level, max_level = 0;
575     chunk* newchunk = NULL;
576 
577     InitializeListHead(&tree_writes);
578 
579     le = items->Flink;
580     while (le != items) {
581         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
582         LIST_ENTRY* le2;
583         chunk* pc;
584 
585         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
586         if (!mr->data) {
587             ERR("out of memory\n");
588             return STATUS_INSUFFICIENT_RESOURCES;
589         }
590 
591         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
592                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
593         if (!NT_SUCCESS(Status)) {
594             ERR("read_data returned %08x\n", Status);
595             return Status;
596         }
597 
598         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
599             mr->system = true;
600 
601         if (data_items && mr->data->level == 0) {
602             le2 = data_items->Flink;
603             while (le2 != data_items) {
604                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
605                 leaf_node* ln = (leaf_node*)&mr->data[1];
606                 uint16_t i;
607 
608                 for (i = 0; i < mr->data->num_items; i++) {
609                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
610                         EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
611 
612                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
613                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
614 
615                             if (ed2->address == dr->address)
616                                 ed2->address = dr->new_address;
617                         }
618                     }
619                 }
620 
621                 le2 = le2->Flink;
622             }
623         }
624 
625         if (mr->data->level > max_level)
626             max_level = mr->data->level;
627 
628         le2 = mr->refs.Flink;
629         while (le2 != &mr->refs) {
630             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
631 
632             if (ref->type == TYPE_TREE_BLOCK_REF) {
633                 KEY* firstitem;
634                 root* r = NULL;
635                 LIST_ENTRY* le3;
636                 tree* t;
637 
638                 firstitem = (KEY*)&mr->data[1];
639 
640                 le3 = Vcb->roots.Flink;
641                 while (le3 != &Vcb->roots) {
642                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
643 
644                     if (r2->id == ref->tbr.offset) {
645                         r = r2;
646                         break;
647                     }
648 
649                     le3 = le3->Flink;
650                 }
651 
652                 if (!r) {
653                     ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
654                     return STATUS_INTERNAL_ERROR;
655                 }
656 
657                 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
658                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
659                     ERR("find_item_to_level returned %08x\n", Status);
660                     return Status;
661                 }
662 
663                 t = tp.tree;
664                 while (t && t->header.level < mr->data->level + 1) {
665                     t = t->parent;
666                 }
667 
668                 if (!t)
669                     ref->top = true;
670                 else {
671                     metadata_reloc* mr2;
672 
673                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
674                     if (!NT_SUCCESS(Status)) {
675                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
676                         return Status;
677                     }
678 
679                     ref->parent = mr2;
680                 }
681             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
682                 metadata_reloc* mr2;
683 
684                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
685                 if (!NT_SUCCESS(Status)) {
686                     ERR("add_metadata_reloc_parent returned %08x\n", Status);
687                     return Status;
688                 }
689 
690                 ref->parent = mr2;
691             }
692 
693             le2 = le2->Flink;
694         }
695 
696         le = le->Flink;
697     }
698 
699     le = items->Flink;
700     while (le != items) {
701         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
702         LIST_ENTRY* le2;
703         uint32_t hash;
704 
705         mr->t = NULL;
706 
707         hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
708 
709         le2 = Vcb->trees_ptrs[hash >> 24];
710 
711         if (le2) {
712             while (le2 != &Vcb->trees_hash) {
713                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
714 
715                 if (t->header.address == mr->address) {
716                     mr->t = t;
717                     break;
718                 } else if (t->hash > hash)
719                     break;
720 
721                 le2 = le2->Flink;
722             }
723         }
724 
725         le = le->Flink;
726     }
727 
728     for (level = 0; level <= max_level; level++) {
729         le = items->Flink;
730         while (le != items) {
731             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
732 
733             if (mr->data->level == level) {
734                 bool done = false;
735                 LIST_ENTRY* le2;
736                 tree_write* tw;
737                 uint64_t flags;
738                 tree* t3;
739 
740                 if (mr->system)
741                     flags = Vcb->system_flags;
742                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
743                     flags = Vcb->data_flags;
744                 else
745                     flags = Vcb->metadata_flags;
746 
747                 if (newchunk) {
748                     acquire_chunk_lock(newchunk, Vcb);
749 
750                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
751                         newchunk->used += Vcb->superblock.node_size;
752                         space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
753                         done = true;
754                     }
755 
756                     release_chunk_lock(newchunk, Vcb);
757                 }
758 
759                 if (!done) {
760                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
761 
762                     le2 = Vcb->chunks.Flink;
763                     while (le2 != &Vcb->chunks) {
764                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
765 
766                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
767                             acquire_chunk_lock(c2, Vcb);
768 
769                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
770                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
771                                     c2->used += Vcb->superblock.node_size;
772                                     space_list_subtract(c2, false, mr->new_address, Vcb->superblock.node_size, rollback);
773                                     release_chunk_lock(c2, Vcb);
774                                     newchunk = c2;
775                                     done = true;
776                                     break;
777                                 }
778                             }
779 
780                             release_chunk_lock(c2, Vcb);
781                         }
782 
783                         le2 = le2->Flink;
784                     }
785 
786                     // allocate new chunk if necessary
787                     if (!done) {
788                         Status = alloc_chunk(Vcb, flags, &newchunk, false);
789 
790                         if (!NT_SUCCESS(Status)) {
791                             ERR("alloc_chunk returned %08x\n", Status);
792                             ExReleaseResourceLite(&Vcb->chunk_lock);
793                             goto end;
794                         }
795 
796                         acquire_chunk_lock(newchunk, Vcb);
797 
798                         newchunk->balance_num = Vcb->balance.balance_num;
799 
800                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
801                             release_chunk_lock(newchunk, Vcb);
802                             ExReleaseResourceLite(&Vcb->chunk_lock);
803                             ERR("could not find address in new chunk\n");
804                             Status = STATUS_DISK_FULL;
805                             goto end;
806                         } else {
807                             newchunk->used += Vcb->superblock.node_size;
808                             space_list_subtract(newchunk, false, mr->new_address, Vcb->superblock.node_size, rollback);
809                         }
810 
811                         release_chunk_lock(newchunk, Vcb);
812                     }
813 
814                     ExReleaseResourceLite(&Vcb->chunk_lock);
815                 }
816 
817                 // update parents
818                 le2 = mr->refs.Flink;
819                 while (le2 != &mr->refs) {
820                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
821 
822                     if (ref->parent) {
823                         uint16_t i;
824                         internal_node* in = (internal_node*)&ref->parent->data[1];
825 
826                         for (i = 0; i < ref->parent->data->num_items; i++) {
827                             if (in[i].address == mr->address) {
828                                 in[i].address = mr->new_address;
829                                 break;
830                             }
831                         }
832 
833                         if (ref->parent->t) {
834                             LIST_ENTRY* le3;
835 
836                             le3 = ref->parent->t->itemlist.Flink;
837                             while (le3 != &ref->parent->t->itemlist) {
838                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
839 
840                                 if (!td->inserted && td->treeholder.address == mr->address)
841                                     td->treeholder.address = mr->new_address;
842 
843                                 le3 = le3->Flink;
844                             }
845                         }
846                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
847                         LIST_ENTRY* le3;
848                         root* r = NULL;
849 
850                         // alter ROOT_ITEM
851 
852                         le3 = Vcb->roots.Flink;
853                         while (le3 != &Vcb->roots) {
854                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
855 
856                             if (r2->id == ref->tbr.offset) {
857                                 r = r2;
858                                 break;
859                             }
860 
861                             le3 = le3->Flink;
862                         }
863 
864                         if (r) {
865                             r->treeholder.address = mr->new_address;
866 
867                             if (r == Vcb->root_root)
868                                 Vcb->superblock.root_tree_addr = mr->new_address;
869                             else if (r == Vcb->chunk_root)
870                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
871                             else if (r->root_item.block_number == mr->address) {
872                                 KEY searchkey;
873                                 ROOT_ITEM* ri;
874 
875                                 r->root_item.block_number = mr->new_address;
876 
877                                 searchkey.obj_id = r->id;
878                                 searchkey.obj_type = TYPE_ROOT_ITEM;
879                                 searchkey.offset = 0xffffffffffffffff;
880 
881                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
882                                 if (!NT_SUCCESS(Status)) {
883                                     ERR("find_item returned %08x\n", Status);
884                                     goto end;
885                                 }
886 
887                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
888                                     ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
889                                     Status = STATUS_INTERNAL_ERROR;
890                                     goto end;
891                                 }
892 
893                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
894                                 if (!ri) {
895                                     ERR("out of memory\n");
896                                     Status = STATUS_INSUFFICIENT_RESOURCES;
897                                     goto end;
898                                 }
899 
900                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
901 
902                                 Status = delete_tree_item(Vcb, &tp);
903                                 if (!NT_SUCCESS(Status)) {
904                                     ERR("delete_tree_item returned %08x\n", Status);
905                                     goto end;
906                                 }
907 
908                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
909                                 if (!NT_SUCCESS(Status)) {
910                                     ERR("insert_tree_item returned %08x\n", Status);
911                                     goto end;
912                                 }
913                             }
914                         }
915                     }
916 
917                     le2 = le2->Flink;
918                 }
919 
920                 mr->data->address = mr->new_address;
921 
922                 t3 = mr->t;
923 
924                 while (t3) {
925                     uint8_t h;
926                     bool inserted;
927                     tree* t4 = NULL;
928 
929                     // check if tree loaded more than once
930                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
931                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
932 
933                         if (nt->header.address == t3->header.address)
934                             t4 = nt;
935                     }
936 
937                     t3->header.address = mr->new_address;
938 
939                     h = t3->hash >> 24;
940 
941                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
942                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
943                             Vcb->trees_ptrs[h] = NULL;
944                         else {
945                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
946 
947                             if (t2->hash >> 24 == h)
948                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
949                             else
950                                 Vcb->trees_ptrs[h] = NULL;
951                         }
952                     }
953 
954                     RemoveEntryList(&t3->list_entry_hash);
955 
956                     t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
957                     h = t3->hash >> 24;
958 
959                     if (!Vcb->trees_ptrs[h]) {
960                         uint8_t h2 = h;
961 
962                         le2 = Vcb->trees_hash.Flink;
963 
964                         if (h2 > 0) {
965                             h2--;
966                             do {
967                                 if (Vcb->trees_ptrs[h2]) {
968                                     le2 = Vcb->trees_ptrs[h2];
969                                     break;
970                                 }
971 
972                                 h2--;
973                             } while (h2 > 0);
974                         }
975                     } else
976                         le2 = Vcb->trees_ptrs[h];
977 
978                     inserted = false;
979                     while (le2 != &Vcb->trees_hash) {
980                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
981 
982                         if (t2->hash >= t3->hash) {
983                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
984                             inserted = true;
985                             break;
986                         }
987 
988                         le2 = le2->Flink;
989                     }
990 
991                     if (!inserted)
992                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
993 
994                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
995                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
996 
997                     if (data_items && level == 0) {
998                         le2 = data_items->Flink;
999 
1000                         while (le2 != data_items) {
1001                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1002                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1003 
1004                             while (le3 != &t3->itemlist) {
1005                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1006 
1007                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1008                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1009 
1010                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1011                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1012 
1013                                         if (ed2->address == dr->address)
1014                                             ed2->address = dr->new_address;
1015                                     }
1016                                 }
1017 
1018                                 le3 = le3->Flink;
1019                             }
1020 
1021                             le2 = le2->Flink;
1022                         }
1023                     }
1024 
1025                     t3 = t4;
1026                 }
1027 
1028                 *((uint32_t*)mr->data) = ~calc_crc32c(0xffffffff, (uint8_t*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1029 
1030                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1031                 if (!tw) {
1032                     ERR("out of memory\n");
1033                     Status = STATUS_INSUFFICIENT_RESOURCES;
1034                     goto end;
1035                 }
1036 
1037                 tw->address = mr->new_address;
1038                 tw->length = Vcb->superblock.node_size;
1039                 tw->data = (uint8_t*)mr->data;
1040                 tw->allocated = false;
1041 
1042                 if (IsListEmpty(&tree_writes))
1043                     InsertTailList(&tree_writes, &tw->list_entry);
1044                 else {
1045                     bool inserted = false;
1046 
1047                     le2 = tree_writes.Flink;
1048                     while (le2 != &tree_writes) {
1049                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1050 
1051                         if (tw2->address > tw->address) {
1052                             InsertHeadList(le2->Blink, &tw->list_entry);
1053                             inserted = true;
1054                             break;
1055                         }
1056 
1057                         le2 = le2->Flink;
1058                     }
1059 
1060                     if (!inserted)
1061                         InsertTailList(&tree_writes, &tw->list_entry);
1062                 }
1063             }
1064 
1065             le = le->Flink;
1066         }
1067     }
1068 
1069     Status = do_tree_writes(Vcb, &tree_writes, true);
1070     if (!NT_SUCCESS(Status)) {
1071         ERR("do_tree_writes returned %08x\n", Status);
1072         goto end;
1073     }
1074 
1075     le = items->Flink;
1076     while (le != items) {
1077         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1078 
1079         Status = add_metadata_reloc_extent_item(Vcb, mr);
1080         if (!NT_SUCCESS(Status)) {
1081             ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1082             goto end;
1083         }
1084 
1085         le = le->Flink;
1086     }
1087 
1088     Status = STATUS_SUCCESS;
1089 
1090 end:
1091     while (!IsListEmpty(&tree_writes)) {
1092         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1093 
1094         if (tw->allocated)
1095             ExFreePool(tw->data);
1096 
1097         ExFreePool(tw);
1098     }
1099 
1100     return Status;
1101 }
1102 
1103 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1104     KEY searchkey;
1105     traverse_ptr tp;
1106     NTSTATUS Status;
1107     bool b;
1108     LIST_ENTRY items, rollback;
1109     uint32_t loaded = 0;
1110 
1111     TRACE("chunk %I64x\n", c->offset);
1112 
1113     InitializeListHead(&rollback);
1114     InitializeListHead(&items);
1115 
1116     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1117 
1118     searchkey.obj_id = c->offset;
1119     searchkey.obj_type = TYPE_METADATA_ITEM;
1120     searchkey.offset = 0xffffffffffffffff;
1121 
1122     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1123     if (!NT_SUCCESS(Status)) {
1124         ERR("find_item returned %08x\n", Status);
1125         goto end;
1126     }
1127 
1128     do {
1129         traverse_ptr next_tp;
1130 
1131         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1132             break;
1133 
1134         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1135             bool tree = false, skinny = false;
1136 
1137             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1138                 tree = true;
1139                 skinny = true;
1140             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1141                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1142                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1143 
1144                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1145                     tree = true;
1146             }
1147 
1148             if (tree) {
1149                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1150 
1151                 if (!NT_SUCCESS(Status)) {
1152                     ERR("add_metadata_reloc returned %08x\n", Status);
1153                     goto end;
1154                 }
1155 
1156                 loaded++;
1157 
1158                 if (loaded >= 64) // only do 64 at a time
1159                     break;
1160             }
1161         }
1162 
1163         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1164 
1165         if (b)
1166             tp = next_tp;
1167     } while (b);
1168 
1169     if (IsListEmpty(&items)) {
1170         *changed = false;
1171         Status = STATUS_SUCCESS;
1172         goto end;
1173     } else
1174         *changed = true;
1175 
1176     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1177     if (!NT_SUCCESS(Status)) {
1178         ERR("write_metadata_items returned %08x\n", Status);
1179         goto end;
1180     }
1181 
1182     Status = STATUS_SUCCESS;
1183 
1184     Vcb->need_write = true;
1185 
1186 end:
1187     if (NT_SUCCESS(Status)) {
1188         Status = do_write(Vcb, NULL);
1189         if (!NT_SUCCESS(Status))
1190             ERR("do_write returned %08x\n", Status);
1191     }
1192 
1193     if (NT_SUCCESS(Status))
1194         clear_rollback(&rollback);
1195     else
1196         do_rollback(Vcb, &rollback);
1197 
1198     free_trees(Vcb);
1199 
1200     ExReleaseResourceLite(&Vcb->tree_lock);
1201 
1202     while (!IsListEmpty(&items)) {
1203         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1204 
1205         while (!IsListEmpty(&mr->refs)) {
1206             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1207 
1208             ExFreePool(ref);
1209         }
1210 
1211         if (mr->data)
1212             ExFreePool(mr->data);
1213 
1214         ExFreePool(mr);
1215     }
1216 
1217     return Status;
1218 }
1219 
1220 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1221                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1222     NTSTATUS Status;
1223     LIST_ENTRY* le;
1224     KEY searchkey;
1225     traverse_ptr tp;
1226     root* r = NULL;
1227     metadata_reloc* mr;
1228     uint64_t last_tree = 0;
1229     data_reloc_ref* ref;
1230 
1231     le = Vcb->roots.Flink;
1232     while (le != &Vcb->roots) {
1233         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1234 
1235         if (r2->id == edr->root) {
1236             r = r2;
1237             break;
1238         }
1239 
1240         le = le->Flink;
1241     }
1242 
1243     if (!r) {
1244         ERR("could not find subvol %I64x\n", edr->count);
1245         return STATUS_INTERNAL_ERROR;
1246     }
1247 
1248     searchkey.obj_id = edr->objid;
1249     searchkey.obj_type = TYPE_EXTENT_DATA;
1250     searchkey.offset = 0;
1251 
1252     Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1253     if (!NT_SUCCESS(Status)) {
1254         ERR("find_item returned %08x\n", Status);
1255         return Status;
1256     }
1257 
1258     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1259         traverse_ptr tp2;
1260 
1261         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1262             tp = tp2;
1263         else {
1264             ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1265             return STATUS_INTERNAL_ERROR;
1266         }
1267     }
1268 
1269     ref = NULL;
1270 
1271     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1272         traverse_ptr tp2;
1273 
1274         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1275             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1276 
1277             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1278                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1279 
1280                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1281                     if (ref && last_tree == tp.tree->header.address)
1282                         ref->edr.count++;
1283                     else {
1284                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1285                         if (!ref) {
1286                             ERR("out of memory\n");
1287                             return STATUS_INSUFFICIENT_RESOURCES;
1288                         }
1289 
1290                         ref->type = TYPE_EXTENT_DATA_REF;
1291                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1292                         ref->edr.count = 1;
1293 
1294                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1295                         if (!NT_SUCCESS(Status)) {
1296                             ERR("add_metadata_reloc_parent returned %08x\n", Status);
1297                             ExFreePool(ref);
1298                             return Status;
1299                         }
1300 
1301                         last_tree = tp.tree->header.address;
1302                         ref->parent = mr;
1303 
1304                         InsertTailList(&dr->refs, &ref->list_entry);
1305                     }
1306                 }
1307             }
1308         }
1309 
1310         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1311             tp = tp2;
1312         else
1313             break;
1314     }
1315 
1316     return STATUS_SUCCESS;
1317 }
1318 
1319 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1320                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1321     NTSTATUS Status;
1322     data_reloc* dr;
1323     EXTENT_ITEM* ei;
1324     uint16_t len;
1325     uint64_t inline_rc;
1326     uint8_t* ptr;
1327 
1328     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1329     if (!dr) {
1330         ERR("out of memory\n");
1331         return STATUS_INSUFFICIENT_RESOURCES;
1332     }
1333 
1334     dr->address = tp->item->key.obj_id;
1335     dr->size = tp->item->key.offset;
1336     dr->ei = (EXTENT_ITEM*)tp->item->data;
1337     InitializeListHead(&dr->refs);
1338 
1339     Status = delete_tree_item(Vcb, tp);
1340     if (!NT_SUCCESS(Status)) {
1341         ERR("delete_tree_item returned %08x\n", Status);
1342         return Status;
1343     }
1344 
1345     if (!c)
1346         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1347 
1348     if (c) {
1349         acquire_chunk_lock(c, Vcb);
1350 
1351         c->used -= tp->item->key.offset;
1352 
1353         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1354 
1355         release_chunk_lock(c, Vcb);
1356     }
1357 
1358     ei = (EXTENT_ITEM*)tp->item->data;
1359     inline_rc = 0;
1360 
1361     len = tp->item->size - sizeof(EXTENT_ITEM);
1362     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1363 
1364     while (len > 0) {
1365         uint8_t secttype = *ptr;
1366         uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1367 
1368         len--;
1369 
1370         if (sectlen > len) {
1371             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1372             return STATUS_INTERNAL_ERROR;
1373         }
1374 
1375         if (sectlen == 0) {
1376             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1377             return STATUS_INTERNAL_ERROR;
1378         }
1379 
1380         if (secttype == TYPE_EXTENT_DATA_REF) {
1381             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1382 
1383             inline_rc += edr->count;
1384 
1385             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1386             if (!NT_SUCCESS(Status)) {
1387                 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1388                 return Status;
1389             }
1390         } else if (secttype == TYPE_SHARED_DATA_REF) {
1391             metadata_reloc* mr;
1392             data_reloc_ref* ref;
1393 
1394             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1395             if (!ref) {
1396                 ERR("out of memory\n");
1397                 return STATUS_INSUFFICIENT_RESOURCES;
1398             }
1399 
1400             ref->type = TYPE_SHARED_DATA_REF;
1401             RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1402             inline_rc += ref->sdr.count;
1403 
1404             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1405             if (!NT_SUCCESS(Status)) {
1406                 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1407                 ExFreePool(ref);
1408                 return Status;
1409             }
1410 
1411             ref->parent = mr;
1412 
1413             InsertTailList(&dr->refs, &ref->list_entry);
1414         } else {
1415             ERR("unexpected tree type %x\n", secttype);
1416             return STATUS_INTERNAL_ERROR;
1417         }
1418 
1419 
1420         len -= sectlen;
1421         ptr += sizeof(uint8_t) + sectlen;
1422     }
1423 
1424     if (inline_rc < ei->refcount) { // look for non-inline entries
1425         traverse_ptr tp2 = *tp, next_tp;
1426 
1427         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1428             tp2 = next_tp;
1429 
1430             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1431                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1432                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1433                     if (!NT_SUCCESS(Status)) {
1434                         ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1435                         return Status;
1436                     }
1437 
1438                     Status = delete_tree_item(Vcb, &tp2);
1439                     if (!NT_SUCCESS(Status)) {
1440                         ERR("delete_tree_item returned %08x\n", Status);
1441                         return Status;
1442                     }
1443                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1444                     metadata_reloc* mr;
1445                     data_reloc_ref* ref;
1446 
1447                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1448                     if (!ref) {
1449                         ERR("out of memory\n");
1450                         return STATUS_INSUFFICIENT_RESOURCES;
1451                     }
1452 
1453                     ref->type = TYPE_SHARED_DATA_REF;
1454                     ref->sdr.offset = tp2.item->key.offset;
1455                     ref->sdr.count = *((uint32_t*)tp2.item->data);
1456 
1457                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1458                     if (!NT_SUCCESS(Status)) {
1459                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
1460                         ExFreePool(ref);
1461                         return Status;
1462                     }
1463 
1464                     ref->parent = mr;
1465                     InsertTailList(&dr->refs, &ref->list_entry);
1466 
1467                     Status = delete_tree_item(Vcb, &tp2);
1468                     if (!NT_SUCCESS(Status)) {
1469                         ERR("delete_tree_item returned %08x\n", Status);
1470                         return Status;
1471                     }
1472                 }
1473             } else
1474                 break;
1475         }
1476     }
1477 
1478     InsertTailList(items, &dr->list_entry);
1479 
1480     return STATUS_SUCCESS;
1481 }
1482 
1483 static void sort_data_reloc_refs(data_reloc* dr) {
1484     LIST_ENTRY newlist, *le;
1485 
1486     if (IsListEmpty(&dr->refs))
1487         return;
1488 
1489     // insertion sort
1490 
1491     InitializeListHead(&newlist);
1492 
1493     while (!IsListEmpty(&dr->refs)) {
1494         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1495         bool inserted = false;
1496 
1497         if (ref->type == TYPE_EXTENT_DATA_REF)
1498             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1499         else if (ref->type == TYPE_SHARED_DATA_REF)
1500             ref->hash = ref->parent->new_address;
1501 
1502         le = newlist.Flink;
1503         while (le != &newlist) {
1504             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1505 
1506             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1507                 InsertHeadList(le->Blink, &ref->list_entry);
1508                 inserted = true;
1509                 break;
1510             }
1511 
1512             le = le->Flink;
1513         }
1514 
1515         if (!inserted)
1516             InsertTailList(&newlist, &ref->list_entry);
1517     }
1518 
1519     le = newlist.Flink;
1520     while (le != &newlist) {
1521         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1522 
1523         if (le->Flink != &newlist) {
1524             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1525 
1526             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1527                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1528                 RemoveEntryList(&ref2->list_entry);
1529                 ref->edr.count += ref2->edr.count;
1530                 ExFreePool(ref2);
1531                 continue;
1532             }
1533         }
1534 
1535         le = le->Flink;
1536     }
1537 
1538     newlist.Flink->Blink = &dr->refs;
1539     newlist.Blink->Flink = &dr->refs;
1540     dr->refs.Flink = newlist.Flink;
1541     dr->refs.Blink = newlist.Blink;
1542 }
1543 
1544 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1545     NTSTATUS Status;
1546     LIST_ENTRY* le;
1547     uint64_t rc = 0;
1548     uint16_t inline_len;
1549     bool all_inline = true;
1550     data_reloc_ref* first_noninline = NULL;
1551     EXTENT_ITEM* ei;
1552     uint8_t* ptr;
1553 
1554     inline_len = sizeof(EXTENT_ITEM);
1555 
1556     sort_data_reloc_refs(dr);
1557 
1558     le = dr->refs.Flink;
1559     while (le != &dr->refs) {
1560         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1561         uint16_t extlen = 0;
1562 
1563         if (ref->type == TYPE_EXTENT_DATA_REF) {
1564             extlen += sizeof(EXTENT_DATA_REF);
1565             rc += ref->edr.count;
1566         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1567             extlen += sizeof(SHARED_DATA_REF);
1568             rc++;
1569         }
1570 
1571         if (all_inline) {
1572             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1573                 all_inline = false;
1574                 first_noninline = ref;
1575             } else
1576                 inline_len += extlen + 1;
1577         }
1578 
1579         le = le->Flink;
1580     }
1581 
1582     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1583     if (!ei) {
1584         ERR("out of memory\n");
1585         return STATUS_INSUFFICIENT_RESOURCES;
1586     }
1587 
1588     ei->refcount = rc;
1589     ei->generation = dr->ei->generation;
1590     ei->flags = dr->ei->flags;
1591     ptr = (uint8_t*)&ei[1];
1592 
1593     le = dr->refs.Flink;
1594     while (le != &dr->refs) {
1595         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1596 
1597         if (ref == first_noninline)
1598             break;
1599 
1600         *ptr = ref->type;
1601         ptr++;
1602 
1603         if (ref->type == TYPE_EXTENT_DATA_REF) {
1604             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1605 
1606             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1607 
1608             ptr += sizeof(EXTENT_DATA_REF);
1609         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1610             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1611 
1612             sdr->offset = ref->parent->new_address;
1613             sdr->count = ref->sdr.count;
1614 
1615             ptr += sizeof(SHARED_DATA_REF);
1616         }
1617 
1618         le = le->Flink;
1619     }
1620 
1621     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1622     if (!NT_SUCCESS(Status)) {
1623         ERR("insert_tree_item returned %08x\n", Status);
1624         return Status;
1625     }
1626 
1627     if (!all_inline) {
1628         le = &first_noninline->list_entry;
1629 
1630         while (le != &dr->refs) {
1631             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1632 
1633             if (ref->type == TYPE_EXTENT_DATA_REF) {
1634                 EXTENT_DATA_REF* edr;
1635 
1636                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1637                 if (!edr) {
1638                     ERR("out of memory\n");
1639                     return STATUS_INSUFFICIENT_RESOURCES;
1640                 }
1641 
1642                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1643 
1644                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1645                 if (!NT_SUCCESS(Status)) {
1646                     ERR("insert_tree_item returned %08x\n", Status);
1647                     return Status;
1648                 }
1649             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1650                 uint32_t* sdr;
1651 
1652                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1653                 if (!sdr) {
1654                     ERR("out of memory\n");
1655                     return STATUS_INSUFFICIENT_RESOURCES;
1656                 }
1657 
1658                 *sdr = ref->sdr.count;
1659 
1660                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1661                 if (!NT_SUCCESS(Status)) {
1662                     ERR("insert_tree_item returned %08x\n", Status);
1663                     return Status;
1664                 }
1665             }
1666 
1667             le = le->Flink;
1668         }
1669     }
1670 
1671     return STATUS_SUCCESS;
1672 }
1673 
1674 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1675     KEY searchkey;
1676     traverse_ptr tp;
1677     NTSTATUS Status;
1678     bool b;
1679     LIST_ENTRY items, metadata_items, rollback, *le;
1680     uint64_t loaded = 0, num_loaded = 0;
1681     chunk* newchunk = NULL;
1682     uint8_t* data = NULL;
1683 
1684     TRACE("chunk %I64x\n", c->offset);
1685 
1686     InitializeListHead(&rollback);
1687     InitializeListHead(&items);
1688     InitializeListHead(&metadata_items);
1689 
1690     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1691 
1692     searchkey.obj_id = c->offset;
1693     searchkey.obj_type = TYPE_EXTENT_ITEM;
1694     searchkey.offset = 0xffffffffffffffff;
1695 
1696     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1697     if (!NT_SUCCESS(Status)) {
1698         ERR("find_item returned %08x\n", Status);
1699         goto end;
1700     }
1701 
1702     do {
1703         traverse_ptr next_tp;
1704 
1705         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1706             break;
1707 
1708         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1709             bool tree = false;
1710 
1711             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1712                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1713 
1714                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1715                     tree = true;
1716             }
1717 
1718             if (!tree) {
1719                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1720 
1721                 if (!NT_SUCCESS(Status)) {
1722                     ERR("add_data_reloc returned %08x\n", Status);
1723                     goto end;
1724                 }
1725 
1726                 loaded += tp.item->key.offset;
1727                 num_loaded++;
1728 
1729                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1730                     break;
1731             }
1732         }
1733 
1734         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1735 
1736         if (b)
1737             tp = next_tp;
1738     } while (b);
1739 
1740     if (IsListEmpty(&items)) {
1741         *changed = false;
1742         Status = STATUS_SUCCESS;
1743         goto end;
1744     } else
1745         *changed = true;
1746 
1747     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1748     if (!data) {
1749         ERR("out of memory\n");
1750         Status = STATUS_INSUFFICIENT_RESOURCES;
1751         goto end;
1752     }
1753 
1754     le = items.Flink;
1755     while (le != &items) {
1756         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1757         bool done = false;
1758         LIST_ENTRY* le2;
1759         uint32_t* csum;
1760         RTL_BITMAP bmp;
1761         ULONG* bmparr;
1762         ULONG bmplen, runlength, index, lastoff;
1763 
1764         if (newchunk) {
1765             acquire_chunk_lock(newchunk, Vcb);
1766 
1767             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1768                 newchunk->used += dr->size;
1769                 space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1770                 done = true;
1771             }
1772 
1773             release_chunk_lock(newchunk, Vcb);
1774         }
1775 
1776         if (!done) {
1777             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1778 
1779             le2 = Vcb->chunks.Flink;
1780             while (le2 != &Vcb->chunks) {
1781                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1782 
1783                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1784                     acquire_chunk_lock(c2, Vcb);
1785 
1786                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1787                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1788                             c2->used += dr->size;
1789                             space_list_subtract(c2, false, dr->new_address, dr->size, &rollback);
1790                             release_chunk_lock(c2, Vcb);
1791                             newchunk = c2;
1792                             done = true;
1793                             break;
1794                         }
1795                     }
1796 
1797                     release_chunk_lock(c2, Vcb);
1798                 }
1799 
1800                 le2 = le2->Flink;
1801             }
1802 
1803             // allocate new chunk if necessary
1804             if (!done) {
1805                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1806 
1807                 if (!NT_SUCCESS(Status)) {
1808                     ERR("alloc_chunk returned %08x\n", Status);
1809                     ExReleaseResourceLite(&Vcb->chunk_lock);
1810                     goto end;
1811                 }
1812 
1813                 acquire_chunk_lock(newchunk, Vcb);
1814 
1815                 newchunk->balance_num = Vcb->balance.balance_num;
1816 
1817                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1818                     release_chunk_lock(newchunk, Vcb);
1819                     ExReleaseResourceLite(&Vcb->chunk_lock);
1820                     ERR("could not find address in new chunk\n");
1821                     Status = STATUS_DISK_FULL;
1822                     goto end;
1823                 } else {
1824                     newchunk->used += dr->size;
1825                     space_list_subtract(newchunk, false, dr->new_address, dr->size, &rollback);
1826                 }
1827 
1828                 release_chunk_lock(newchunk, Vcb);
1829             }
1830 
1831             ExReleaseResourceLite(&Vcb->chunk_lock);
1832         }
1833 
1834         dr->newchunk = newchunk;
1835 
1836         bmplen = (ULONG)(dr->size / Vcb->superblock.sector_size);
1837 
1838         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1839         if (!bmparr) {
1840             ERR("out of memory\n");
1841             Status = STATUS_INSUFFICIENT_RESOURCES;
1842             goto end;
1843         }
1844 
1845         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(uint32_t) / Vcb->superblock.sector_size), ALLOC_TAG);
1846         if (!csum) {
1847             ERR("out of memory\n");
1848             ExFreePool(bmparr);
1849             Status = STATUS_INSUFFICIENT_RESOURCES;
1850             goto end;
1851         }
1852 
1853         RtlInitializeBitMap(&bmp, bmparr, bmplen);
1854         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1855 
1856         searchkey.obj_id = EXTENT_CSUM_ID;
1857         searchkey.obj_type = TYPE_EXTENT_CSUM;
1858         searchkey.offset = dr->address;
1859 
1860         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1861         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1862             ERR("find_item returned %08x\n", Status);
1863             ExFreePool(csum);
1864             ExFreePool(bmparr);
1865             goto end;
1866         }
1867 
1868         if (Status != STATUS_NOT_FOUND) {
1869             do {
1870                 traverse_ptr next_tp;
1871 
1872                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1873                     if (tp.item->key.offset >= dr->address + dr->size)
1874                         break;
1875                     else if (tp.item->size >= sizeof(uint32_t) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= dr->address) {
1876                         uint64_t cs = max(dr->address, tp.item->key.offset);
1877                         uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
1878 
1879                         RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1880                                       tp.item->data + ((cs - tp.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
1881                                       (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
1882 
1883                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1884 
1885                         if (ce == dr->address + dr->size)
1886                             break;
1887                     }
1888                 }
1889 
1890                 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1891                     tp = next_tp;
1892                 else
1893                     break;
1894             } while (true);
1895         }
1896 
1897         lastoff = 0;
1898         runlength = RtlFindFirstRunClear(&bmp, &index);
1899 
1900         while (runlength != 0) {
1901             if (index >= bmplen)
1902                 break;
1903 
1904             if (index + runlength >= bmplen) {
1905                 runlength = bmplen - index;
1906 
1907                 if (runlength == 0)
1908                     break;
1909             }
1910 
1911             if (index > lastoff) {
1912                 ULONG off = lastoff;
1913                 ULONG size = index - lastoff;
1914 
1915                 // handle no csum run
1916                 do {
1917                     ULONG rl;
1918 
1919                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1920                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1921                     else
1922                         rl = size;
1923 
1924                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
1925                                        c, NULL, NULL, 0, false, NormalPagePriority);
1926                     if (!NT_SUCCESS(Status)) {
1927                         ERR("read_data returned %08x\n", Status);
1928                         ExFreePool(csum);
1929                         ExFreePool(bmparr);
1930                         goto end;
1931                     }
1932 
1933                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1934                                                  NULL, newchunk, false, 0, NormalPagePriority);
1935                     if (!NT_SUCCESS(Status)) {
1936                         ERR("write_data_complete returned %08x\n", Status);
1937                         ExFreePool(csum);
1938                         ExFreePool(bmparr);
1939                         goto end;
1940                     }
1941 
1942                     size -= rl;
1943                     off += rl;
1944                 } while (size > 0);
1945             }
1946 
1947             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1948             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1949 
1950             // handle csum run
1951             do {
1952                 ULONG rl;
1953 
1954                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1955                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1956                 else
1957                     rl = runlength;
1958 
1959                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], false, data,
1960                                    c, NULL, NULL, 0, false, NormalPagePriority);
1961                 if (!NT_SUCCESS(Status)) {
1962                     ERR("read_data returned %08x\n", Status);
1963                     ExFreePool(csum);
1964                     ExFreePool(bmparr);
1965                     goto end;
1966                 }
1967 
1968                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1969                                              NULL, newchunk, false, 0, NormalPagePriority);
1970                 if (!NT_SUCCESS(Status)) {
1971                     ERR("write_data_complete returned %08x\n", Status);
1972                     ExFreePool(csum);
1973                     ExFreePool(bmparr);
1974                     goto end;
1975                 }
1976 
1977                 runlength -= rl;
1978                 index += rl;
1979             } while (runlength > 0);
1980 
1981             lastoff = index;
1982             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1983         }
1984 
1985         ExFreePool(csum);
1986         ExFreePool(bmparr);
1987 
1988         // handle final nocsum run
1989         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1990             ULONG off = lastoff;
1991             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1992 
1993             do {
1994                 ULONG rl;
1995 
1996                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1997                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1998                 else
1999                     rl = size;
2000 
2001                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, false, data,
2002                                    c, NULL, NULL, 0, false, NormalPagePriority);
2003                 if (!NT_SUCCESS(Status)) {
2004                     ERR("read_data returned %08x\n", Status);
2005                     goto end;
2006                 }
2007 
2008                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
2009                                              NULL, newchunk, false, 0, NormalPagePriority);
2010                 if (!NT_SUCCESS(Status)) {
2011                     ERR("write_data_complete returned %08x\n", Status);
2012                     goto end;
2013                 }
2014 
2015                 size -= rl;
2016                 off += rl;
2017             } while (size > 0);
2018         }
2019 
2020         le = le->Flink;
2021     }
2022 
2023     ExFreePool(data);
2024     data = NULL;
2025 
2026     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2027     if (!NT_SUCCESS(Status)) {
2028         ERR("write_metadata_items returned %08x\n", Status);
2029         goto end;
2030     }
2031 
2032     le = items.Flink;
2033     while (le != &items) {
2034         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2035 
2036         Status = add_data_reloc_extent_item(Vcb, dr);
2037         if (!NT_SUCCESS(Status)) {
2038             ERR("add_data_reloc_extent_item returned %08x\n", Status);
2039             goto end;
2040         }
2041 
2042         le = le->Flink;
2043     }
2044 
2045     le = c->changed_extents.Flink;
2046     while (le != &c->changed_extents) {
2047         LIST_ENTRY *le2, *le3;
2048         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2049 
2050         le3 = le->Flink;
2051 
2052         le2 = items.Flink;
2053         while (le2 != &items) {
2054             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2055 
2056             if (ce->address == dr->address) {
2057                 ce->address = dr->new_address;
2058                 RemoveEntryList(&ce->list_entry);
2059                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2060                 break;
2061             }
2062 
2063             le2 = le2->Flink;
2064         }
2065 
2066         le = le3;
2067     }
2068 
2069     Status = STATUS_SUCCESS;
2070 
2071     Vcb->need_write = true;
2072 
2073 end:
2074     if (NT_SUCCESS(Status)) {
2075         // update extents in cache inodes before we flush
2076         le = Vcb->chunks.Flink;
2077         while (le != &Vcb->chunks) {
2078             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2079 
2080             if (c2->cache) {
2081                 LIST_ENTRY* le2;
2082 
2083                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2084 
2085                 le2 = c2->cache->extents.Flink;
2086                 while (le2 != &c2->cache->extents) {
2087                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2088 
2089                     if (!ext->ignore) {
2090                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2091                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2092 
2093                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2094                                 LIST_ENTRY* le3 = items.Flink;
2095                                 while (le3 != &items) {
2096                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2097 
2098                                     if (ed2->address == dr->address) {
2099                                         ed2->address = dr->new_address;
2100                                         break;
2101                                     }
2102 
2103                                     le3 = le3->Flink;
2104                                 }
2105                             }
2106                         }
2107                     }
2108 
2109                     le2 = le2->Flink;
2110                 }
2111 
2112                 ExReleaseResourceLite(c2->cache->Header.Resource);
2113             }
2114 
2115             le = le->Flink;
2116         }
2117 
2118         Status = do_write(Vcb, NULL);
2119         if (!NT_SUCCESS(Status))
2120             ERR("do_write returned %08x\n", Status);
2121     }
2122 
2123     if (NT_SUCCESS(Status)) {
2124         clear_rollback(&rollback);
2125 
2126         // update open FCBs
2127         // FIXME - speed this up(?)
2128 
2129         le = Vcb->all_fcbs.Flink;
2130         while (le != &Vcb->all_fcbs) {
2131             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2132             LIST_ENTRY* le2;
2133 
2134             ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2135 
2136             le2 = fcb->extents.Flink;
2137             while (le2 != &fcb->extents) {
2138                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2139 
2140                 if (!ext->ignore) {
2141                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2142                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2143 
2144                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2145                             LIST_ENTRY* le3 = items.Flink;
2146                             while (le3 != &items) {
2147                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2148 
2149                                 if (ed2->address == dr->address) {
2150                                     ed2->address = dr->new_address;
2151                                     break;
2152                                 }
2153 
2154                                 le3 = le3->Flink;
2155                             }
2156                         }
2157                     }
2158                 }
2159 
2160                 le2 = le2->Flink;
2161             }
2162 
2163             ExReleaseResourceLite(fcb->Header.Resource);
2164 
2165             le = le->Flink;
2166         }
2167     } else
2168         do_rollback(Vcb, &rollback);
2169 
2170     free_trees(Vcb);
2171 
2172     ExReleaseResourceLite(&Vcb->tree_lock);
2173 
2174     if (data)
2175         ExFreePool(data);
2176 
2177     while (!IsListEmpty(&items)) {
2178         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2179 
2180         while (!IsListEmpty(&dr->refs)) {
2181             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2182 
2183             ExFreePool(ref);
2184         }
2185 
2186         ExFreePool(dr);
2187     }
2188 
2189     while (!IsListEmpty(&metadata_items)) {
2190         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2191 
2192         while (!IsListEmpty(&mr->refs)) {
2193             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2194 
2195             ExFreePool(ref);
2196         }
2197 
2198         if (mr->data)
2199             ExFreePool(mr->data);
2200 
2201         ExFreePool(mr);
2202     }
2203 
2204     return Status;
2205 }
2206 
2207 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2208     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2209         return BLOCK_FLAG_RAID0;
2210     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2211         return BLOCK_FLAG_RAID1;
2212     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2213         return BLOCK_FLAG_DUPLICATE;
2214     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2215         return BLOCK_FLAG_RAID10;
2216     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2217         return BLOCK_FLAG_RAID5;
2218     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2219         return BLOCK_FLAG_RAID6;
2220     else
2221         return BLOCK_FLAG_SINGLE;
2222 }
2223 
2224 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2225     btrfs_balance_opts* opts;
2226 
2227     opts = &Vcb->balance.opts[sort];
2228 
2229     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2230         return false;
2231 
2232     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2233         uint64_t type = get_chunk_dup_type(c);
2234 
2235         if (!(type & opts->profiles))
2236             return false;
2237     }
2238 
2239     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2240         uint16_t i;
2241         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2242         bool b = false;
2243 
2244         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2245             if (cis[i].dev_id == opts->devid) {
2246                 b = true;
2247                 break;
2248             }
2249         }
2250 
2251         if (!b)
2252             return false;
2253     }
2254 
2255     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2256         uint16_t i, factor;
2257         uint64_t physsize;
2258         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2259         bool b = false;
2260 
2261         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2262             factor = c->chunk_item->num_stripes;
2263         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2264             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2265         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2266             factor = c->chunk_item->num_stripes - 1;
2267         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2268             factor = c->chunk_item->num_stripes - 2;
2269         else // SINGLE, DUPLICATE, RAID1
2270             factor = 1;
2271 
2272         physsize = c->chunk_item->size / factor;
2273 
2274         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2275             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2276                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2277                 b = true;
2278                 break;
2279             }
2280         }
2281 
2282         if (!b)
2283             return false;
2284     }
2285 
2286     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2287         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2288             return false;
2289     }
2290 
2291     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2292         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2293             return false;
2294     }
2295 
2296     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2297         uint64_t usage = c->used * 100 / c->chunk_item->size;
2298 
2299         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2300         if (c->used > 0 && usage == 0)
2301             usage = 1;
2302 
2303         if (usage < opts->usage_start || usage > opts->usage_end)
2304             return false;
2305     }
2306 
2307     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2308         uint64_t type = get_chunk_dup_type(c);
2309 
2310         if (type == opts->convert)
2311             return false;
2312     }
2313 
2314     return true;
2315 }
2316 
2317 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2318     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2319         args->profiles = opts->profiles;
2320         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2321     }
2322 
2323     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2324         if (args->usage_start == 0) {
2325             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2326             args->usage_start = opts->usage_start;
2327             args->usage_end = opts->usage_end;
2328         } else {
2329             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2330             args->usage = opts->usage_end;
2331         }
2332     }
2333 
2334     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2335         args->devid = opts->devid;
2336         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2337     }
2338 
2339     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2340         args->drange_start = opts->drange_start;
2341         args->drange_end = opts->drange_end;
2342         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2343     }
2344 
2345     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2346         args->vrange_start = opts->vrange_start;
2347         args->vrange_end = opts->vrange_end;
2348         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2349     }
2350 
2351     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2352         args->convert = opts->convert;
2353         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2354 
2355         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2356             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2357     }
2358 
2359     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2360         if (args->limit_start == 0) {
2361             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2362             args->limit_start = (uint32_t)opts->limit_start;
2363             args->limit_end = (uint32_t)opts->limit_end;
2364         } else {
2365             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2366             args->limit = opts->limit_end;
2367         }
2368     }
2369 
2370     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2371         args->stripes_start = opts->stripes_start;
2372         args->stripes_end = opts->stripes_end;
2373         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2374     }
2375 }
2376 
2377 static NTSTATUS add_balance_item(device_extension* Vcb) {
2378     KEY searchkey;
2379     traverse_ptr tp;
2380     NTSTATUS Status;
2381     BALANCE_ITEM* bi;
2382 
2383     searchkey.obj_id = BALANCE_ITEM_ID;
2384     searchkey.obj_type = TYPE_TEMP_ITEM;
2385     searchkey.offset = 0;
2386 
2387     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2388 
2389     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2390     if (!NT_SUCCESS(Status)) {
2391         ERR("find_item returned %08x\n", Status);
2392         goto end;
2393     }
2394 
2395     if (!keycmp(tp.item->key, searchkey)) {
2396         Status = delete_tree_item(Vcb, &tp);
2397         if (!NT_SUCCESS(Status)) {
2398             ERR("delete_tree_item returned %08x\n", Status);
2399             goto end;
2400         }
2401     }
2402 
2403     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2404     if (!bi) {
2405         ERR("out of memory\n");
2406         Status = STATUS_INSUFFICIENT_RESOURCES;
2407         goto end;
2408     }
2409 
2410     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2411 
2412     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2413         bi->flags |= BALANCE_FLAGS_DATA;
2414         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2415     }
2416 
2417     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2418         bi->flags |= BALANCE_FLAGS_METADATA;
2419         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2420     }
2421 
2422     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2423         bi->flags |= BALANCE_FLAGS_SYSTEM;
2424         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2425     }
2426 
2427     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2428     if (!NT_SUCCESS(Status)) {
2429         ERR("insert_tree_item returned %08x\n", Status);
2430         ExFreePool(bi);
2431         goto end;
2432     }
2433 
2434     Status = STATUS_SUCCESS;
2435 
2436 end:
2437     if (NT_SUCCESS(Status)) {
2438         Status = do_write(Vcb, NULL);
2439         if (!NT_SUCCESS(Status))
2440             ERR("do_write returned %08x\n", Status);
2441     }
2442 
2443     free_trees(Vcb);
2444 
2445     ExReleaseResourceLite(&Vcb->tree_lock);
2446 
2447     return Status;
2448 }
2449 
2450 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2451     KEY searchkey;
2452     traverse_ptr tp;
2453     NTSTATUS Status;
2454 
2455     searchkey.obj_id = BALANCE_ITEM_ID;
2456     searchkey.obj_type = TYPE_TEMP_ITEM;
2457     searchkey.offset = 0;
2458 
2459     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2460 
2461     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2462     if (!NT_SUCCESS(Status)) {
2463         ERR("find_item returned %08x\n", Status);
2464         goto end;
2465     }
2466 
2467     if (!keycmp(tp.item->key, searchkey)) {
2468         Status = delete_tree_item(Vcb, &tp);
2469         if (!NT_SUCCESS(Status)) {
2470             ERR("delete_tree_item returned %08x\n", Status);
2471             goto end;
2472         }
2473 
2474         Status = do_write(Vcb, NULL);
2475         if (!NT_SUCCESS(Status)) {
2476             ERR("do_write returned %08x\n", Status);
2477             goto end;
2478         }
2479 
2480         free_trees(Vcb);
2481     }
2482 
2483     Status = STATUS_SUCCESS;
2484 
2485 end:
2486     ExReleaseResourceLite(&Vcb->tree_lock);
2487 
2488     return Status;
2489 }
2490 
2491 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2492     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2493 
2494     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2495         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2496         opts->profiles = args->profiles;
2497     }
2498 
2499     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2500         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2501 
2502         opts->usage_start = 0;
2503         opts->usage_end = (uint8_t)args->usage;
2504     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2505         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2506 
2507         opts->usage_start = (uint8_t)args->usage_start;
2508         opts->usage_end = (uint8_t)args->usage_end;
2509     }
2510 
2511     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2512         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2513         opts->devid = args->devid;
2514     }
2515 
2516     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2517         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2518         opts->drange_start = args->drange_start;
2519         opts->drange_end = args->drange_end;
2520     }
2521 
2522     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2523         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2524         opts->vrange_start = args->vrange_start;
2525         opts->vrange_end = args->vrange_end;
2526     }
2527 
2528     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2529         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2530 
2531         opts->limit_start = 0;
2532         opts->limit_end = args->limit;
2533     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2534         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2535 
2536         opts->limit_start = args->limit_start;
2537         opts->limit_end = args->limit_end;
2538     }
2539 
2540     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2541         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2542 
2543         opts->stripes_start = (uint16_t)args->stripes_start;
2544         opts->stripes_end = (uint16_t)args->stripes_end;
2545     }
2546 
2547     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2548         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2549         opts->convert = args->convert;
2550 
2551         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2552             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2553     }
2554 }
2555 
2556 static NTSTATUS remove_superblocks(device* dev) {
2557     NTSTATUS Status;
2558     superblock* sb;
2559     int i = 0;
2560 
2561     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2562     if (!sb) {
2563         ERR("out of memory\n");
2564         return STATUS_INSUFFICIENT_RESOURCES;
2565     }
2566 
2567     RtlZeroMemory(sb, sizeof(superblock));
2568 
2569     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2570         Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2571 
2572         if (!NT_SUCCESS(Status)) {
2573             ExFreePool(sb);
2574             return Status;
2575         }
2576 
2577         i++;
2578     }
2579 
2580     ExFreePool(sb);
2581 
2582     return STATUS_SUCCESS;
2583 }
2584 
2585 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2586     KEY searchkey;
2587     traverse_ptr tp;
2588     NTSTATUS Status;
2589     LIST_ENTRY* le;
2590     volume_device_extension* vde;
2591 
2592     if (Vcb->need_write) {
2593         Status = do_write(Vcb, NULL);
2594 
2595         if (!NT_SUCCESS(Status))
2596             ERR("do_write returned %08x\n", Status);
2597     } else
2598         Status = STATUS_SUCCESS;
2599 
2600     free_trees(Vcb);
2601 
2602     if (!NT_SUCCESS(Status))
2603         return Status;
2604 
2605     // remove entry in chunk tree
2606 
2607     searchkey.obj_id = 1;
2608     searchkey.obj_type = TYPE_DEV_ITEM;
2609     searchkey.offset = dev->devitem.dev_id;
2610 
2611     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2612     if (!NT_SUCCESS(Status)) {
2613         ERR("find_item returned %08x\n", Status);
2614         return Status;
2615     }
2616 
2617     if (!keycmp(searchkey, tp.item->key)) {
2618         Status = delete_tree_item(Vcb, &tp);
2619 
2620         if (!NT_SUCCESS(Status)) {
2621             ERR("delete_tree_item returned %08x\n", Status);
2622             return Status;
2623         }
2624     }
2625 
2626     // remove stats entry in device tree
2627 
2628     searchkey.obj_id = 0;
2629     searchkey.obj_type = TYPE_DEV_STATS;
2630     searchkey.offset = dev->devitem.dev_id;
2631 
2632     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2633     if (!NT_SUCCESS(Status)) {
2634         ERR("find_item returned %08x\n", Status);
2635         return Status;
2636     }
2637 
2638     if (!keycmp(searchkey, tp.item->key)) {
2639         Status = delete_tree_item(Vcb, &tp);
2640 
2641         if (!NT_SUCCESS(Status)) {
2642             ERR("delete_tree_item returned %08x\n", Status);
2643             return Status;
2644         }
2645     }
2646 
2647     // update superblock
2648 
2649     Vcb->superblock.num_devices--;
2650     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2651     Vcb->devices_loaded--;
2652 
2653     RemoveEntryList(&dev->list_entry);
2654 
2655     // flush
2656 
2657     Status = do_write(Vcb, NULL);
2658     if (!NT_SUCCESS(Status))
2659         ERR("do_write returned %08x\n", Status);
2660 
2661     free_trees(Vcb);
2662 
2663     if (!NT_SUCCESS(Status))
2664         return Status;
2665 
2666     if (!dev->readonly && dev->devobj) {
2667         Status = remove_superblocks(dev);
2668         if (!NT_SUCCESS(Status))
2669             WARN("remove_superblocks returned %08x\n", Status);
2670     }
2671 
2672     // remove entry in volume list
2673 
2674     vde = Vcb->vde;
2675 
2676     if (dev->devobj) {
2677         pdo_device_extension* pdode = vde->pdode;
2678 
2679         ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2680 
2681         le = pdode->children.Flink;
2682         while (le != &pdode->children) {
2683             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2684 
2685             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2686                 PFILE_OBJECT FileObject;
2687                 PDEVICE_OBJECT mountmgr;
2688                 UNICODE_STRING mmdevpath;
2689 
2690                 pdode->children_loaded--;
2691 
2692                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2693                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2694                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2695                     if (!NT_SUCCESS(Status))
2696                         ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2697                     else {
2698                         MOUNTDEV_NAME mdn;
2699 
2700                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2701                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2702                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2703                         else {
2704                             MOUNTDEV_NAME* mdn2;
2705                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2706 
2707                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2708                             if (!mdn2)
2709                                 ERR("out of memory\n");
2710                             else {
2711                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2712                                 if (!NT_SUCCESS(Status))
2713                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2714                                 else {
2715                                     UNICODE_STRING name;
2716 
2717                                     name.Buffer = mdn2->Name;
2718                                     name.Length = name.MaximumLength = mdn2->NameLength;
2719 
2720                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2721                                     if (!NT_SUCCESS(Status))
2722                                         WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2723                                 }
2724 
2725                                 ExFreePool(mdn2);
2726                             }
2727                         }
2728 
2729                         ObDereferenceObject(FileObject);
2730                     }
2731                 }
2732 
2733                 ExFreePool(vc->pnp_name.Buffer);
2734                 RemoveEntryList(&vc->list_entry);
2735                 ExFreePool(vc);
2736 
2737                 ObDereferenceObject(vc->fileobj);
2738 
2739                 break;
2740             }
2741 
2742             le = le->Flink;
2743         }
2744 
2745         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2746             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2747 
2748             le = pdode->children.Flink;
2749             while (le != &pdode->children) {
2750                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2751 
2752                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2753                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2754                     break;
2755                 }
2756 
2757                 le = le->Flink;
2758             }
2759         }
2760 
2761         pdode->num_children = Vcb->superblock.num_devices;
2762 
2763         ExReleaseResourceLite(&pdode->child_lock);
2764 
2765         // free dev
2766 
2767         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2768             trim_whole_device(dev);
2769     }
2770 
2771     while (!IsListEmpty(&dev->space)) {
2772         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2773         space* s = CONTAINING_RECORD(le2, space, list_entry);
2774 
2775         ExFreePool(s);
2776     }
2777 
2778     ExFreePool(dev);
2779 
2780     if (Vcb->trim) {
2781         Vcb->trim = false;
2782 
2783         le = Vcb->devices.Flink;
2784         while (le != &Vcb->devices) {
2785             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2786 
2787             if (dev2->trim) {
2788                 Vcb->trim = true;
2789                 break;
2790             }
2791 
2792             le = le->Flink;
2793         }
2794     }
2795 
2796     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2797 
2798     return STATUS_SUCCESS;
2799 }
2800 
2801 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2802     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2803     DEVICE_DATA_SET_RANGE* ranges;
2804     ULONG datalen, i;
2805     KEY searchkey;
2806     traverse_ptr tp;
2807     NTSTATUS Status;
2808     bool b;
2809     uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2810     LIST_ENTRY* le;
2811 
2812     dev->num_trim_entries = 0;
2813 
2814     searchkey.obj_id = dev->devitem.dev_id;
2815     searchkey.obj_type = TYPE_DEV_EXTENT;
2816     searchkey.offset = 0;
2817 
2818     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2819     if (!NT_SUCCESS(Status)) {
2820         ERR("find_item returned %08x\n", Status);
2821         return;
2822     }
2823 
2824     do {
2825         traverse_ptr next_tp;
2826 
2827         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2828             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2829                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2830 
2831                 if (tp.item->key.offset > lastoff)
2832                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2833 
2834                 lastoff = tp.item->key.offset + de->length;
2835             } else {
2836                 ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2837                 return;
2838             }
2839         }
2840 
2841         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2842 
2843         if (b) {
2844             tp = next_tp;
2845             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2846                 break;
2847         }
2848     } while (b);
2849 
2850     if (lastoff < dev->devitem.num_bytes)
2851         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2852 
2853     if (dev->num_trim_entries == 0)
2854         return;
2855 
2856     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2857 
2858     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2859     if (!dmdsa) {
2860         ERR("out of memory\n");
2861         goto end;
2862     }
2863 
2864     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2865     dmdsa->Action = DeviceDsmAction_Trim;
2866     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2867     dmdsa->ParameterBlockOffset = 0;
2868     dmdsa->ParameterBlockLength = 0;
2869     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2870     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2871 
2872     ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2873 
2874     i = 0;
2875     le = dev->trim_list.Flink;
2876     while (le != &dev->trim_list) {
2877         space* s = CONTAINING_RECORD(le, space, list_entry);
2878 
2879         ranges[i].StartingOffset = s->address;
2880         ranges[i].LengthInBytes = s->size;
2881         i++;
2882 
2883         le = le->Flink;
2884     }
2885 
2886     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2887     if (!NT_SUCCESS(Status))
2888         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2889 
2890     ExFreePool(dmdsa);
2891 
2892 end:
2893     while (!IsListEmpty(&dev->trim_list)) {
2894         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2895         ExFreePool(s);
2896     }
2897 
2898     dev->num_trim_entries = 0;
2899 }
2900 
2901 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2902     NTSTATUS Status;
2903     bool changed;
2904     LIST_ENTRY* le;
2905     chunk* rc;
2906 
2907     // FIXME - allow with metadata chunks?
2908 
2909     while (true) {
2910         rc = NULL;
2911 
2912         ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2913 
2914         ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2915 
2916         // choose the least-used chunk we haven't looked at yet
2917         le = Vcb->chunks.Flink;
2918         while (le != &Vcb->chunks) {
2919             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2920 
2921             // FIXME - skip full-size chunks over e.g. 90% full?
2922             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2923                 rc = c;
2924 
2925             le = le->Flink;
2926         }
2927 
2928         ExReleaseResourceLite(&Vcb->chunk_lock);
2929 
2930         if (!rc) {
2931             ExReleaseResourceLite(&Vcb->tree_lock);
2932             break;
2933         }
2934 
2935         if (rc->list_entry_balance.Flink) {
2936             RemoveEntryList(&rc->list_entry_balance);
2937             Vcb->balance.chunks_left--;
2938         }
2939 
2940         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2941         rc->reloc = true;
2942 
2943         ExReleaseResourceLite(&Vcb->tree_lock);
2944 
2945         do {
2946             changed = false;
2947 
2948             Status = balance_data_chunk(Vcb, rc, &changed);
2949             if (!NT_SUCCESS(Status)) {
2950                 ERR("balance_data_chunk returned %08x\n", Status);
2951                 Vcb->balance.status = Status;
2952                 rc->list_entry_balance.Flink = NULL;
2953                 rc->reloc = false;
2954                 return Status;
2955             }
2956 
2957             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2958 
2959             if (Vcb->readonly)
2960                 Vcb->balance.stopping = true;
2961 
2962             if (Vcb->balance.stopping)
2963                 return STATUS_SUCCESS;
2964         } while (changed);
2965 
2966         rc->list_entry_balance.Flink = NULL;
2967 
2968         rc->changed = true;
2969         rc->space_changed = true;
2970         rc->balance_num = Vcb->balance.balance_num;
2971 
2972         Status = do_write(Vcb, NULL);
2973         if (!NT_SUCCESS(Status)) {
2974             ERR("do_write returned %08x\n", Status);
2975             return Status;
2976         }
2977 
2978         free_trees(Vcb);
2979     }
2980 
2981     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2982 
2983     Status = alloc_chunk(Vcb, flags, &rc, true);
2984 
2985     ExReleaseResourceLite(&Vcb->chunk_lock);
2986 
2987     if (NT_SUCCESS(Status)) {
2988         *newchunk = rc;
2989         return Status;
2990     } else {
2991         ERR("alloc_chunk returned %08x\n", Status);
2992         return Status;
2993     }
2994 }
2995 
2996 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2997     LIST_ENTRY* le;
2998 
2999     while (!IsListEmpty(&dev->space)) {
3000         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3001 
3002         ExFreePool(s);
3003     }
3004 
3005     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3006 
3007     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3008 
3009     le = Vcb->chunks.Flink;
3010     while (le != &Vcb->chunks) {
3011         uint16_t n;
3012         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3013         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3014 
3015         for (n = 0; n < c->chunk_item->num_stripes; n++) {
3016             uint64_t stripe_size = 0;
3017 
3018             if (cis[n].dev_id == dev->devitem.dev_id) {
3019                 if (stripe_size == 0) {
3020                     uint16_t factor;
3021 
3022                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3023                         factor = c->chunk_item->num_stripes;
3024                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3025                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3026                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3027                         factor = c->chunk_item->num_stripes - 1;
3028                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3029                         factor = c->chunk_item->num_stripes - 2;
3030                     else // SINGLE, DUP, RAID1
3031                         factor = 1;
3032 
3033                     stripe_size = c->chunk_item->size / factor;
3034                 }
3035 
3036                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3037             }
3038         }
3039 
3040         le = le->Flink;
3041     }
3042 
3043     return STATUS_SUCCESS;
3044 }
3045 
3046 _Function_class_(KSTART_ROUTINE)
3047 void __stdcall balance_thread(void* context) {
3048     device_extension* Vcb = (device_extension*)context;
3049     LIST_ENTRY chunks;
3050     LIST_ENTRY* le;
3051     uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3052     uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3053     NTSTATUS Status;
3054 
3055     Vcb->balance.balance_num++;
3056 
3057     Vcb->balance.stopping = false;
3058     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3059 
3060     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3061         old_data_flags = Vcb->data_flags;
3062         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3063 
3064         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3065     }
3066 
3067     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3068         old_metadata_flags = Vcb->metadata_flags;
3069         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3070     }
3071 
3072     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3073         old_system_flags = Vcb->system_flags;
3074         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3075     }
3076 
3077     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3078         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3079             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3080         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3081             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3082     }
3083 
3084     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3085     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3086 
3087     InitializeListHead(&chunks);
3088 
3089     // FIXME - what are we supposed to do with limit_start?
3090 
3091     if (!Vcb->readonly) {
3092         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3093             Status = add_balance_item(Vcb);
3094             if (!NT_SUCCESS(Status)) {
3095                 ERR("add_balance_item returned %08x\n", Status);
3096                 Vcb->balance.status = Status;
3097                 goto end;
3098             }
3099         } else {
3100             if (Vcb->need_write) {
3101                 Status = do_write(Vcb, NULL);
3102 
3103                 free_trees(Vcb);
3104 
3105                 if (!NT_SUCCESS(Status)) {
3106                     ERR("do_write returned %08x\n", Status);
3107                     Vcb->balance.status = Status;
3108                     goto end;
3109                 }
3110             }
3111         }
3112     }
3113 
3114     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3115 
3116     if (Vcb->balance.stopping)
3117         goto end;
3118 
3119     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3120 
3121     le = Vcb->chunks.Flink;
3122     while (le != &Vcb->chunks) {
3123         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3124         uint8_t sort;
3125 
3126         acquire_chunk_lock(c, Vcb);
3127 
3128         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3129             sort = BALANCE_OPTS_DATA;
3130         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3131             sort = BALANCE_OPTS_METADATA;
3132         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3133             sort = BALANCE_OPTS_SYSTEM;
3134         else {
3135             ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3136             release_chunk_lock(c, Vcb);
3137             break;
3138         }
3139 
3140         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3141             should_balance_chunk(Vcb, sort, c)) {
3142             InsertTailList(&chunks, &c->list_entry_balance);
3143 
3144             num_chunks[sort]++;
3145             Vcb->balance.total_chunks++;
3146             Vcb->balance.chunks_left++;
3147         } else if (sort == BALANCE_OPTS_METADATA)
3148             okay_metadata_chunks++;
3149         else if (sort == BALANCE_OPTS_DATA)
3150             okay_data_chunks++;
3151         else if (sort == BALANCE_OPTS_SYSTEM)
3152             okay_system_chunks++;
3153 
3154         if (!c->cache_loaded) {
3155             Status = load_cache_chunk(Vcb, c, NULL);
3156 
3157             if (!NT_SUCCESS(Status)) {
3158                 ERR("load_cache_chunk returned %08x\n", Status);
3159                 Vcb->balance.status = Status;
3160                 release_chunk_lock(c, Vcb);
3161                 ExReleaseResourceLite(&Vcb->chunk_lock);
3162                 goto end;
3163             }
3164         }
3165 
3166         release_chunk_lock(c, Vcb);
3167 
3168         le = le->Flink;
3169     }
3170 
3171     ExReleaseResourceLite(&Vcb->chunk_lock);
3172 
3173     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3174     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3175         bool consolidated = false;
3176         chunk* c;
3177 
3178         if (okay_metadata_chunks == 0) {
3179             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3180 
3181             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3182             if (NT_SUCCESS(Status))
3183                 c->balance_num = Vcb->balance.balance_num;
3184             else if (Status != STATUS_DISK_FULL || consolidated) {
3185                 ERR("alloc_chunk returned %08x\n", Status);
3186                 ExReleaseResourceLite(&Vcb->chunk_lock);
3187                 Vcb->balance.status = Status;
3188                 goto end;
3189             }
3190 
3191             ExReleaseResourceLite(&Vcb->chunk_lock);
3192 
3193             if (Status == STATUS_DISK_FULL) {
3194                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3195                 if (!NT_SUCCESS(Status)) {
3196                     ERR("try_consolidation returned %08x\n", Status);
3197                     Vcb->balance.status = Status;
3198                     goto end;
3199                 } else
3200                     c->balance_num = Vcb->balance.balance_num;
3201 
3202                 consolidated = true;
3203 
3204                 if (Vcb->balance.stopping)
3205                     goto end;
3206             }
3207         }
3208 
3209         if (okay_data_chunks == 0) {
3210             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3211 
3212             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3213             if (NT_SUCCESS(Status))
3214                 c->balance_num = Vcb->balance.balance_num;
3215             else if (Status != STATUS_DISK_FULL || consolidated) {
3216                 ERR("alloc_chunk returned %08x\n", Status);
3217                 ExReleaseResourceLite(&Vcb->chunk_lock);
3218                 Vcb->balance.status = Status;
3219                 goto end;
3220             }
3221 
3222             ExReleaseResourceLite(&Vcb->chunk_lock);
3223 
3224             if (Status == STATUS_DISK_FULL) {
3225                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3226                 if (!NT_SUCCESS(Status)) {
3227                     ERR("try_consolidation returned %08x\n", Status);
3228                     Vcb->balance.status = Status;
3229                     goto end;
3230                 } else
3231                     c->balance_num = Vcb->balance.balance_num;
3232 
3233                 consolidated = true;
3234 
3235                 if (Vcb->balance.stopping)
3236                     goto end;
3237             }
3238         }
3239 
3240         if (okay_system_chunks == 0) {
3241             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3242 
3243             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3244             if (NT_SUCCESS(Status))
3245                 c->balance_num = Vcb->balance.balance_num;
3246             else if (Status != STATUS_DISK_FULL || consolidated) {
3247                 ERR("alloc_chunk returned %08x\n", Status);
3248                 ExReleaseResourceLite(&Vcb->chunk_lock);
3249                 Vcb->balance.status = Status;
3250                 goto end;
3251             }
3252 
3253             ExReleaseResourceLite(&Vcb->chunk_lock);
3254 
3255             if (Status == STATUS_DISK_FULL) {
3256                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3257                 if (!NT_SUCCESS(Status)) {
3258                     ERR("try_consolidation returned %08x\n", Status);
3259                     Vcb->balance.status = Status;
3260                     goto end;
3261                 } else
3262                     c->balance_num = Vcb->balance.balance_num;
3263 
3264                 consolidated = true;
3265 
3266                 if (Vcb->balance.stopping)
3267                     goto end;
3268             }
3269         }
3270     }
3271 
3272     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3273 
3274     le = chunks.Flink;
3275     while (le != &chunks) {
3276         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3277 
3278         c->reloc = true;
3279 
3280         le = le->Flink;
3281     }
3282 
3283     ExReleaseResourceLite(&Vcb->chunk_lock);
3284 
3285     // do data chunks before metadata
3286     le = chunks.Flink;
3287     while (le != &chunks) {
3288         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3289         LIST_ENTRY* le2 = le->Flink;
3290 
3291         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3292             bool changed;
3293 
3294             do {
3295                 changed = false;
3296 
3297                 Status = balance_data_chunk(Vcb, c, &changed);
3298                 if (!NT_SUCCESS(Status)) {
3299                     ERR("balance_data_chunk returned %08x\n", Status);
3300                     Vcb->balance.status = Status;
3301                     goto end;
3302                 }
3303 
3304                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3305 
3306                 if (Vcb->readonly)
3307                     Vcb->balance.stopping = true;
3308 
3309                 if (Vcb->balance.stopping)
3310                     break;
3311             } while (changed);
3312 
3313             c->changed = true;
3314             c->space_changed = true;
3315         }
3316 
3317         if (Vcb->balance.stopping)
3318             goto end;
3319 
3320         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3321             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3322             RemoveEntryList(&c->list_entry_balance);
3323             c->list_entry_balance.Flink = NULL;
3324 
3325             Vcb->balance.chunks_left--;
3326         }
3327 
3328         le = le2;
3329     }
3330 
3331     // do metadata chunks
3332     while (!IsListEmpty(&chunks)) {
3333         chunk* c;
3334         bool changed;
3335 
3336         le = RemoveHeadList(&chunks);
3337         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3338 
3339         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3340             do {
3341                 Status = balance_metadata_chunk(Vcb, c, &changed);
3342                 if (!NT_SUCCESS(Status)) {
3343                     ERR("balance_metadata_chunk returned %08x\n", Status);
3344                     Vcb->balance.status = Status;
3345                     goto end;
3346                 }
3347 
3348                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3349 
3350                 if (Vcb->readonly)
3351                     Vcb->balance.stopping = true;
3352 
3353                 if (Vcb->balance.stopping)
3354                     break;
3355             } while (changed);
3356 
3357             c->changed = true;
3358             c->space_changed = true;
3359         }
3360 
3361         if (Vcb->balance.stopping)
3362             break;
3363 
3364         c->list_entry_balance.Flink = NULL;
3365 
3366         Vcb->balance.chunks_left--;
3367     }
3368 
3369 end:
3370     if (!Vcb->readonly) {
3371         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3372             le = chunks.Flink;
3373             while (le != &chunks) {
3374                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3375                 c->reloc = false;
3376 
3377                 le = le->Flink;
3378                 c->list_entry_balance.Flink = NULL;
3379             }
3380 
3381             if (old_data_flags != 0)
3382                 Vcb->data_flags = old_data_flags;
3383 
3384             if (old_metadata_flags != 0)
3385                 Vcb->metadata_flags = old_metadata_flags;
3386 
3387             if (old_system_flags != 0)
3388                 Vcb->system_flags = old_system_flags;
3389         }
3390 
3391         if (Vcb->balance.removing) {
3392             device* dev = NULL;
3393 
3394             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3395 
3396             le = Vcb->devices.Flink;
3397             while (le != &Vcb->devices) {
3398                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3399 
3400                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3401                     dev = dev2;
3402                     break;
3403                 }
3404 
3405                 le = le->Flink;
3406             }
3407 
3408             if (dev) {
3409                 if (Vcb->balance.chunks_left == 0) {
3410                     Status = finish_removing_device(Vcb, dev);
3411 
3412                     if (!NT_SUCCESS(Status)) {
3413                         ERR("finish_removing_device returned %08x\n", Status);
3414                         dev->reloc = false;
3415                     }
3416                 } else
3417                     dev->reloc = false;
3418             }
3419 
3420             ExReleaseResourceLite(&Vcb->tree_lock);
3421         } else if (Vcb->balance.shrinking) {
3422             device* dev = NULL;
3423 
3424             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3425 
3426             le = Vcb->devices.Flink;
3427             while (le != &Vcb->devices) {
3428                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3429 
3430                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3431                     dev = dev2;
3432                     break;
3433                 }
3434 
3435                 le = le->Flink;
3436             }
3437 
3438             if (!dev) {
3439                 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3440                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3441             }
3442 
3443             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3444                 if (dev) {
3445                     Status = regenerate_space_list(Vcb, dev);
3446                     if (!NT_SUCCESS(Status))
3447                         WARN("regenerate_space_list returned %08x\n", Status);
3448                 }
3449             } else {
3450                 uint64_t old_size;
3451 
3452                 old_size = dev->devitem.num_bytes;
3453                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3454 
3455                 Status = update_dev_item(Vcb, dev, NULL);
3456                 if (!NT_SUCCESS(Status)) {
3457                     ERR("update_dev_item returned %08x\n", Status);
3458                     dev->devitem.num_bytes = old_size;
3459                     Vcb->balance.status = Status;
3460 
3461                     Status = regenerate_space_list(Vcb, dev);
3462                     if (!NT_SUCCESS(Status))
3463                         WARN("regenerate_space_list returned %08x\n", Status);
3464                 } else {
3465                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3466 
3467                     Status = do_write(Vcb, NULL);
3468                     if (!NT_SUCCESS(Status))
3469                         ERR("do_write returned %08x\n", Status);
3470 
3471                     free_trees(Vcb);
3472                 }
3473             }
3474 
3475             ExReleaseResourceLite(&Vcb->tree_lock);
3476 
3477             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3478                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3479         } else {
3480             Status = remove_balance_item(Vcb);
3481             if (!NT_SUCCESS(Status)) {
3482                 ERR("remove_balance_item returned %08x\n", Status);
3483                 goto end;
3484             }
3485         }
3486 
3487         if (Vcb->trim && !Vcb->options.no_trim) {
3488             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3489 
3490             le = Vcb->devices.Flink;
3491             while (le != &Vcb->devices) {
3492                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3493 
3494                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3495                     trim_unalloc_space(Vcb, dev2);
3496 
3497                 le = le->Flink;
3498             }
3499 
3500             ExReleaseResourceLite(&Vcb->tree_lock);
3501         }
3502     }
3503 
3504     ZwClose(Vcb->balance.thread);
3505     Vcb->balance.thread = NULL;
3506 
3507     KeSetEvent(&Vcb->balance.finished, 0, false);
3508 }
3509 
3510 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3511     NTSTATUS Status;
3512     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3513     OBJECT_ATTRIBUTES oa;
3514     uint8_t i;
3515 
3516     if (length < sizeof(btrfs_start_balance) || !data)
3517         return STATUS_INVALID_PARAMETER;
3518 
3519     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3520         return STATUS_PRIVILEGE_NOT_HELD;
3521 
3522     if (Vcb->locked) {
3523         WARN("cannot start balance while locked\n");
3524         return STATUS_DEVICE_NOT_READY;
3525     }
3526 
3527     if (Vcb->scrub.thread) {
3528         WARN("cannot start balance while scrub running\n");
3529         return STATUS_DEVICE_NOT_READY;
3530     }
3531 
3532     if (Vcb->balance.thread) {
3533         WARN("balance already running\n");
3534         return STATUS_DEVICE_NOT_READY;
3535     }
3536 
3537     if (Vcb->readonly)
3538         return STATUS_MEDIA_WRITE_PROTECTED;
3539 
3540     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3541         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3542         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3543         return STATUS_SUCCESS;
3544 
3545     for (i = 0; i < 3; i++) {
3546         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3547             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3548                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3549                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3550 
3551                 if (bsb->opts[i].profiles == 0)
3552                     return STATUS_INVALID_PARAMETER;
3553             }
3554 
3555             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3556                 if (bsb->opts[i].devid == 0)
3557                     return STATUS_INVALID_PARAMETER;
3558             }
3559 
3560             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3561                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3562                     return STATUS_INVALID_PARAMETER;
3563             }
3564 
3565             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3566                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3567                     return STATUS_INVALID_PARAMETER;
3568             }
3569 
3570             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3571                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3572                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3573 
3574                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3575                     return STATUS_INVALID_PARAMETER;
3576             }
3577 
3578             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3579                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3580                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3581 
3582                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3583                     return STATUS_INVALID_PARAMETER;
3584             }
3585 
3586             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3587                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3588                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3589 
3590                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3591                     return STATUS_INVALID_PARAMETER;
3592             }
3593 
3594             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3595                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3596                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3597                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3598                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3599                     return STATUS_INVALID_PARAMETER;
3600             }
3601         }
3602     }
3603 
3604     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3605     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3606     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3607 
3608     Vcb->balance.paused = false;
3609     Vcb->balance.removing = false;
3610     Vcb->balance.shrinking = false;
3611     Vcb->balance.status = STATUS_SUCCESS;
3612     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3613 
3614     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3615 
3616     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3617     if (!NT_SUCCESS(Status)) {
3618         ERR("PsCreateSystemThread returned %08x\n", Status);
3619         return Status;
3620     }
3621 
3622     return STATUS_SUCCESS;
3623 }
3624 
3625 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3626     KEY searchkey;
3627     traverse_ptr tp;
3628     NTSTATUS Status;
3629     BALANCE_ITEM* bi;
3630     OBJECT_ATTRIBUTES oa;
3631     int i;
3632 
3633     searchkey.obj_id = BALANCE_ITEM_ID;
3634     searchkey.obj_type = TYPE_TEMP_ITEM;
3635     searchkey.offset = 0;
3636 
3637     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3638     if (!NT_SUCCESS(Status)) {
3639         ERR("find_item returned %08x\n", Status);
3640         return Status;
3641     }
3642 
3643     if (keycmp(tp.item->key, searchkey)) {
3644         TRACE("no balance item found\n");
3645         return STATUS_NOT_FOUND;
3646     }
3647 
3648     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3649         WARN("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3650              tp.item->size, sizeof(BALANCE_ITEM));
3651         return STATUS_INTERNAL_ERROR;
3652     }
3653 
3654     bi = (BALANCE_ITEM*)tp.item->data;
3655 
3656     if (bi->flags & BALANCE_FLAGS_DATA)
3657         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3658 
3659     if (bi->flags & BALANCE_FLAGS_METADATA)
3660         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3661 
3662     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3663         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3664 
3665     // do the heuristics that Linux driver does
3666 
3667     for (i = 0; i < 3; i++) {
3668         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3669             // if converting, don't redo chunks already done
3670 
3671             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3672                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3673 
3674             // don't balance chunks more than 90% filled - presumably these
3675             // have already been done
3676 
3677             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3678                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3679             ) {
3680                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3681                 Vcb->balance.opts[i].usage_start = 0;
3682                 Vcb->balance.opts[i].usage_end = 90;
3683             }
3684         }
3685     }
3686 
3687     if (Vcb->readonly || Vcb->options.skip_balance)
3688         Vcb->balance.paused = true;
3689     else
3690         Vcb->balance.paused = false;
3691 
3692     Vcb->balance.removing = false;
3693     Vcb->balance.shrinking = false;
3694     Vcb->balance.status = STATUS_SUCCESS;
3695     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3696 
3697     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3698 
3699     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3700     if (!NT_SUCCESS(Status)) {
3701         ERR("PsCreateSystemThread returned %08x\n", Status);
3702         return Status;
3703     }
3704 
3705     return STATUS_SUCCESS;
3706 }
3707 
3708 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3709     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3710 
3711     if (length < sizeof(btrfs_query_balance) || !data)
3712         return STATUS_INVALID_PARAMETER;
3713 
3714     if (!Vcb->balance.thread) {
3715         bqb->status = BTRFS_BALANCE_STOPPED;
3716 
3717         if (!NT_SUCCESS(Vcb->balance.status)) {
3718             bqb->status |= BTRFS_BALANCE_ERROR;
3719             bqb->error = Vcb->balance.status;
3720         }
3721 
3722         return STATUS_SUCCESS;
3723     }
3724 
3725     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3726 
3727     if (Vcb->balance.removing)
3728         bqb->status |= BTRFS_BALANCE_REMOVAL;
3729 
3730     if (Vcb->balance.shrinking)
3731         bqb->status |= BTRFS_BALANCE_SHRINKING;
3732 
3733     if (!NT_SUCCESS(Vcb->balance.status))
3734         bqb->status |= BTRFS_BALANCE_ERROR;
3735 
3736     bqb->chunks_left = Vcb->balance.chunks_left;
3737     bqb->total_chunks = Vcb->balance.total_chunks;
3738     bqb->error = Vcb->balance.status;
3739     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3740     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3741     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3742 
3743     return STATUS_SUCCESS;
3744 }
3745 
3746 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3747     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3748         return STATUS_PRIVILEGE_NOT_HELD;
3749 
3750     if (!Vcb->balance.thread)
3751         return STATUS_DEVICE_NOT_READY;
3752 
3753     if (Vcb->balance.paused)
3754         return STATUS_DEVICE_NOT_READY;
3755 
3756     Vcb->balance.paused = true;
3757     KeClearEvent(&Vcb->balance.event);
3758 
3759     return STATUS_SUCCESS;
3760 }
3761 
3762 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3763     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3764         return STATUS_PRIVILEGE_NOT_HELD;
3765 
3766     if (!Vcb->balance.thread)
3767         return STATUS_DEVICE_NOT_READY;
3768 
3769     if (!Vcb->balance.paused)
3770         return STATUS_DEVICE_NOT_READY;
3771 
3772     if (Vcb->readonly)
3773         return STATUS_MEDIA_WRITE_PROTECTED;
3774 
3775     Vcb->balance.paused = false;
3776     KeSetEvent(&Vcb->balance.event, 0, false);
3777 
3778     return STATUS_SUCCESS;
3779 }
3780 
3781 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3782     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3783         return STATUS_PRIVILEGE_NOT_HELD;
3784 
3785     if (!Vcb->balance.thread)
3786         return STATUS_DEVICE_NOT_READY;
3787 
3788     Vcb->balance.paused = false;
3789     Vcb->balance.stopping = true;
3790     Vcb->balance.status = STATUS_SUCCESS;
3791     KeSetEvent(&Vcb->balance.event, 0, false);
3792 
3793     return STATUS_SUCCESS;
3794 }
3795 
3796 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3797     uint64_t devid;
3798     LIST_ENTRY* le;
3799     device* dev = NULL;
3800     NTSTATUS Status;
3801     int i;
3802     uint64_t num_rw_devices;
3803     OBJECT_ATTRIBUTES oa;
3804 
3805     TRACE("(%p, %p, %x)\n", Vcb, data, length);
3806 
3807     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3808         return STATUS_PRIVILEGE_NOT_HELD;
3809 
3810     if (length < sizeof(uint64_t))
3811         return STATUS_INVALID_PARAMETER;
3812 
3813     devid = *(uint64_t*)data;
3814 
3815     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3816 
3817     if (Vcb->readonly) {
3818         ExReleaseResourceLite(&Vcb->tree_lock);
3819         return STATUS_MEDIA_WRITE_PROTECTED;
3820     }
3821 
3822     num_rw_devices = 0;
3823 
3824     le = Vcb->devices.Flink;
3825     while (le != &Vcb->devices) {
3826         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3827 
3828         if (dev2->devitem.dev_id == devid)
3829             dev = dev2;
3830 
3831         if (!dev2->readonly)
3832             num_rw_devices++;
3833 
3834         le = le->Flink;
3835     }
3836 
3837     if (!dev) {
3838         ExReleaseResourceLite(&Vcb->tree_lock);
3839         WARN("device %I64x not found\n", devid);
3840         return STATUS_NOT_FOUND;
3841     }
3842 
3843     if (!dev->readonly) {
3844         if (num_rw_devices == 1) {
3845             ExReleaseResourceLite(&Vcb->tree_lock);
3846             WARN("not removing last non-readonly device\n");
3847             return STATUS_INVALID_PARAMETER;
3848         }
3849 
3850         if (num_rw_devices == 4 &&
3851             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3852              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3853         ) {
3854             ExReleaseResourceLite(&Vcb->tree_lock);
3855             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3856             return STATUS_CANNOT_DELETE;
3857         }
3858 
3859         if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3860             ExReleaseResourceLite(&Vcb->tree_lock);
3861             ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3862             return STATUS_CANNOT_DELETE;
3863         }
3864 
3865         if (num_rw_devices == 2 &&
3866             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3867              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3868         ) {
3869             ExReleaseResourceLite(&Vcb->tree_lock);
3870             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3871             return STATUS_CANNOT_DELETE;
3872         }
3873     }
3874 
3875     ExReleaseResourceLite(&Vcb->tree_lock);
3876 
3877     if (Vcb->balance.thread) {
3878         WARN("balance already running\n");
3879         return STATUS_DEVICE_NOT_READY;
3880     }
3881 
3882     dev->reloc = true;
3883 
3884     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3885 
3886     for (i = 0; i < 3; i++) {
3887         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3888         Vcb->balance.opts[i].devid = devid;
3889     }
3890 
3891     Vcb->balance.paused = false;
3892     Vcb->balance.removing = true;
3893     Vcb->balance.shrinking = false;
3894     Vcb->balance.status = STATUS_SUCCESS;
3895     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3896 
3897     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3898 
3899     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3900     if (!NT_SUCCESS(Status)) {
3901         ERR("PsCreateSystemThread returned %08x\n", Status);
3902         dev->reloc = false;
3903         return Status;
3904     }
3905 
3906     return STATUS_SUCCESS;
3907 }
3908