xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 51fd824e)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
23     UINT64 address;
24     UINT64 new_address;
25     tree_header* data;
26     EXTENT_ITEM* ei;
27     tree* t;
28     BOOL system;
29     LIST_ENTRY refs;
30     LIST_ENTRY list_entry;
31 } metadata_reloc;
32 
33 typedef struct {
34     UINT8 type;
35     UINT64 hash;
36 
37     union {
38         TREE_BLOCK_REF tbr;
39         SHARED_BLOCK_REF sbr;
40     };
41 
42     metadata_reloc* parent;
43     BOOL top;
44     LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46 
47 typedef struct {
48     UINT64 address;
49     UINT64 size;
50     UINT64 new_address;
51     chunk* newchunk;
52     EXTENT_ITEM* ei;
53     LIST_ENTRY refs;
54     LIST_ENTRY list_entry;
55 } data_reloc;
56 
57 typedef struct {
58     UINT8 type;
59     UINT64 hash;
60 
61     union {
62         EXTENT_DATA_REF edr;
63         SHARED_DATA_REF sdr;
64     };
65 
66     metadata_reloc* parent;
67     LIST_ENTRY list_entry;
68 } data_reloc_ref;
69 
70 #ifndef __REACTOS__
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74 #endif
75 
76 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
77 
78 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
79                                    BOOL skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
80     NTSTATUS Status;
81     metadata_reloc* mr;
82     EXTENT_ITEM* ei;
83     UINT16 len;
84     UINT64 inline_rc;
85     UINT8* ptr;
86 
87     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
88     if (!mr) {
89         ERR("out of memory\n");
90         return STATUS_INSUFFICIENT_RESOURCES;
91     }
92 
93     mr->address = tp->item->key.obj_id;
94     mr->data = NULL;
95     mr->ei = (EXTENT_ITEM*)tp->item->data;
96     mr->system = FALSE;
97     InitializeListHead(&mr->refs);
98 
99     Status = delete_tree_item(Vcb, tp);
100     if (!NT_SUCCESS(Status)) {
101         ERR("delete_tree_item returned %08x\n", Status);
102         ExFreePool(mr);
103         return Status;
104     }
105 
106     if (!c)
107         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
108 
109     if (c) {
110         ExAcquireResourceExclusiveLite(&c->lock, TRUE);
111 
112         c->used -= Vcb->superblock.node_size;
113 
114         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
115 
116         ExReleaseResourceLite(&c->lock);
117     }
118 
119     ei = (EXTENT_ITEM*)tp->item->data;
120     inline_rc = 0;
121 
122     len = tp->item->size - sizeof(EXTENT_ITEM);
123     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
124     if (!skinny) {
125         len -= sizeof(EXTENT_ITEM2);
126         ptr += sizeof(EXTENT_ITEM2);
127     }
128 
129     while (len > 0) {
130         UINT8 secttype = *ptr;
131         UINT16 sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
132         metadata_reloc_ref* ref;
133 
134         len--;
135 
136         if (sectlen > len) {
137             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
138             return STATUS_INTERNAL_ERROR;
139         }
140 
141         if (sectlen == 0) {
142             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
143             return STATUS_INTERNAL_ERROR;
144         }
145 
146         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
147         if (!ref) {
148             ERR("out of memory\n");
149             return STATUS_INSUFFICIENT_RESOURCES;
150         }
151 
152         if (secttype == TYPE_TREE_BLOCK_REF) {
153             ref->type = TYPE_TREE_BLOCK_REF;
154             RtlCopyMemory(&ref->tbr, ptr + sizeof(UINT8), sizeof(TREE_BLOCK_REF));
155             inline_rc++;
156         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
157             ref->type = TYPE_SHARED_BLOCK_REF;
158             RtlCopyMemory(&ref->sbr, ptr + sizeof(UINT8), sizeof(SHARED_BLOCK_REF));
159             inline_rc++;
160         } else {
161             ERR("unexpected tree type %x\n", secttype);
162             ExFreePool(ref);
163             return STATUS_INTERNAL_ERROR;
164         }
165 
166         ref->parent = NULL;
167         ref->top = FALSE;
168         InsertTailList(&mr->refs, &ref->list_entry);
169 
170         len -= sectlen;
171         ptr += sizeof(UINT8) + sectlen;
172     }
173 
174     if (inline_rc < ei->refcount) { // look for non-inline entries
175         traverse_ptr tp2 = *tp, next_tp;
176 
177         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
178             tp2 = next_tp;
179 
180             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
181                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
182                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
183                     if (!ref) {
184                         ERR("out of memory\n");
185                         return STATUS_INSUFFICIENT_RESOURCES;
186                     }
187 
188                     ref->type = TYPE_TREE_BLOCK_REF;
189                     ref->tbr.offset = tp2.item->key.offset;
190                     ref->parent = NULL;
191                     ref->top = FALSE;
192                     InsertTailList(&mr->refs, &ref->list_entry);
193 
194                     Status = delete_tree_item(Vcb, &tp2);
195                     if (!NT_SUCCESS(Status)) {
196                         ERR("delete_tree_item returned %08x\n", Status);
197                         return Status;
198                     }
199                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
200                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
201                     if (!ref) {
202                         ERR("out of memory\n");
203                         return STATUS_INSUFFICIENT_RESOURCES;
204                     }
205 
206                     ref->type = TYPE_SHARED_BLOCK_REF;
207                     ref->sbr.offset = tp2.item->key.offset;
208                     ref->parent = NULL;
209                     ref->top = FALSE;
210                     InsertTailList(&mr->refs, &ref->list_entry);
211 
212                     Status = delete_tree_item(Vcb, &tp2);
213                     if (!NT_SUCCESS(Status)) {
214                         ERR("delete_tree_item returned %08x\n", Status);
215                         return Status;
216                     }
217                 }
218             } else
219                 break;
220         }
221     }
222 
223     InsertTailList(items, &mr->list_entry);
224 
225     if (mr2)
226         *mr2 = mr;
227 
228     return STATUS_SUCCESS;
229 }
230 
231 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
232                                           UINT64 address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
233     LIST_ENTRY* le;
234     KEY searchkey;
235     traverse_ptr tp;
236     BOOL skinny = FALSE;
237     NTSTATUS Status;
238 
239     le = items->Flink;
240     while (le != items) {
241         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
242 
243         if (mr->address == address) {
244             *mr2 = mr;
245             return STATUS_SUCCESS;
246         }
247 
248         le = le->Flink;
249     }
250 
251     searchkey.obj_id = address;
252     searchkey.obj_type = TYPE_METADATA_ITEM;
253     searchkey.offset = 0xffffffffffffffff;
254 
255     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
256     if (!NT_SUCCESS(Status)) {
257         ERR("find_item returned %08x\n", Status);
258         return Status;
259     }
260 
261     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
262         skinny = TRUE;
263     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
264              tp.item->size >= sizeof(EXTENT_ITEM)) {
265         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
266 
267         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
268             ERR("EXTENT_ITEM for %llx found, but tree flag not set\n", address);
269             return STATUS_INTERNAL_ERROR;
270         }
271     } else {
272         ERR("could not find valid EXTENT_ITEM for address %llx\n", address);
273         return STATUS_INTERNAL_ERROR;
274     }
275 
276     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
277     if (!NT_SUCCESS(Status)) {
278         ERR("add_metadata_reloc returned %08x\n", Status);
279         return Status;
280     }
281 
282     return STATUS_SUCCESS;
283 }
284 
285 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
286     LIST_ENTRY newlist, *le;
287 
288     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
289         return;
290 
291     // insertion sort
292 
293     InitializeListHead(&newlist);
294 
295     while (!IsListEmpty(&mr->refs)) {
296         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
297         BOOL inserted = FALSE;
298 
299         if (ref->type == TYPE_TREE_BLOCK_REF)
300             ref->hash = ref->tbr.offset;
301         else if (ref->type == TYPE_SHARED_BLOCK_REF)
302             ref->hash = ref->parent->new_address;
303 
304         le = newlist.Flink;
305         while (le != &newlist) {
306             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
307 
308             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
309                 InsertHeadList(le->Blink, &ref->list_entry);
310                 inserted = TRUE;
311                 break;
312             }
313 
314             le = le->Flink;
315         }
316 
317         if (!inserted)
318             InsertTailList(&newlist, &ref->list_entry);
319     }
320 
321     newlist.Flink->Blink = &mr->refs;
322     newlist.Blink->Flink = &mr->refs;
323     mr->refs.Flink = newlist.Flink;
324     mr->refs.Blink = newlist.Blink;
325 }
326 
327 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
328     NTSTATUS Status;
329     LIST_ENTRY* le;
330     UINT64 rc = 0;
331     UINT16 inline_len;
332     BOOL all_inline = TRUE;
333     metadata_reloc_ref* first_noninline = NULL;
334     EXTENT_ITEM* ei;
335     UINT8* ptr;
336 
337     inline_len = sizeof(EXTENT_ITEM);
338     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
339         inline_len += sizeof(EXTENT_ITEM2);
340 
341     sort_metadata_reloc_refs(mr);
342 
343     le = mr->refs.Flink;
344     while (le != &mr->refs) {
345         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
346         UINT16 extlen = 0;
347 
348         rc++;
349 
350         if (ref->type == TYPE_TREE_BLOCK_REF)
351             extlen += sizeof(TREE_BLOCK_REF);
352         else if (ref->type == TYPE_SHARED_BLOCK_REF)
353             extlen += sizeof(SHARED_BLOCK_REF);
354 
355         if (all_inline) {
356             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
357                 all_inline = FALSE;
358                 first_noninline = ref;
359             } else
360                 inline_len += extlen + 1;
361         }
362 
363         le = le->Flink;
364     }
365 
366     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
367     if (!ei) {
368         ERR("out of memory\n");
369         return STATUS_INSUFFICIENT_RESOURCES;
370     }
371 
372     ei->refcount = rc;
373     ei->generation = mr->ei->generation;
374     ei->flags = mr->ei->flags;
375     ptr = (UINT8*)&ei[1];
376 
377     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
378         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
379 
380         ei2->firstitem = *(KEY*)&mr->data[1];
381         ei2->level = mr->data->level;
382 
383         ptr += sizeof(EXTENT_ITEM2);
384     }
385 
386     le = mr->refs.Flink;
387     while (le != &mr->refs) {
388         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
389 
390         if (ref == first_noninline)
391             break;
392 
393         *ptr = ref->type;
394         ptr++;
395 
396         if (ref->type == TYPE_TREE_BLOCK_REF) {
397             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
398 
399             tbr->offset = ref->tbr.offset;
400 
401             ptr += sizeof(TREE_BLOCK_REF);
402         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
403             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
404 
405             sbr->offset = ref->parent->new_address;
406 
407             ptr += sizeof(SHARED_BLOCK_REF);
408         }
409 
410         le = le->Flink;
411     }
412 
413     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
414         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
415     else
416         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
417 
418     if (!NT_SUCCESS(Status)) {
419         ERR("insert_tree_item returned %08x\n", Status);
420         ExFreePool(ei);
421         return Status;
422     }
423 
424     if (!all_inline) {
425         le = &first_noninline->list_entry;
426 
427         while (le != &mr->refs) {
428             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
429 
430             if (ref->type == TYPE_TREE_BLOCK_REF) {
431                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
432                 if (!NT_SUCCESS(Status)) {
433                     ERR("insert_tree_item returned %08x\n", Status);
434                     return Status;
435                 }
436             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
437                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
438                 if (!NT_SUCCESS(Status)) {
439                     ERR("insert_tree_item returned %08x\n", Status);
440                     return Status;
441                 }
442             }
443 
444             le = le->Flink;
445         }
446     }
447 
448     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
449         if (mr->data->level > 0) {
450             UINT16 i;
451             internal_node* in = (internal_node*)&mr->data[1];
452 
453             for (i = 0; i < mr->data->num_items; i++) {
454                 UINT64 sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
455 
456                 if (sbrrc > 0) {
457                     SHARED_BLOCK_REF sbr;
458 
459                     sbr.offset = mr->new_address;
460 
461                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
462                     if (!NT_SUCCESS(Status)) {
463                         ERR("increase_extent_refcount returned %08x\n", Status);
464                         return Status;
465                     }
466 
467                     sbr.offset = mr->address;
468 
469                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
470                                                       sbr.offset, FALSE, NULL);
471                     if (!NT_SUCCESS(Status)) {
472                         ERR("decrease_extent_refcount returned %08x\n", Status);
473                         return Status;
474                     }
475                 }
476             }
477         } else {
478             UINT16 i;
479             leaf_node* ln = (leaf_node*)&mr->data[1];
480 
481             for (i = 0; i < mr->data->num_items; i++) {
482                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
483                     EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
484 
485                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
486                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
487 
488                         if (ed2->size > 0) { // not sparse
489                             UINT32 sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
490 
491                             if (sdrrc > 0) {
492                                 SHARED_DATA_REF sdr;
493                                 chunk* c;
494 
495                                 sdr.offset = mr->new_address;
496                                 sdr.count = sdrrc;
497 
498                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
499                                 if (!NT_SUCCESS(Status)) {
500                                     ERR("increase_extent_refcount returned %08x\n", Status);
501                                     return Status;
502                                 }
503 
504                                 sdr.offset = mr->address;
505 
506                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
507                                                                   sdr.offset, FALSE, NULL);
508                                 if (!NT_SUCCESS(Status)) {
509                                     ERR("decrease_extent_refcount returned %08x\n", Status);
510                                     return Status;
511                                 }
512 
513                                 c = get_chunk_from_address(Vcb, ed2->address);
514 
515                                 if (c) {
516                                     // check changed_extents
517 
518                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, TRUE);
519 
520                                     le = c->changed_extents.Flink;
521 
522                                     while (le != &c->changed_extents) {
523                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
524 
525                                         if (ce->address == ed2->address) {
526                                             LIST_ENTRY* le2;
527 
528                                             le2 = ce->refs.Flink;
529                                             while (le2 != &ce->refs) {
530                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
531 
532                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
533                                                     cer->sdr.offset = mr->new_address;
534                                                     break;
535                                                 }
536 
537                                                 le2 = le2->Flink;
538                                             }
539 
540                                             le2 = ce->old_refs.Flink;
541                                             while (le2 != &ce->old_refs) {
542                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
543 
544                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
545                                                     cer->sdr.offset = mr->new_address;
546                                                     break;
547                                                 }
548 
549                                                 le2 = le2->Flink;
550                                             }
551 
552                                             break;
553                                         }
554 
555                                         le = le->Flink;
556                                     }
557 
558                                     ExReleaseResourceLite(&c->changed_extents_lock);
559                                 }
560                             }
561                         }
562                     }
563                 }
564             }
565         }
566     }
567 
568     return STATUS_SUCCESS;
569 }
570 
571 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
572                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
573     LIST_ENTRY tree_writes, *le;
574     NTSTATUS Status;
575     traverse_ptr tp;
576     UINT8 level, max_level = 0;
577     chunk* newchunk = NULL;
578 
579     InitializeListHead(&tree_writes);
580 
581     le = items->Flink;
582     while (le != items) {
583         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
584         LIST_ENTRY* le2;
585         chunk* pc;
586 
587         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
588         if (!mr->data) {
589             ERR("out of memory\n");
590             return STATUS_INSUFFICIENT_RESOURCES;
591         }
592 
593         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, TRUE, (UINT8*)mr->data,
594                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, FALSE, NormalPagePriority);
595         if (!NT_SUCCESS(Status)) {
596             ERR("read_data returned %08x\n", Status);
597             return Status;
598         }
599 
600         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
601             mr->system = TRUE;
602 
603         if (data_items && mr->data->level == 0) {
604             le2 = data_items->Flink;
605             while (le2 != data_items) {
606                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
607                 leaf_node* ln = (leaf_node*)&mr->data[1];
608                 UINT16 i;
609 
610                 for (i = 0; i < mr->data->num_items; i++) {
611                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
612                         EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
613 
614                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
615                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
616 
617                             if (ed2->address == dr->address)
618                                 ed2->address = dr->new_address;
619                         }
620                     }
621                 }
622 
623                 le2 = le2->Flink;
624             }
625         }
626 
627         if (mr->data->level > max_level)
628             max_level = mr->data->level;
629 
630         le2 = mr->refs.Flink;
631         while (le2 != &mr->refs) {
632             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
633 
634             if (ref->type == TYPE_TREE_BLOCK_REF) {
635                 KEY* firstitem;
636                 root* r = NULL;
637                 LIST_ENTRY* le3;
638                 tree* t;
639 
640                 firstitem = (KEY*)&mr->data[1];
641 
642                 le3 = Vcb->roots.Flink;
643                 while (le3 != &Vcb->roots) {
644                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
645 
646                     if (r2->id == ref->tbr.offset) {
647                         r = r2;
648                         break;
649                     }
650 
651                     le3 = le3->Flink;
652                 }
653 
654                 if (!r) {
655                     ERR("could not find subvol with id %llx\n", ref->tbr.offset);
656                     return STATUS_INTERNAL_ERROR;
657                 }
658 
659                 Status = find_item_to_level(Vcb, r, &tp, firstitem, FALSE, mr->data->level + 1, NULL);
660                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
661                     ERR("find_item_to_level returned %08x\n", Status);
662                     return Status;
663                 }
664 
665                 t = tp.tree;
666                 while (t && t->header.level < mr->data->level + 1) {
667                     t = t->parent;
668                 }
669 
670                 if (!t)
671                     ref->top = TRUE;
672                 else {
673                     metadata_reloc* mr2;
674 
675                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
676                     if (!NT_SUCCESS(Status)) {
677                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
678                         return Status;
679                     }
680 
681                     ref->parent = mr2;
682                 }
683             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
684                 metadata_reloc* mr2;
685 
686                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
687                 if (!NT_SUCCESS(Status)) {
688                     ERR("add_metadata_reloc_parent returned %08x\n", Status);
689                     return Status;
690                 }
691 
692                 ref->parent = mr2;
693             }
694 
695             le2 = le2->Flink;
696         }
697 
698         le = le->Flink;
699     }
700 
701     le = items->Flink;
702     while (le != items) {
703         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
704         LIST_ENTRY* le2;
705         UINT32 hash;
706 
707         mr->t = NULL;
708 
709         hash = calc_crc32c(0xffffffff, (UINT8*)&mr->address, sizeof(UINT64));
710 
711         le2 = Vcb->trees_ptrs[hash >> 24];
712 
713         if (le2) {
714             while (le2 != &Vcb->trees_hash) {
715                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
716 
717                 if (t->header.address == mr->address) {
718                     mr->t = t;
719                     break;
720                 } else if (t->hash > hash)
721                     break;
722 
723                 le2 = le2->Flink;
724             }
725         }
726 
727         le = le->Flink;
728     }
729 
730     for (level = 0; level <= max_level; level++) {
731         le = items->Flink;
732         while (le != items) {
733             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
734 
735             if (mr->data->level == level) {
736                 BOOL done = FALSE;
737                 LIST_ENTRY* le2;
738                 tree_write* tw;
739                 UINT64 flags;
740                 tree* t3;
741 
742                 if (mr->system)
743                     flags = Vcb->system_flags;
744                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
745                     flags = Vcb->data_flags;
746                 else
747                     flags = Vcb->metadata_flags;
748 
749                 if (newchunk) {
750                     ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
751 
752                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
753                         newchunk->used += Vcb->superblock.node_size;
754                         space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
755                         done = TRUE;
756                     }
757 
758                     ExReleaseResourceLite(&newchunk->lock);
759                 }
760 
761                 if (!done) {
762                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
763 
764                     le2 = Vcb->chunks.Flink;
765                     while (le2 != &Vcb->chunks) {
766                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
767 
768                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
769                             ExAcquireResourceExclusiveLite(&c2->lock, TRUE);
770 
771                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
772                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
773                                     c2->used += Vcb->superblock.node_size;
774                                     space_list_subtract(c2, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
775                                     ExReleaseResourceLite(&c2->lock);
776                                     newchunk = c2;
777                                     done = TRUE;
778                                     break;
779                                 }
780                             }
781 
782                             ExReleaseResourceLite(&c2->lock);
783                         }
784 
785                         le2 = le2->Flink;
786                     }
787 
788                     // allocate new chunk if necessary
789                     if (!done) {
790                         Status = alloc_chunk(Vcb, flags, &newchunk, FALSE);
791 
792                         if (!NT_SUCCESS(Status)) {
793                             ERR("alloc_chunk returned %08x\n", Status);
794                             ExReleaseResourceLite(&Vcb->chunk_lock);
795                             goto end;
796                         }
797 
798                         ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
799 
800                         newchunk->balance_num = Vcb->balance.balance_num;
801 
802                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
803                             ExReleaseResourceLite(&newchunk->lock);
804                             ExReleaseResourceLite(&Vcb->chunk_lock);
805                             ERR("could not find address in new chunk\n");
806                             Status = STATUS_DISK_FULL;
807                             goto end;
808                         } else {
809                             newchunk->used += Vcb->superblock.node_size;
810                             space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
811                         }
812 
813                         ExReleaseResourceLite(&newchunk->lock);
814                     }
815 
816                     ExReleaseResourceLite(&Vcb->chunk_lock);
817                 }
818 
819                 // update parents
820                 le2 = mr->refs.Flink;
821                 while (le2 != &mr->refs) {
822                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
823 
824                     if (ref->parent) {
825                         UINT16 i;
826                         internal_node* in = (internal_node*)&ref->parent->data[1];
827 
828                         for (i = 0; i < ref->parent->data->num_items; i++) {
829                             if (in[i].address == mr->address) {
830                                 in[i].address = mr->new_address;
831                                 break;
832                             }
833                         }
834 
835                         if (ref->parent->t) {
836                             LIST_ENTRY* le3;
837 
838                             le3 = ref->parent->t->itemlist.Flink;
839                             while (le3 != &ref->parent->t->itemlist) {
840                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
841 
842                                 if (!td->inserted && td->treeholder.address == mr->address)
843                                     td->treeholder.address = mr->new_address;
844 
845                                 le3 = le3->Flink;
846                             }
847                         }
848                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
849                         LIST_ENTRY* le3;
850                         root* r = NULL;
851 
852                         // alter ROOT_ITEM
853 
854                         le3 = Vcb->roots.Flink;
855                         while (le3 != &Vcb->roots) {
856                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
857 
858                             if (r2->id == ref->tbr.offset) {
859                                 r = r2;
860                                 break;
861                             }
862 
863                             le3 = le3->Flink;
864                         }
865 
866                         if (r) {
867                             r->treeholder.address = mr->new_address;
868 
869                             if (r == Vcb->root_root)
870                                 Vcb->superblock.root_tree_addr = mr->new_address;
871                             else if (r == Vcb->chunk_root)
872                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
873                             else if (r->root_item.block_number == mr->address) {
874                                 KEY searchkey;
875                                 ROOT_ITEM* ri;
876 
877                                 r->root_item.block_number = mr->new_address;
878 
879                                 searchkey.obj_id = r->id;
880                                 searchkey.obj_type = TYPE_ROOT_ITEM;
881                                 searchkey.offset = 0xffffffffffffffff;
882 
883                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
884                                 if (!NT_SUCCESS(Status)) {
885                                     ERR("find_item returned %08x\n", Status);
886                                     goto end;
887                                 }
888 
889                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
890                                     ERR("could not find ROOT_ITEM for tree %llx\n", searchkey.obj_id);
891                                     Status = STATUS_INTERNAL_ERROR;
892                                     goto end;
893                                 }
894 
895                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
896                                 if (!ri) {
897                                     ERR("out of memory\n");
898                                     Status = STATUS_INSUFFICIENT_RESOURCES;
899                                     goto end;
900                                 }
901 
902                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
903 
904                                 Status = delete_tree_item(Vcb, &tp);
905                                 if (!NT_SUCCESS(Status)) {
906                                     ERR("delete_tree_item returned %08x\n", Status);
907                                     goto end;
908                                 }
909 
910                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
911                                 if (!NT_SUCCESS(Status)) {
912                                     ERR("insert_tree_item returned %08x\n", Status);
913                                     goto end;
914                                 }
915                             }
916                         }
917                     }
918 
919                     le2 = le2->Flink;
920                 }
921 
922                 mr->data->address = mr->new_address;
923 
924                 t3 = mr->t;
925 
926                 while (t3) {
927                     UINT8 h;
928                     BOOL inserted;
929                     tree* t4 = NULL;
930 
931                     // check if tree loaded more than once
932                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
933                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
934 
935                         if (nt->header.address == t3->header.address)
936                             t4 = nt;
937                     }
938 
939                     t3->header.address = mr->new_address;
940 
941                     h = t3->hash >> 24;
942 
943                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
944                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
945                             Vcb->trees_ptrs[h] = NULL;
946                         else {
947                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
948 
949                             if (t2->hash >> 24 == h)
950                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
951                             else
952                                 Vcb->trees_ptrs[h] = NULL;
953                         }
954                     }
955 
956                     RemoveEntryList(&t3->list_entry_hash);
957 
958                     t3->hash = calc_crc32c(0xffffffff, (UINT8*)&t3->header.address, sizeof(UINT64));
959                     h = t3->hash >> 24;
960 
961                     if (!Vcb->trees_ptrs[h]) {
962                         UINT8 h2 = h;
963 
964                         le2 = Vcb->trees_hash.Flink;
965 
966                         if (h2 > 0) {
967                             h2--;
968                             do {
969                                 if (Vcb->trees_ptrs[h2]) {
970                                     le2 = Vcb->trees_ptrs[h2];
971                                     break;
972                                 }
973 
974                                 h2--;
975                             } while (h2 > 0);
976                         }
977                     } else
978                         le2 = Vcb->trees_ptrs[h];
979 
980                     inserted = FALSE;
981                     while (le2 != &Vcb->trees_hash) {
982                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
983 
984                         if (t2->hash >= t3->hash) {
985                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
986                             inserted = TRUE;
987                             break;
988                         }
989 
990                         le2 = le2->Flink;
991                     }
992 
993                     if (!inserted)
994                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
995 
996                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
997                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
998 
999                     if (data_items && level == 0) {
1000                         le2 = data_items->Flink;
1001 
1002                         while (le2 != data_items) {
1003                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1004                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1005 
1006                             while (le3 != &t3->itemlist) {
1007                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1008 
1009                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1010                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1011 
1012                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1013                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1014 
1015                                         if (ed2->address == dr->address)
1016                                             ed2->address = dr->new_address;
1017                                     }
1018                                 }
1019 
1020                                 le3 = le3->Flink;
1021                             }
1022 
1023                             le2 = le2->Flink;
1024                         }
1025                     }
1026 
1027                     t3 = t4;
1028                 }
1029 
1030                 *((UINT32*)mr->data) = ~calc_crc32c(0xffffffff, (UINT8*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1031 
1032                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1033                 if (!tw) {
1034                     ERR("out of memory\n");
1035                     Status = STATUS_INSUFFICIENT_RESOURCES;
1036                     goto end;
1037                 }
1038 
1039                 tw->address = mr->new_address;
1040                 tw->length = Vcb->superblock.node_size;
1041                 tw->data = (UINT8*)mr->data;
1042 
1043                 if (IsListEmpty(&tree_writes))
1044                     InsertTailList(&tree_writes, &tw->list_entry);
1045                 else {
1046                     BOOL inserted = FALSE;
1047 
1048                     le2 = tree_writes.Flink;
1049                     while (le2 != &tree_writes) {
1050                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1051 
1052                         if (tw2->address > tw->address) {
1053                             InsertHeadList(le2->Blink, &tw->list_entry);
1054                             inserted = TRUE;
1055                             break;
1056                         }
1057 
1058                         le2 = le2->Flink;
1059                     }
1060 
1061                     if (!inserted)
1062                         InsertTailList(&tree_writes, &tw->list_entry);
1063                 }
1064             }
1065 
1066             le = le->Flink;
1067         }
1068     }
1069 
1070     Status = do_tree_writes(Vcb, &tree_writes, TRUE);
1071     if (!NT_SUCCESS(Status)) {
1072         ERR("do_tree_writes returned %08x\n", Status);
1073         goto end;
1074     }
1075 
1076     le = items->Flink;
1077     while (le != items) {
1078         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1079 
1080         Status = add_metadata_reloc_extent_item(Vcb, mr);
1081         if (!NT_SUCCESS(Status)) {
1082             ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1083             goto end;
1084         }
1085 
1086         le = le->Flink;
1087     }
1088 
1089     Status = STATUS_SUCCESS;
1090 
1091 end:
1092     while (!IsListEmpty(&tree_writes)) {
1093         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1094         ExFreePool(tw);
1095     }
1096 
1097     return Status;
1098 }
1099 
1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1101     KEY searchkey;
1102     traverse_ptr tp;
1103     NTSTATUS Status;
1104     BOOL b;
1105     LIST_ENTRY items, rollback;
1106     UINT32 loaded = 0;
1107 
1108     TRACE("chunk %llx\n", c->offset);
1109 
1110     InitializeListHead(&rollback);
1111     InitializeListHead(&items);
1112 
1113     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1114 
1115     searchkey.obj_id = c->offset;
1116     searchkey.obj_type = TYPE_METADATA_ITEM;
1117     searchkey.offset = 0xffffffffffffffff;
1118 
1119     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1120     if (!NT_SUCCESS(Status)) {
1121         ERR("find_item returned %08x\n", Status);
1122         goto end;
1123     }
1124 
1125     do {
1126         traverse_ptr next_tp;
1127 
1128         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129             break;
1130 
1131         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132             BOOL tree = FALSE, skinny = FALSE;
1133 
1134             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135                 tree = TRUE;
1136                 skinny = TRUE;
1137             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1139                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140 
1141                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1142                     tree = TRUE;
1143             }
1144 
1145             if (tree) {
1146                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147 
1148                 if (!NT_SUCCESS(Status)) {
1149                     ERR("add_metadata_reloc returned %08x\n", Status);
1150                     goto end;
1151                 }
1152 
1153                 loaded++;
1154 
1155                 if (loaded >= 64) // only do 64 at a time
1156                     break;
1157             }
1158         }
1159 
1160         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1161 
1162         if (b)
1163             tp = next_tp;
1164     } while (b);
1165 
1166     if (IsListEmpty(&items)) {
1167         *changed = FALSE;
1168         Status = STATUS_SUCCESS;
1169         goto end;
1170     } else
1171         *changed = TRUE;
1172 
1173     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1174     if (!NT_SUCCESS(Status)) {
1175         ERR("write_metadata_items returned %08x\n", Status);
1176         goto end;
1177     }
1178 
1179     Status = STATUS_SUCCESS;
1180 
1181     Vcb->need_write = TRUE;
1182 
1183 end:
1184     if (NT_SUCCESS(Status)) {
1185         Status = do_write(Vcb, NULL);
1186         if (!NT_SUCCESS(Status))
1187             ERR("do_write returned %08x\n", Status);
1188     }
1189 
1190     if (NT_SUCCESS(Status))
1191         clear_rollback(&rollback);
1192     else
1193         do_rollback(Vcb, &rollback);
1194 
1195     free_trees(Vcb);
1196 
1197     ExReleaseResourceLite(&Vcb->tree_lock);
1198 
1199     while (!IsListEmpty(&items)) {
1200         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1201 
1202         while (!IsListEmpty(&mr->refs)) {
1203             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1204 
1205             ExFreePool(ref);
1206         }
1207 
1208         ExFreePool(mr);
1209     }
1210 
1211     return Status;
1212 }
1213 
1214 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1215                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1216     NTSTATUS Status;
1217     LIST_ENTRY* le;
1218     KEY searchkey;
1219     traverse_ptr tp;
1220     root* r = NULL;
1221     metadata_reloc* mr;
1222     UINT64 last_tree = 0;
1223     data_reloc_ref* ref;
1224 
1225     le = Vcb->roots.Flink;
1226     while (le != &Vcb->roots) {
1227         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1228 
1229         if (r2->id == edr->root) {
1230             r = r2;
1231             break;
1232         }
1233 
1234         le = le->Flink;
1235     }
1236 
1237     if (!r) {
1238         ERR("could not find subvol %llx\n", edr->count);
1239         return STATUS_INTERNAL_ERROR;
1240     }
1241 
1242     searchkey.obj_id = edr->objid;
1243     searchkey.obj_type = TYPE_EXTENT_DATA;
1244     searchkey.offset = 0;
1245 
1246     Status = find_item(Vcb, r, &tp, &searchkey, FALSE, NULL);
1247     if (!NT_SUCCESS(Status)) {
1248         ERR("find_item returned %08x\n", Status);
1249         return Status;
1250     }
1251 
1252     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1253         traverse_ptr tp2;
1254 
1255         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1256             tp = tp2;
1257         else {
1258             ERR("could not find EXTENT_DATA for inode %llx in root %llx\n", searchkey.obj_id, r->id);
1259             return STATUS_INTERNAL_ERROR;
1260         }
1261     }
1262 
1263     ref = NULL;
1264 
1265     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1266         traverse_ptr tp2;
1267 
1268         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1269             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1270 
1271             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1272                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1273 
1274                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1275                     if (ref && last_tree == tp.tree->header.address)
1276                         ref->edr.count++;
1277                     else {
1278                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1279                         if (!ref) {
1280                             ERR("out of memory\n");
1281                             return STATUS_INSUFFICIENT_RESOURCES;
1282                         }
1283 
1284                         ref->type = TYPE_EXTENT_DATA_REF;
1285                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1286                         ref->edr.count = 1;
1287 
1288                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1289                         if (!NT_SUCCESS(Status)) {
1290                             ERR("add_metadata_reloc_parent returned %08x\n", Status);
1291                             ExFreePool(ref);
1292                             return Status;
1293                         }
1294 
1295                         last_tree = tp.tree->header.address;
1296                         ref->parent = mr;
1297 
1298                         InsertTailList(&dr->refs, &ref->list_entry);
1299                     }
1300                 }
1301             }
1302         }
1303 
1304         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1305             tp = tp2;
1306         else
1307             break;
1308     }
1309 
1310     return STATUS_SUCCESS;
1311 }
1312 
1313 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1314                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1315     NTSTATUS Status;
1316     data_reloc* dr;
1317     EXTENT_ITEM* ei;
1318     UINT16 len;
1319     UINT64 inline_rc;
1320     UINT8* ptr;
1321 
1322     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1323     if (!dr) {
1324         ERR("out of memory\n");
1325         return STATUS_INSUFFICIENT_RESOURCES;
1326     }
1327 
1328     dr->address = tp->item->key.obj_id;
1329     dr->size = tp->item->key.offset;
1330     dr->ei = (EXTENT_ITEM*)tp->item->data;
1331     InitializeListHead(&dr->refs);
1332 
1333     Status = delete_tree_item(Vcb, tp);
1334     if (!NT_SUCCESS(Status)) {
1335         ERR("delete_tree_item returned %08x\n", Status);
1336         return Status;
1337     }
1338 
1339     if (!c)
1340         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1341 
1342     if (c) {
1343         ExAcquireResourceExclusiveLite(&c->lock, TRUE);
1344 
1345         c->used -= tp->item->key.offset;
1346 
1347         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1348 
1349         ExReleaseResourceLite(&c->lock);
1350     }
1351 
1352     ei = (EXTENT_ITEM*)tp->item->data;
1353     inline_rc = 0;
1354 
1355     len = tp->item->size - sizeof(EXTENT_ITEM);
1356     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
1357 
1358     while (len > 0) {
1359         UINT8 secttype = *ptr;
1360         UINT16 sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1361 
1362         len--;
1363 
1364         if (sectlen > len) {
1365             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1366             return STATUS_INTERNAL_ERROR;
1367         }
1368 
1369         if (sectlen == 0) {
1370             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1371             return STATUS_INTERNAL_ERROR;
1372         }
1373 
1374         if (secttype == TYPE_EXTENT_DATA_REF) {
1375             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(UINT8));
1376 
1377             inline_rc += edr->count;
1378 
1379             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1380             if (!NT_SUCCESS(Status)) {
1381                 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1382                 return Status;
1383             }
1384         } else if (secttype == TYPE_SHARED_DATA_REF) {
1385             metadata_reloc* mr;
1386             data_reloc_ref* ref;
1387 
1388             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1389             if (!ref) {
1390                 ERR("out of memory\n");
1391                 return STATUS_INSUFFICIENT_RESOURCES;
1392             }
1393 
1394             ref->type = TYPE_SHARED_DATA_REF;
1395             RtlCopyMemory(&ref->sdr, ptr + sizeof(UINT8), sizeof(SHARED_DATA_REF));
1396             inline_rc += ref->sdr.count;
1397 
1398             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1399             if (!NT_SUCCESS(Status)) {
1400                 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1401                 ExFreePool(ref);
1402                 return Status;
1403             }
1404 
1405             ref->parent = mr;
1406 
1407             InsertTailList(&dr->refs, &ref->list_entry);
1408         } else {
1409             ERR("unexpected tree type %x\n", secttype);
1410             return STATUS_INTERNAL_ERROR;
1411         }
1412 
1413 
1414         len -= sectlen;
1415         ptr += sizeof(UINT8) + sectlen;
1416     }
1417 
1418     if (inline_rc < ei->refcount) { // look for non-inline entries
1419         traverse_ptr tp2 = *tp, next_tp;
1420 
1421         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
1422             tp2 = next_tp;
1423 
1424             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1425                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1426                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1427                     if (!NT_SUCCESS(Status)) {
1428                         ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1429                         return Status;
1430                     }
1431 
1432                     Status = delete_tree_item(Vcb, &tp2);
1433                     if (!NT_SUCCESS(Status)) {
1434                         ERR("delete_tree_item returned %08x\n", Status);
1435                         return Status;
1436                     }
1437                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(UINT32)) {
1438                     metadata_reloc* mr;
1439                     data_reloc_ref* ref;
1440 
1441                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1442                     if (!ref) {
1443                         ERR("out of memory\n");
1444                         return STATUS_INSUFFICIENT_RESOURCES;
1445                     }
1446 
1447                     ref->type = TYPE_SHARED_DATA_REF;
1448                     ref->sdr.offset = tp2.item->key.offset;
1449                     ref->sdr.count = *((UINT32*)tp2.item->data);
1450 
1451                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1452                     if (!NT_SUCCESS(Status)) {
1453                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
1454                         ExFreePool(ref);
1455                         return Status;
1456                     }
1457 
1458                     ref->parent = mr;
1459                     InsertTailList(&dr->refs, &ref->list_entry);
1460 
1461                     Status = delete_tree_item(Vcb, &tp2);
1462                     if (!NT_SUCCESS(Status)) {
1463                         ERR("delete_tree_item returned %08x\n", Status);
1464                         return Status;
1465                     }
1466                 }
1467             } else
1468                 break;
1469         }
1470     }
1471 
1472     InsertTailList(items, &dr->list_entry);
1473 
1474     return STATUS_SUCCESS;
1475 }
1476 
1477 static void sort_data_reloc_refs(data_reloc* dr) {
1478     LIST_ENTRY newlist, *le;
1479 
1480     if (IsListEmpty(&dr->refs))
1481         return;
1482 
1483     // insertion sort
1484 
1485     InitializeListHead(&newlist);
1486 
1487     while (!IsListEmpty(&dr->refs)) {
1488         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1489         BOOL inserted = FALSE;
1490 
1491         if (ref->type == TYPE_EXTENT_DATA_REF)
1492             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1493         else if (ref->type == TYPE_SHARED_DATA_REF)
1494             ref->hash = ref->parent->new_address;
1495 
1496         le = newlist.Flink;
1497         while (le != &newlist) {
1498             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1499 
1500             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1501                 InsertHeadList(le->Blink, &ref->list_entry);
1502                 inserted = TRUE;
1503                 break;
1504             }
1505 
1506             le = le->Flink;
1507         }
1508 
1509         if (!inserted)
1510             InsertTailList(&newlist, &ref->list_entry);
1511     }
1512 
1513     le = newlist.Flink;
1514     while (le != &newlist) {
1515         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1516 
1517         if (le->Flink != &newlist) {
1518             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1519 
1520             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1521                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1522                 RemoveEntryList(&ref2->list_entry);
1523                 ref->edr.count += ref2->edr.count;
1524                 ExFreePool(ref2);
1525                 continue;
1526             }
1527         }
1528 
1529         le = le->Flink;
1530     }
1531 
1532     newlist.Flink->Blink = &dr->refs;
1533     newlist.Blink->Flink = &dr->refs;
1534     dr->refs.Flink = newlist.Flink;
1535     dr->refs.Blink = newlist.Blink;
1536 }
1537 
1538 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1539     NTSTATUS Status;
1540     LIST_ENTRY* le;
1541     UINT64 rc = 0;
1542     UINT16 inline_len;
1543     BOOL all_inline = TRUE;
1544     data_reloc_ref* first_noninline = NULL;
1545     EXTENT_ITEM* ei;
1546     UINT8* ptr;
1547 
1548     inline_len = sizeof(EXTENT_ITEM);
1549 
1550     sort_data_reloc_refs(dr);
1551 
1552     le = dr->refs.Flink;
1553     while (le != &dr->refs) {
1554         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1555         UINT16 extlen = 0;
1556 
1557         if (ref->type == TYPE_EXTENT_DATA_REF) {
1558             extlen += sizeof(EXTENT_DATA_REF);
1559             rc += ref->edr.count;
1560         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1561             extlen += sizeof(SHARED_DATA_REF);
1562             rc++;
1563         }
1564 
1565         if (all_inline) {
1566             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1567                 all_inline = FALSE;
1568                 first_noninline = ref;
1569             } else
1570                 inline_len += extlen + 1;
1571         }
1572 
1573         le = le->Flink;
1574     }
1575 
1576     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1577     if (!ei) {
1578         ERR("out of memory\n");
1579         return STATUS_INSUFFICIENT_RESOURCES;
1580     }
1581 
1582     ei->refcount = rc;
1583     ei->generation = dr->ei->generation;
1584     ei->flags = dr->ei->flags;
1585     ptr = (UINT8*)&ei[1];
1586 
1587     le = dr->refs.Flink;
1588     while (le != &dr->refs) {
1589         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1590 
1591         if (ref == first_noninline)
1592             break;
1593 
1594         *ptr = ref->type;
1595         ptr++;
1596 
1597         if (ref->type == TYPE_EXTENT_DATA_REF) {
1598             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1599 
1600             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1601 
1602             ptr += sizeof(EXTENT_DATA_REF);
1603         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1604             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1605 
1606             sdr->offset = ref->parent->new_address;
1607             sdr->count = ref->sdr.count;
1608 
1609             ptr += sizeof(SHARED_DATA_REF);
1610         }
1611 
1612         le = le->Flink;
1613     }
1614 
1615     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1616     if (!NT_SUCCESS(Status)) {
1617         ERR("insert_tree_item returned %08x\n", Status);
1618         return Status;
1619     }
1620 
1621     if (!all_inline) {
1622         le = &first_noninline->list_entry;
1623 
1624         while (le != &dr->refs) {
1625             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1626 
1627             if (ref->type == TYPE_EXTENT_DATA_REF) {
1628                 EXTENT_DATA_REF* edr;
1629 
1630                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1631                 if (!edr) {
1632                     ERR("out of memory\n");
1633                     return STATUS_INSUFFICIENT_RESOURCES;
1634                 }
1635 
1636                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1637 
1638                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1639                 if (!NT_SUCCESS(Status)) {
1640                     ERR("insert_tree_item returned %08x\n", Status);
1641                     return Status;
1642                 }
1643             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1644                 UINT32* sdr;
1645 
1646                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(UINT32), ALLOC_TAG);
1647                 if (!sdr) {
1648                     ERR("out of memory\n");
1649                     return STATUS_INSUFFICIENT_RESOURCES;
1650                 }
1651 
1652                 *sdr = ref->sdr.count;
1653 
1654                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(UINT32), NULL, NULL);
1655                 if (!NT_SUCCESS(Status)) {
1656                     ERR("insert_tree_item returned %08x\n", Status);
1657                     return Status;
1658                 }
1659             }
1660 
1661             le = le->Flink;
1662         }
1663     }
1664 
1665     return STATUS_SUCCESS;
1666 }
1667 
1668 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1669     KEY searchkey;
1670     traverse_ptr tp;
1671     NTSTATUS Status;
1672     BOOL b;
1673     LIST_ENTRY items, metadata_items, rollback, *le;
1674     UINT64 loaded = 0, num_loaded = 0;
1675     chunk* newchunk = NULL;
1676     UINT8* data = NULL;
1677 
1678     TRACE("chunk %llx\n", c->offset);
1679 
1680     InitializeListHead(&rollback);
1681     InitializeListHead(&items);
1682     InitializeListHead(&metadata_items);
1683 
1684     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1685 
1686     searchkey.obj_id = c->offset;
1687     searchkey.obj_type = TYPE_EXTENT_ITEM;
1688     searchkey.offset = 0xffffffffffffffff;
1689 
1690     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1691     if (!NT_SUCCESS(Status)) {
1692         ERR("find_item returned %08x\n", Status);
1693         goto end;
1694     }
1695 
1696     do {
1697         traverse_ptr next_tp;
1698 
1699         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1700             break;
1701 
1702         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1703             BOOL tree = FALSE;
1704 
1705             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1706                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1707 
1708                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1709                     tree = TRUE;
1710             }
1711 
1712             if (!tree) {
1713                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1714 
1715                 if (!NT_SUCCESS(Status)) {
1716                     ERR("add_data_reloc returned %08x\n", Status);
1717                     goto end;
1718                 }
1719 
1720                 loaded += tp.item->key.offset;
1721                 num_loaded++;
1722 
1723                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1724                     break;
1725             }
1726         }
1727 
1728         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1729 
1730         if (b)
1731             tp = next_tp;
1732     } while (b);
1733 
1734     if (IsListEmpty(&items)) {
1735         *changed = FALSE;
1736         Status = STATUS_SUCCESS;
1737         goto end;
1738     } else
1739         *changed = TRUE;
1740 
1741     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1742     if (!data) {
1743         ERR("out of memory\n");
1744         Status = STATUS_INSUFFICIENT_RESOURCES;
1745         goto end;
1746     }
1747 
1748     le = items.Flink;
1749     while (le != &items) {
1750         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1751         BOOL done = FALSE;
1752         LIST_ENTRY* le2;
1753         UINT32* csum;
1754         RTL_BITMAP bmp;
1755         ULONG* bmparr;
1756         ULONG runlength, index, lastoff;
1757 
1758         if (newchunk) {
1759             ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
1760 
1761             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1762                 newchunk->used += dr->size;
1763                 space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1764                 done = TRUE;
1765             }
1766 
1767             ExReleaseResourceLite(&newchunk->lock);
1768         }
1769 
1770         if (!done) {
1771             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
1772 
1773             le2 = Vcb->chunks.Flink;
1774             while (le2 != &Vcb->chunks) {
1775                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1776 
1777                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1778                     ExAcquireResourceExclusiveLite(&c2->lock, TRUE);
1779 
1780                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1781                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1782                             c2->used += dr->size;
1783                             space_list_subtract(c2, FALSE, dr->new_address, dr->size, &rollback);
1784                             ExReleaseResourceLite(&c2->lock);
1785                             newchunk = c2;
1786                             done = TRUE;
1787                             break;
1788                         }
1789                     }
1790 
1791                     ExReleaseResourceLite(&c2->lock);
1792                 }
1793 
1794                 le2 = le2->Flink;
1795             }
1796 
1797             // allocate new chunk if necessary
1798             if (!done) {
1799                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, FALSE);
1800 
1801                 if (!NT_SUCCESS(Status)) {
1802                     ERR("alloc_chunk returned %08x\n", Status);
1803                     ExReleaseResourceLite(&Vcb->chunk_lock);
1804                     goto end;
1805                 }
1806 
1807                 ExAcquireResourceExclusiveLite(&newchunk->lock, TRUE);
1808 
1809                 newchunk->balance_num = Vcb->balance.balance_num;
1810 
1811                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1812                     ExReleaseResourceLite(&newchunk->lock);
1813                     ExReleaseResourceLite(&Vcb->chunk_lock);
1814                     ERR("could not find address in new chunk\n");
1815                     Status = STATUS_DISK_FULL;
1816                     goto end;
1817                 } else {
1818                     newchunk->used += dr->size;
1819                     space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1820                 }
1821 
1822                 ExReleaseResourceLite(&newchunk->lock);
1823             }
1824 
1825             ExReleaseResourceLite(&Vcb->chunk_lock);
1826         }
1827 
1828         dr->newchunk = newchunk;
1829 
1830         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align((dr->size / Vcb->superblock.sector_size) + 1, sizeof(ULONG)), ALLOC_TAG);
1831         if (!bmparr) {
1832             ERR("out of memory\n");
1833             Status = STATUS_INSUFFICIENT_RESOURCES;
1834             goto end;
1835         }
1836 
1837         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(UINT32) / Vcb->superblock.sector_size), ALLOC_TAG);
1838         if (!csum) {
1839             ERR("out of memory\n");
1840             ExFreePool(bmparr);
1841             Status = STATUS_INSUFFICIENT_RESOURCES;
1842             goto end;
1843         }
1844 
1845         RtlInitializeBitMap(&bmp, bmparr, (ULONG)(dr->size / Vcb->superblock.sector_size));
1846         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1847 
1848         searchkey.obj_id = EXTENT_CSUM_ID;
1849         searchkey.obj_type = TYPE_EXTENT_CSUM;
1850         searchkey.offset = dr->address;
1851 
1852         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, FALSE, NULL);
1853         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1854             ERR("find_item returned %08x\n", Status);
1855             ExFreePool(csum);
1856             ExFreePool(bmparr);
1857             goto end;
1858         }
1859 
1860         if (Status != STATUS_NOT_FOUND) {
1861             do {
1862                 traverse_ptr next_tp;
1863 
1864                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1865                     if (tp.item->key.offset >= dr->address + dr->size)
1866                         break;
1867                     else if (tp.item->size >= sizeof(UINT32) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)) >= dr->address) {
1868                         UINT64 cs = max(dr->address, tp.item->key.offset);
1869                         UINT64 ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)));
1870 
1871                         RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1872                                       tp.item->data + ((cs - tp.item->key.offset) * sizeof(UINT32) / Vcb->superblock.sector_size),
1873                                       (ULONG)((ce - cs) * sizeof(UINT32) / Vcb->superblock.sector_size));
1874 
1875                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1876 
1877                         if (ce == dr->address + dr->size)
1878                             break;
1879                     }
1880                 }
1881 
1882                 if (find_next_item(Vcb, &tp, &next_tp, FALSE, NULL))
1883                     tp = next_tp;
1884                 else
1885                     break;
1886             } while (TRUE);
1887         }
1888 
1889         lastoff = 0;
1890         runlength = RtlFindFirstRunClear(&bmp, &index);
1891 
1892         while (runlength != 0) {
1893             if (index > lastoff) {
1894                 ULONG off = lastoff;
1895                 ULONG size = index - lastoff;
1896 
1897                 // handle no csum run
1898                 do {
1899                     ULONG rl;
1900 
1901                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1902                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1903                     else
1904                         rl = size;
1905 
1906                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1907                                        c, NULL, NULL, 0, FALSE, NormalPagePriority);
1908                     if (!NT_SUCCESS(Status)) {
1909                         ERR("read_data returned %08x\n", Status);
1910                         ExFreePool(csum);
1911                         ExFreePool(bmparr);
1912                         goto end;
1913                     }
1914 
1915                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1916                                                  NULL, newchunk, FALSE, 0, NormalPagePriority);
1917                     if (!NT_SUCCESS(Status)) {
1918                         ERR("write_data_complete returned %08x\n", Status);
1919                         ExFreePool(csum);
1920                         ExFreePool(bmparr);
1921                         goto end;
1922                     }
1923 
1924                     size -= rl;
1925                     off += rl;
1926                 } while (size > 0);
1927             }
1928 
1929             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1930             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1931 
1932             // handle csum run
1933             do {
1934                 ULONG rl;
1935 
1936                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1937                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1938                 else
1939                     rl = runlength;
1940 
1941                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], FALSE, data,
1942                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1943                 if (!NT_SUCCESS(Status)) {
1944                     ERR("read_data returned %08x\n", Status);
1945                     ExFreePool(csum);
1946                     ExFreePool(bmparr);
1947                     goto end;
1948                 }
1949 
1950                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1951                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1952                 if (!NT_SUCCESS(Status)) {
1953                     ERR("write_data_complete returned %08x\n", Status);
1954                     ExFreePool(csum);
1955                     ExFreePool(bmparr);
1956                     goto end;
1957                 }
1958 
1959                 runlength -= rl;
1960                 index += rl;
1961             } while (runlength > 0);
1962 
1963             lastoff = index;
1964             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1965         }
1966 
1967         ExFreePool(csum);
1968         ExFreePool(bmparr);
1969 
1970         // handle final nocsum run
1971         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1972             ULONG off = lastoff;
1973             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1974 
1975             do {
1976                 ULONG rl;
1977 
1978                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1979                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1980                 else
1981                     rl = size;
1982 
1983                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1984                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1985                 if (!NT_SUCCESS(Status)) {
1986                     ERR("read_data returned %08x\n", Status);
1987                     ExFreePool(csum);
1988                     ExFreePool(bmparr);
1989                     goto end;
1990                 }
1991 
1992                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1993                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1994                 if (!NT_SUCCESS(Status)) {
1995                     ERR("write_data_complete returned %08x\n", Status);
1996                     ExFreePool(csum);
1997                     ExFreePool(bmparr);
1998                     goto end;
1999                 }
2000 
2001                 size -= rl;
2002                 off += rl;
2003             } while (size > 0);
2004         }
2005 
2006         le = le->Flink;
2007     }
2008 
2009     ExFreePool(data);
2010     data = NULL;
2011 
2012     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2013     if (!NT_SUCCESS(Status)) {
2014         ERR("write_metadata_items returned %08x\n", Status);
2015         goto end;
2016     }
2017 
2018     le = items.Flink;
2019     while (le != &items) {
2020         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2021 
2022         Status = add_data_reloc_extent_item(Vcb, dr);
2023         if (!NT_SUCCESS(Status)) {
2024             ERR("add_data_reloc_extent_item returned %08x\n", Status);
2025             goto end;
2026         }
2027 
2028         le = le->Flink;
2029     }
2030 
2031     le = c->changed_extents.Flink;
2032     while (le != &c->changed_extents) {
2033         LIST_ENTRY *le2, *le3;
2034         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2035 
2036         le3 = le->Flink;
2037 
2038         le2 = items.Flink;
2039         while (le2 != &items) {
2040             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2041 
2042             if (ce->address == dr->address) {
2043                 ce->address = dr->new_address;
2044                 RemoveEntryList(&ce->list_entry);
2045                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2046                 break;
2047             }
2048 
2049             le2 = le2->Flink;
2050         }
2051 
2052         le = le3;
2053     }
2054 
2055     Status = STATUS_SUCCESS;
2056 
2057     Vcb->need_write = TRUE;
2058 
2059 end:
2060     if (NT_SUCCESS(Status)) {
2061         // update extents in cache inodes before we flush
2062         le = Vcb->chunks.Flink;
2063         while (le != &Vcb->chunks) {
2064             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2065 
2066             if (c2->cache) {
2067                 LIST_ENTRY* le2;
2068 
2069                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, TRUE);
2070 
2071                 le2 = c2->cache->extents.Flink;
2072                 while (le2 != &c2->cache->extents) {
2073                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2074 
2075                     if (!ext->ignore) {
2076                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2077                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2078 
2079                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2080                                 LIST_ENTRY* le3 = items.Flink;
2081                                 while (le3 != &items) {
2082                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2083 
2084                                     if (ed2->address == dr->address) {
2085                                         ed2->address = dr->new_address;
2086                                         break;
2087                                     }
2088 
2089                                     le3 = le3->Flink;
2090                                 }
2091                             }
2092                         }
2093                     }
2094 
2095                     le2 = le2->Flink;
2096                 }
2097 
2098                 ExReleaseResourceLite(c2->cache->Header.Resource);
2099             }
2100 
2101             le = le->Flink;
2102         }
2103 
2104         Status = do_write(Vcb, NULL);
2105         if (!NT_SUCCESS(Status))
2106             ERR("do_write returned %08x\n", Status);
2107     }
2108 
2109     if (NT_SUCCESS(Status)) {
2110         clear_rollback(&rollback);
2111 
2112         // update open FCBs
2113         // FIXME - speed this up(?)
2114 
2115         acquire_fcb_lock_shared(Vcb);
2116 
2117         le = Vcb->all_fcbs.Flink;
2118         while (le != &Vcb->all_fcbs) {
2119             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2120             LIST_ENTRY* le2;
2121 
2122             ExAcquireResourceExclusiveLite(fcb->Header.Resource, TRUE);
2123 
2124             le2 = fcb->extents.Flink;
2125             while (le2 != &fcb->extents) {
2126                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2127 
2128                 if (!ext->ignore) {
2129                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2130                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2131 
2132                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2133                             LIST_ENTRY* le3 = items.Flink;
2134                             while (le3 != &items) {
2135                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2136 
2137                                 if (ed2->address == dr->address) {
2138                                     ed2->address = dr->new_address;
2139                                     break;
2140                                 }
2141 
2142                                 le3 = le3->Flink;
2143                             }
2144                         }
2145                     }
2146                 }
2147 
2148                 le2 = le2->Flink;
2149             }
2150 
2151             ExReleaseResourceLite(fcb->Header.Resource);
2152 
2153             le = le->Flink;
2154         }
2155 
2156         release_fcb_lock(Vcb);
2157     } else
2158         do_rollback(Vcb, &rollback);
2159 
2160     free_trees(Vcb);
2161 
2162     ExReleaseResourceLite(&Vcb->tree_lock);
2163 
2164     if (data)
2165         ExFreePool(data);
2166 
2167     while (!IsListEmpty(&items)) {
2168         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2169 
2170         while (!IsListEmpty(&dr->refs)) {
2171             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2172 
2173             ExFreePool(ref);
2174         }
2175 
2176         ExFreePool(dr);
2177     }
2178 
2179     while (!IsListEmpty(&metadata_items)) {
2180         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2181 
2182         while (!IsListEmpty(&mr->refs)) {
2183             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2184 
2185             ExFreePool(ref);
2186         }
2187 
2188         ExFreePool(mr);
2189     }
2190 
2191     return Status;
2192 }
2193 
2194 static __inline UINT64 get_chunk_dup_type(chunk* c) {
2195     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2196         return BLOCK_FLAG_RAID0;
2197     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2198         return BLOCK_FLAG_RAID1;
2199     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2200         return BLOCK_FLAG_DUPLICATE;
2201     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2202         return BLOCK_FLAG_RAID10;
2203     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2204         return BLOCK_FLAG_RAID5;
2205     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2206         return BLOCK_FLAG_RAID6;
2207     else
2208         return BLOCK_FLAG_SINGLE;
2209 }
2210 
2211 static BOOL should_balance_chunk(device_extension* Vcb, UINT8 sort, chunk* c) {
2212     btrfs_balance_opts* opts;
2213 
2214     opts = &Vcb->balance.opts[sort];
2215 
2216     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2217         return FALSE;
2218 
2219     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2220         UINT64 type = get_chunk_dup_type(c);
2221 
2222         if (!(type & opts->profiles))
2223             return FALSE;
2224     }
2225 
2226     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2227         UINT16 i;
2228         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2229         BOOL b = FALSE;
2230 
2231         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2232             if (cis[i].dev_id == opts->devid) {
2233                 b = TRUE;
2234                 break;
2235             }
2236         }
2237 
2238         if (!b)
2239             return FALSE;
2240     }
2241 
2242     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2243         UINT16 i, factor;
2244         UINT64 physsize;
2245         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2246         BOOL b = FALSE;
2247 
2248         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2249             factor = c->chunk_item->num_stripes;
2250         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2251             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2252         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2253             factor = c->chunk_item->num_stripes - 1;
2254         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2255             factor = c->chunk_item->num_stripes - 2;
2256         else // SINGLE, DUPLICATE, RAID1
2257             factor = 1;
2258 
2259         physsize = c->chunk_item->size / factor;
2260 
2261         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2262             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2263                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2264                 b = TRUE;
2265                 break;
2266             }
2267         }
2268 
2269         if (!b)
2270             return FALSE;
2271     }
2272 
2273     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2274         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2275             return FALSE;
2276     }
2277 
2278     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2279         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2280             return FALSE;
2281     }
2282 
2283     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2284         UINT64 usage = c->used * 100 / c->chunk_item->size;
2285 
2286         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2287         if (c->used > 0 && usage == 0)
2288             usage = 1;
2289 
2290         if (usage < opts->usage_start || usage > opts->usage_end)
2291             return FALSE;
2292     }
2293 
2294     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2295         UINT64 type = get_chunk_dup_type(c);
2296 
2297         if (type == opts->convert)
2298             return FALSE;
2299     }
2300 
2301     return TRUE;
2302 }
2303 
2304 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2305     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2306         args->profiles = opts->profiles;
2307         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2308     }
2309 
2310     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2311         if (args->usage_start == 0) {
2312             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2313             args->usage_start = opts->usage_start;
2314             args->usage_end = opts->usage_end;
2315         } else {
2316             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2317             args->usage = opts->usage_end;
2318         }
2319     }
2320 
2321     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2322         args->devid = opts->devid;
2323         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2324     }
2325 
2326     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2327         args->drange_start = opts->drange_start;
2328         args->drange_end = opts->drange_end;
2329         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2330     }
2331 
2332     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2333         args->vrange_start = opts->vrange_start;
2334         args->vrange_end = opts->vrange_end;
2335         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2336     }
2337 
2338     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2339         args->convert = opts->convert;
2340         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2341 
2342         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2343             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2344     }
2345 
2346     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2347         if (args->limit_start == 0) {
2348             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2349             args->limit_start = (UINT32)opts->limit_start;
2350             args->limit_end = (UINT32)opts->limit_end;
2351         } else {
2352             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2353             args->limit = opts->limit_end;
2354         }
2355     }
2356 
2357     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2358         args->stripes_start = opts->stripes_start;
2359         args->stripes_end = opts->stripes_end;
2360         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2361     }
2362 }
2363 
2364 static NTSTATUS add_balance_item(device_extension* Vcb) {
2365     KEY searchkey;
2366     traverse_ptr tp;
2367     NTSTATUS Status;
2368     BALANCE_ITEM* bi;
2369 
2370     searchkey.obj_id = BALANCE_ITEM_ID;
2371     searchkey.obj_type = TYPE_TEMP_ITEM;
2372     searchkey.offset = 0;
2373 
2374     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2375 
2376     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2377     if (!NT_SUCCESS(Status)) {
2378         ERR("find_item returned %08x\n", Status);
2379         goto end;
2380     }
2381 
2382     if (!keycmp(tp.item->key, searchkey)) {
2383         Status = delete_tree_item(Vcb, &tp);
2384         if (!NT_SUCCESS(Status)) {
2385             ERR("delete_tree_item returned %08x\n", Status);
2386             goto end;
2387         }
2388     }
2389 
2390     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2391     if (!bi) {
2392         ERR("out of memory\n");
2393         Status = STATUS_INSUFFICIENT_RESOURCES;
2394         goto end;
2395     }
2396 
2397     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2398 
2399     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2400         bi->flags |= BALANCE_FLAGS_DATA;
2401         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2402     }
2403 
2404     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2405         bi->flags |= BALANCE_FLAGS_METADATA;
2406         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2407     }
2408 
2409     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2410         bi->flags |= BALANCE_FLAGS_SYSTEM;
2411         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2412     }
2413 
2414     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2415     if (!NT_SUCCESS(Status)) {
2416         ERR("insert_tree_item returned %08x\n", Status);
2417         ExFreePool(bi);
2418         goto end;
2419     }
2420 
2421     Status = STATUS_SUCCESS;
2422 
2423 end:
2424     if (NT_SUCCESS(Status)) {
2425         Status = do_write(Vcb, NULL);
2426         if (!NT_SUCCESS(Status))
2427             ERR("do_write returned %08x\n", Status);
2428     }
2429 
2430     free_trees(Vcb);
2431 
2432     ExReleaseResourceLite(&Vcb->tree_lock);
2433 
2434     return Status;
2435 }
2436 
2437 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2438     KEY searchkey;
2439     traverse_ptr tp;
2440     NTSTATUS Status;
2441 
2442     searchkey.obj_id = BALANCE_ITEM_ID;
2443     searchkey.obj_type = TYPE_TEMP_ITEM;
2444     searchkey.offset = 0;
2445 
2446     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2447 
2448     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2449     if (!NT_SUCCESS(Status)) {
2450         ERR("find_item returned %08x\n", Status);
2451         goto end;
2452     }
2453 
2454     if (!keycmp(tp.item->key, searchkey)) {
2455         Status = delete_tree_item(Vcb, &tp);
2456         if (!NT_SUCCESS(Status)) {
2457             ERR("delete_tree_item returned %08x\n", Status);
2458             goto end;
2459         }
2460 
2461         Status = do_write(Vcb, NULL);
2462         if (!NT_SUCCESS(Status)) {
2463             ERR("do_write returned %08x\n", Status);
2464             goto end;
2465         }
2466 
2467         free_trees(Vcb);
2468     }
2469 
2470     Status = STATUS_SUCCESS;
2471 
2472 end:
2473     ExReleaseResourceLite(&Vcb->tree_lock);
2474 
2475     return Status;
2476 }
2477 
2478 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2479     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2480 
2481     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2482         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2483         opts->profiles = args->profiles;
2484     }
2485 
2486     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2487         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2488 
2489         opts->usage_start = 0;
2490         opts->usage_end = (UINT8)args->usage;
2491     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2492         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2493 
2494         opts->usage_start = (UINT8)args->usage_start;
2495         opts->usage_end = (UINT8)args->usage_end;
2496     }
2497 
2498     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2499         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2500         opts->devid = args->devid;
2501     }
2502 
2503     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2504         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2505         opts->drange_start = args->drange_start;
2506         opts->drange_end = args->drange_end;
2507     }
2508 
2509     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2510         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2511         opts->vrange_start = args->vrange_start;
2512         opts->vrange_end = args->vrange_end;
2513     }
2514 
2515     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2516         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2517 
2518         opts->limit_start = 0;
2519         opts->limit_end = args->limit;
2520     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2521         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2522 
2523         opts->limit_start = args->limit_start;
2524         opts->limit_end = args->limit_end;
2525     }
2526 
2527     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2528         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2529 
2530         opts->stripes_start = (UINT16)args->stripes_start;
2531         opts->stripes_end = (UINT16)args->stripes_end;
2532     }
2533 
2534     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2535         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2536         opts->convert = args->convert;
2537 
2538         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2539             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2540     }
2541 }
2542 
2543 static NTSTATUS remove_superblocks(device* dev) {
2544     NTSTATUS Status;
2545     superblock* sb;
2546     int i = 0;
2547 
2548     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2549     if (!sb) {
2550         ERR("out of memory\n");
2551         return STATUS_INSUFFICIENT_RESOURCES;
2552     }
2553 
2554     RtlZeroMemory(sb, sizeof(superblock));
2555 
2556     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2557         Status = write_data_phys(dev->devobj, superblock_addrs[i], sb, sizeof(superblock));
2558 
2559         if (!NT_SUCCESS(Status)) {
2560             ExFreePool(sb);
2561             return Status;
2562         }
2563 
2564         i++;
2565     }
2566 
2567     ExFreePool(sb);
2568 
2569     return STATUS_SUCCESS;
2570 }
2571 
2572 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2573     KEY searchkey;
2574     traverse_ptr tp;
2575     NTSTATUS Status;
2576     LIST_ENTRY* le;
2577     volume_device_extension* vde;
2578 
2579     if (Vcb->need_write) {
2580         Status = do_write(Vcb, NULL);
2581 
2582         if (!NT_SUCCESS(Status))
2583             ERR("do_write returned %08x\n", Status);
2584     } else
2585         Status = STATUS_SUCCESS;
2586 
2587     free_trees(Vcb);
2588 
2589     if (!NT_SUCCESS(Status))
2590         return Status;
2591 
2592     // remove entry in chunk tree
2593 
2594     searchkey.obj_id = 1;
2595     searchkey.obj_type = TYPE_DEV_ITEM;
2596     searchkey.offset = dev->devitem.dev_id;
2597 
2598     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, FALSE, NULL);
2599     if (!NT_SUCCESS(Status)) {
2600         ERR("find_item returned %08x\n", Status);
2601         return Status;
2602     }
2603 
2604     if (!keycmp(searchkey, tp.item->key)) {
2605         Status = delete_tree_item(Vcb, &tp);
2606 
2607         if (!NT_SUCCESS(Status)) {
2608             ERR("delete_tree_item returned %08x\n", Status);
2609             return Status;
2610         }
2611     }
2612 
2613     // remove stats entry in device tree
2614 
2615     searchkey.obj_id = 0;
2616     searchkey.obj_type = TYPE_DEV_STATS;
2617     searchkey.offset = dev->devitem.dev_id;
2618 
2619     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2620     if (!NT_SUCCESS(Status)) {
2621         ERR("find_item returned %08x\n", Status);
2622         return Status;
2623     }
2624 
2625     if (!keycmp(searchkey, tp.item->key)) {
2626         Status = delete_tree_item(Vcb, &tp);
2627 
2628         if (!NT_SUCCESS(Status)) {
2629             ERR("delete_tree_item returned %08x\n", Status);
2630             return Status;
2631         }
2632     }
2633 
2634     // update superblock
2635 
2636     Vcb->superblock.num_devices--;
2637     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2638     Vcb->devices_loaded--;
2639 
2640     RemoveEntryList(&dev->list_entry);
2641 
2642     // flush
2643 
2644     Status = do_write(Vcb, NULL);
2645     if (!NT_SUCCESS(Status))
2646         ERR("do_write returned %08x\n", Status);
2647 
2648     free_trees(Vcb);
2649 
2650     if (!NT_SUCCESS(Status))
2651         return Status;
2652 
2653     if (!dev->readonly && dev->devobj) {
2654         Status = remove_superblocks(dev);
2655         if (!NT_SUCCESS(Status))
2656             WARN("remove_superblocks returned %08x\n", Status);
2657     }
2658 
2659     // remove entry in volume list
2660 
2661     vde = Vcb->vde;
2662 
2663     if (dev->devobj) {
2664         pdo_device_extension* pdode = vde->pdode;
2665 
2666         ExAcquireResourceExclusiveLite(&pdode->child_lock, TRUE);
2667 
2668         le = pdode->children.Flink;
2669         while (le != &pdode->children) {
2670             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2671 
2672             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2673                 PFILE_OBJECT FileObject;
2674                 PDEVICE_OBJECT mountmgr;
2675                 UNICODE_STRING mmdevpath;
2676 
2677                 pdode->children_loaded--;
2678 
2679                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2680                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2681                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2682                     if (!NT_SUCCESS(Status))
2683                         ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2684                     else {
2685                         MOUNTDEV_NAME mdn;
2686 
2687                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), TRUE, NULL);
2688                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2689                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2690                         else {
2691                             MOUNTDEV_NAME* mdn2;
2692                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2693 
2694                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2695                             if (!mdn2)
2696                                 ERR("out of memory\n");
2697                             else {
2698                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, TRUE, NULL);
2699                                 if (!NT_SUCCESS(Status))
2700                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2701                                 else {
2702                                     UNICODE_STRING name;
2703 
2704                                     name.Buffer = mdn2->Name;
2705                                     name.Length = name.MaximumLength = mdn2->NameLength;
2706 
2707                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2708                                     if (!NT_SUCCESS(Status))
2709                                         WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2710                                 }
2711 
2712                                 ExFreePool(mdn2);
2713                             }
2714                         }
2715 
2716                         ObDereferenceObject(FileObject);
2717                     }
2718                 }
2719 
2720                 ExFreePool(vc->pnp_name.Buffer);
2721                 RemoveEntryList(&vc->list_entry);
2722                 ExFreePool(vc);
2723 
2724                 ObDereferenceObject(vc->fileobj);
2725 
2726                 break;
2727             }
2728 
2729             le = le->Flink;
2730         }
2731 
2732         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2733             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2734 
2735             le = pdode->children.Flink;
2736             while (le != &pdode->children) {
2737                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2738 
2739                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2740                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2741                     break;
2742                 }
2743 
2744                 le = le->Flink;
2745             }
2746         }
2747 
2748         pdode->num_children = Vcb->superblock.num_devices;
2749 
2750         ExReleaseResourceLite(&pdode->child_lock);
2751 
2752         // free dev
2753 
2754         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2755             trim_whole_device(dev);
2756     }
2757 
2758     while (!IsListEmpty(&dev->space)) {
2759         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2760         space* s = CONTAINING_RECORD(le2, space, list_entry);
2761 
2762         ExFreePool(s);
2763     }
2764 
2765     ExFreePool(dev);
2766 
2767     if (Vcb->trim) {
2768         Vcb->trim = FALSE;
2769 
2770         le = Vcb->devices.Flink;
2771         while (le != &Vcb->devices) {
2772             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2773 
2774             if (dev2->trim) {
2775                 Vcb->trim = TRUE;
2776                 break;
2777             }
2778 
2779             le = le->Flink;
2780         }
2781     }
2782 
2783     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2784 
2785     return STATUS_SUCCESS;
2786 }
2787 
2788 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2789     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2790     DEVICE_DATA_SET_RANGE* ranges;
2791     ULONG datalen, i;
2792     KEY searchkey;
2793     traverse_ptr tp;
2794     NTSTATUS Status;
2795     BOOL b;
2796     UINT64 lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2797     LIST_ENTRY* le;
2798 
2799     dev->num_trim_entries = 0;
2800 
2801     searchkey.obj_id = dev->devitem.dev_id;
2802     searchkey.obj_type = TYPE_DEV_EXTENT;
2803     searchkey.offset = 0;
2804 
2805     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2806     if (!NT_SUCCESS(Status)) {
2807         ERR("find_item returned %08x\n", Status);
2808         return;
2809     }
2810 
2811     do {
2812         traverse_ptr next_tp;
2813 
2814         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2815             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2816                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2817 
2818                 if (tp.item->key.offset > lastoff)
2819                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2820 
2821                 lastoff = tp.item->key.offset + de->length;
2822             } else {
2823                 ERR("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2824                 return;
2825             }
2826         }
2827 
2828         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
2829 
2830         if (b) {
2831             tp = next_tp;
2832             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2833                 break;
2834         }
2835     } while (b);
2836 
2837     if (lastoff < dev->devitem.num_bytes)
2838         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2839 
2840     if (dev->num_trim_entries == 0)
2841         return;
2842 
2843     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2844 
2845     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2846     if (!dmdsa) {
2847         ERR("out of memory\n");
2848         goto end;
2849     }
2850 
2851     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2852     dmdsa->Action = DeviceDsmAction_Trim;
2853     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2854     dmdsa->ParameterBlockOffset = 0;
2855     dmdsa->ParameterBlockLength = 0;
2856     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64));
2857     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2858 
2859     ranges = (DEVICE_DATA_SET_RANGE*)((UINT8*)dmdsa + dmdsa->DataSetRangesOffset);
2860 
2861     i = 0;
2862     le = dev->trim_list.Flink;
2863     while (le != &dev->trim_list) {
2864         space* s = CONTAINING_RECORD(le, space, list_entry);
2865 
2866         ranges[i].StartingOffset = s->address;
2867         ranges[i].LengthInBytes = s->size;
2868         i++;
2869 
2870         le = le->Flink;
2871     }
2872 
2873     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, TRUE, NULL);
2874     if (!NT_SUCCESS(Status))
2875         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2876 
2877     ExFreePool(dmdsa);
2878 
2879 end:
2880     while (!IsListEmpty(&dev->trim_list)) {
2881         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2882         ExFreePool(s);
2883     }
2884 
2885     dev->num_trim_entries = 0;
2886 }
2887 
2888 static NTSTATUS try_consolidation(device_extension* Vcb, UINT64 flags, chunk** newchunk) {
2889     NTSTATUS Status;
2890     BOOL changed;
2891     LIST_ENTRY* le;
2892     chunk* rc;
2893 
2894     // FIXME - allow with metadata chunks?
2895 
2896     while (TRUE) {
2897         rc = NULL;
2898 
2899         ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
2900 
2901         ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
2902 
2903         // choose the least-used chunk we haven't looked at yet
2904         le = Vcb->chunks.Flink;
2905         while (le != &Vcb->chunks) {
2906             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2907 
2908             // FIXME - skip full-size chunks over e.g. 90% full?
2909             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2910                 rc = c;
2911 
2912             le = le->Flink;
2913         }
2914 
2915         ExReleaseResourceLite(&Vcb->chunk_lock);
2916 
2917         if (!rc) {
2918             ExReleaseResourceLite(&Vcb->tree_lock);
2919             break;
2920         }
2921 
2922         if (rc->list_entry_balance.Flink) {
2923             RemoveEntryList(&rc->list_entry_balance);
2924             Vcb->balance.chunks_left--;
2925         }
2926 
2927         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2928         rc->reloc = TRUE;
2929 
2930         ExReleaseResourceLite(&Vcb->tree_lock);
2931 
2932         do {
2933             changed = FALSE;
2934 
2935             Status = balance_data_chunk(Vcb, rc, &changed);
2936             if (!NT_SUCCESS(Status)) {
2937                 ERR("balance_data_chunk returned %08x\n", Status);
2938                 Vcb->balance.status = Status;
2939                 rc->list_entry_balance.Flink = NULL;
2940                 rc->reloc = FALSE;
2941                 return Status;
2942             }
2943 
2944             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
2945 
2946             if (Vcb->readonly)
2947                 Vcb->balance.stopping = TRUE;
2948 
2949             if (Vcb->balance.stopping)
2950                 return STATUS_SUCCESS;
2951         } while (changed);
2952 
2953         rc->list_entry_balance.Flink = NULL;
2954 
2955         rc->changed = TRUE;
2956         rc->space_changed = TRUE;
2957         rc->balance_num = Vcb->balance.balance_num;
2958 
2959         Status = do_write(Vcb, NULL);
2960         if (!NT_SUCCESS(Status)) {
2961             ERR("do_write returned %08x\n", Status);
2962             return Status;
2963         }
2964     }
2965 
2966     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
2967 
2968     Status = alloc_chunk(Vcb, flags, &rc, TRUE);
2969 
2970     ExReleaseResourceLite(&Vcb->chunk_lock);
2971 
2972     if (NT_SUCCESS(Status)) {
2973         *newchunk = rc;
2974         return Status;
2975     } else {
2976         ERR("alloc_chunk returned %08x\n", Status);
2977         return Status;
2978     }
2979 }
2980 
2981 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2982     LIST_ENTRY* le;
2983 
2984     while (!IsListEmpty(&dev->space)) {
2985         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
2986 
2987         ExFreePool(s);
2988     }
2989 
2990     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
2991 
2992     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
2993 
2994     le = Vcb->chunks.Flink;
2995     while (le != &Vcb->chunks) {
2996         UINT16 n;
2997         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2998         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2999 
3000         for (n = 0; n < c->chunk_item->num_stripes; n++) {
3001             UINT64 stripe_size = 0;
3002 
3003             if (cis[n].dev_id == dev->devitem.dev_id) {
3004                 if (stripe_size == 0) {
3005                     UINT16 factor;
3006 
3007                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3008                         factor = c->chunk_item->num_stripes;
3009                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3010                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3011                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3012                         factor = c->chunk_item->num_stripes - 1;
3013                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3014                         factor = c->chunk_item->num_stripes - 2;
3015                     else // SINGLE, DUP, RAID1
3016                         factor = 1;
3017 
3018                     stripe_size = c->chunk_item->size / factor;
3019                 }
3020 
3021                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3022             }
3023         }
3024 
3025         le = le->Flink;
3026     }
3027 
3028     return STATUS_SUCCESS;
3029 }
3030 
3031 _Function_class_(KSTART_ROUTINE)
3032 #ifndef __REACTOS__
3033 void balance_thread(void* context) {
3034 #else
3035 void NTAPI balance_thread(void* context) {
3036 #endif
3037     device_extension* Vcb = (device_extension*)context;
3038     LIST_ENTRY chunks;
3039     LIST_ENTRY* le;
3040     UINT64 num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3041     UINT64 old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3042     NTSTATUS Status;
3043 
3044     Vcb->balance.balance_num++;
3045 
3046     Vcb->balance.stopping = FALSE;
3047     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, FALSE);
3048 
3049     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3050         old_data_flags = Vcb->data_flags;
3051         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3052 
3053         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3054     }
3055 
3056     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3057         old_metadata_flags = Vcb->metadata_flags;
3058         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3059     }
3060 
3061     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062         old_system_flags = Vcb->system_flags;
3063         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3064     }
3065 
3066     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3067         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3068             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3069         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3070             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3071     }
3072 
3073     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3074     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3075 
3076     InitializeListHead(&chunks);
3077 
3078     // FIXME - what are we supposed to do with limit_start?
3079 
3080     if (!Vcb->readonly) {
3081         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3082             Status = add_balance_item(Vcb);
3083             if (!NT_SUCCESS(Status)) {
3084                 ERR("add_balance_item returned %08x\n", Status);
3085                 Vcb->balance.status = Status;
3086                 goto end;
3087             }
3088         } else {
3089             if (Vcb->need_write) {
3090                 Status = do_write(Vcb, NULL);
3091 
3092                 free_trees(Vcb);
3093 
3094                 if (!NT_SUCCESS(Status)) {
3095                     ERR("do_write returned %08x\n", Status);
3096                     Vcb->balance.status = Status;
3097                     goto end;
3098                 }
3099             }
3100         }
3101     }
3102 
3103     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3104 
3105     if (Vcb->balance.stopping)
3106         goto end;
3107 
3108     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3109 
3110     le = Vcb->chunks.Flink;
3111     while (le != &Vcb->chunks) {
3112         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3113         UINT8 sort;
3114 
3115         ExAcquireResourceExclusiveLite(&c->lock, TRUE);
3116 
3117         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3118             sort = BALANCE_OPTS_DATA;
3119         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3120             sort = BALANCE_OPTS_METADATA;
3121         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3122             sort = BALANCE_OPTS_SYSTEM;
3123         else {
3124             ERR("unexpected chunk type %llx\n", c->chunk_item->type);
3125             ExReleaseResourceLite(&c->lock);
3126             break;
3127         }
3128 
3129         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3130             should_balance_chunk(Vcb, sort, c)) {
3131             InsertTailList(&chunks, &c->list_entry_balance);
3132 
3133             num_chunks[sort]++;
3134             Vcb->balance.total_chunks++;
3135             Vcb->balance.chunks_left++;
3136         } else if (sort == BALANCE_OPTS_METADATA)
3137             okay_metadata_chunks++;
3138         else if (sort == BALANCE_OPTS_DATA)
3139             okay_data_chunks++;
3140         else if (sort == BALANCE_OPTS_SYSTEM)
3141             okay_system_chunks++;
3142 
3143         if (!c->cache_loaded) {
3144             Status = load_cache_chunk(Vcb, c, NULL);
3145 
3146             if (!NT_SUCCESS(Status)) {
3147                 ERR("load_cache_chunk returned %08x\n", Status);
3148                 Vcb->balance.status = Status;
3149                 ExReleaseResourceLite(&c->lock);
3150                 ExReleaseResourceLite(&Vcb->chunk_lock);
3151                 goto end;
3152             }
3153         }
3154 
3155         ExReleaseResourceLite(&c->lock);
3156 
3157         le = le->Flink;
3158     }
3159 
3160     ExReleaseResourceLite(&Vcb->chunk_lock);
3161 
3162     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3163     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3164         BOOL consolidated = FALSE;
3165         chunk* c;
3166 
3167         if (okay_metadata_chunks == 0) {
3168             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3169 
3170             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, TRUE);
3171             if (NT_SUCCESS(Status))
3172                 c->balance_num = Vcb->balance.balance_num;
3173             else if (Status != STATUS_DISK_FULL || consolidated) {
3174                 ERR("alloc_chunk returned %08x\n", Status);
3175                 ExReleaseResourceLite(&Vcb->chunk_lock);
3176                 Vcb->balance.status = Status;
3177                 goto end;
3178             }
3179 
3180             ExReleaseResourceLite(&Vcb->chunk_lock);
3181 
3182             if (Status == STATUS_DISK_FULL) {
3183                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3184                 if (!NT_SUCCESS(Status)) {
3185                     ERR("try_consolidation returned %08x\n", Status);
3186                     Vcb->balance.status = Status;
3187                     goto end;
3188                 } else
3189                     c->balance_num = Vcb->balance.balance_num;
3190 
3191                 consolidated = TRUE;
3192 
3193                 if (Vcb->balance.stopping)
3194                     goto end;
3195             }
3196         }
3197 
3198         if (okay_data_chunks == 0) {
3199             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3200 
3201             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, TRUE);
3202             if (NT_SUCCESS(Status))
3203                 c->balance_num = Vcb->balance.balance_num;
3204             else if (Status != STATUS_DISK_FULL || consolidated) {
3205                 ERR("alloc_chunk returned %08x\n", Status);
3206                 ExReleaseResourceLite(&Vcb->chunk_lock);
3207                 Vcb->balance.status = Status;
3208                 goto end;
3209             }
3210 
3211             ExReleaseResourceLite(&Vcb->chunk_lock);
3212 
3213             if (Status == STATUS_DISK_FULL) {
3214                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3215                 if (!NT_SUCCESS(Status)) {
3216                     ERR("try_consolidation returned %08x\n", Status);
3217                     Vcb->balance.status = Status;
3218                     goto end;
3219                 } else
3220                     c->balance_num = Vcb->balance.balance_num;
3221 
3222                 consolidated = TRUE;
3223 
3224                 if (Vcb->balance.stopping)
3225                     goto end;
3226             }
3227         }
3228 
3229         if (okay_system_chunks == 0) {
3230             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3231 
3232             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, TRUE);
3233             if (NT_SUCCESS(Status))
3234                 c->balance_num = Vcb->balance.balance_num;
3235             else if (Status != STATUS_DISK_FULL || consolidated) {
3236                 ERR("alloc_chunk returned %08x\n", Status);
3237                 ExReleaseResourceLite(&Vcb->chunk_lock);
3238                 Vcb->balance.status = Status;
3239                 goto end;
3240             }
3241 
3242             ExReleaseResourceLite(&Vcb->chunk_lock);
3243 
3244             if (Status == STATUS_DISK_FULL) {
3245                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3246                 if (!NT_SUCCESS(Status)) {
3247                     ERR("try_consolidation returned %08x\n", Status);
3248                     Vcb->balance.status = Status;
3249                     goto end;
3250                 } else
3251                     c->balance_num = Vcb->balance.balance_num;
3252 
3253                 consolidated = TRUE;
3254 
3255                 if (Vcb->balance.stopping)
3256                     goto end;
3257             }
3258         }
3259     }
3260 
3261     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3262 
3263     le = chunks.Flink;
3264     while (le != &chunks) {
3265         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3266 
3267         c->reloc = TRUE;
3268 
3269         le = le->Flink;
3270     }
3271 
3272     ExReleaseResourceLite(&Vcb->chunk_lock);
3273 
3274     // do data chunks before metadata
3275     le = chunks.Flink;
3276     while (le != &chunks) {
3277         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278         LIST_ENTRY* le2 = le->Flink;
3279 
3280         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3281             BOOL changed;
3282 
3283             do {
3284                 changed = FALSE;
3285 
3286                 Status = balance_data_chunk(Vcb, c, &changed);
3287                 if (!NT_SUCCESS(Status)) {
3288                     ERR("balance_data_chunk returned %08x\n", Status);
3289                     Vcb->balance.status = Status;
3290                     goto end;
3291                 }
3292 
3293                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3294 
3295                 if (Vcb->readonly)
3296                     Vcb->balance.stopping = TRUE;
3297 
3298                 if (Vcb->balance.stopping)
3299                     break;
3300             } while (changed);
3301 
3302             c->changed = TRUE;
3303             c->space_changed = TRUE;
3304         }
3305 
3306         if (Vcb->balance.stopping)
3307             goto end;
3308 
3309         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3310             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3311             RemoveEntryList(&c->list_entry_balance);
3312             c->list_entry_balance.Flink = NULL;
3313 
3314             Vcb->balance.chunks_left--;
3315         }
3316 
3317         le = le2;
3318     }
3319 
3320     // do metadata chunks
3321     while (!IsListEmpty(&chunks)) {
3322         chunk* c;
3323         BOOL changed;
3324 
3325         le = RemoveHeadList(&chunks);
3326         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3327 
3328         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3329             do {
3330                 Status = balance_metadata_chunk(Vcb, c, &changed);
3331                 if (!NT_SUCCESS(Status)) {
3332                     ERR("balance_metadata_chunk returned %08x\n", Status);
3333                     Vcb->balance.status = Status;
3334                     goto end;
3335                 }
3336 
3337                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3338 
3339                 if (Vcb->readonly)
3340                     Vcb->balance.stopping = TRUE;
3341 
3342                 if (Vcb->balance.stopping)
3343                     break;
3344             } while (changed);
3345 
3346             c->changed = TRUE;
3347             c->space_changed = TRUE;
3348         }
3349 
3350         if (Vcb->balance.stopping)
3351             break;
3352 
3353         c->list_entry_balance.Flink = NULL;
3354 
3355         Vcb->balance.chunks_left--;
3356     }
3357 
3358 end:
3359     if (!Vcb->readonly) {
3360         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3361             le = chunks.Flink;
3362             while (le != &chunks) {
3363                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3364                 c->reloc = FALSE;
3365 
3366                 le = le->Flink;
3367                 c->list_entry_balance.Flink = NULL;
3368             }
3369 
3370             if (old_data_flags != 0)
3371                 Vcb->data_flags = old_data_flags;
3372 
3373             if (old_metadata_flags != 0)
3374                 Vcb->metadata_flags = old_metadata_flags;
3375 
3376             if (old_system_flags != 0)
3377                 Vcb->system_flags = old_system_flags;
3378         }
3379 
3380         if (Vcb->balance.removing) {
3381             device* dev = NULL;
3382 
3383             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3384 
3385             le = Vcb->devices.Flink;
3386             while (le != &Vcb->devices) {
3387                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3388 
3389                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3390                     dev = dev2;
3391                     break;
3392                 }
3393 
3394                 le = le->Flink;
3395             }
3396 
3397             if (dev) {
3398                 if (Vcb->balance.chunks_left == 0) {
3399                     Status = finish_removing_device(Vcb, dev);
3400 
3401                     if (!NT_SUCCESS(Status)) {
3402                         ERR("finish_removing_device returned %08x\n", Status);
3403                         dev->reloc = FALSE;
3404                     }
3405                 } else
3406                     dev->reloc = FALSE;
3407             }
3408 
3409             ExReleaseResourceLite(&Vcb->tree_lock);
3410         } else if (Vcb->balance.shrinking) {
3411             device* dev = NULL;
3412 
3413             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3414 
3415             le = Vcb->devices.Flink;
3416             while (le != &Vcb->devices) {
3417                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3418 
3419                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3420                     dev = dev2;
3421                     break;
3422                 }
3423 
3424                 le = le->Flink;
3425             }
3426 
3427             if (!dev) {
3428                 ERR("could not find device %llx\n", Vcb->balance.opts[0].devid);
3429                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3430             }
3431 
3432             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3433                 if (dev) {
3434                     Status = regenerate_space_list(Vcb, dev);
3435                     if (!NT_SUCCESS(Status))
3436                         WARN("regenerate_space_list returned %08x\n", Status);
3437                 }
3438             } else {
3439                 UINT64 old_size;
3440 
3441                 old_size = dev->devitem.num_bytes;
3442                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3443 
3444                 Status = update_dev_item(Vcb, dev, NULL);
3445                 if (!NT_SUCCESS(Status)) {
3446                     ERR("update_dev_item returned %08x\n", Status);
3447                     dev->devitem.num_bytes = old_size;
3448                     Vcb->balance.status = Status;
3449 
3450                     Status = regenerate_space_list(Vcb, dev);
3451                     if (!NT_SUCCESS(Status))
3452                         WARN("regenerate_space_list returned %08x\n", Status);
3453                 } else {
3454                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3455 
3456                     Status = do_write(Vcb, NULL);
3457                     if (!NT_SUCCESS(Status))
3458                         ERR("do_write returned %08x\n", Status);
3459 
3460                     free_trees(Vcb);
3461                 }
3462             }
3463 
3464             ExReleaseResourceLite(&Vcb->tree_lock);
3465 
3466             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3467                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3468         } else {
3469             Status = remove_balance_item(Vcb);
3470             if (!NT_SUCCESS(Status)) {
3471                 ERR("remove_balance_item returned %08x\n", Status);
3472                 goto end;
3473             }
3474         }
3475 
3476         if (Vcb->trim && !Vcb->options.no_trim) {
3477             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3478 
3479             le = Vcb->devices.Flink;
3480             while (le != &Vcb->devices) {
3481                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3482 
3483                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3484                     trim_unalloc_space(Vcb, dev2);
3485 
3486                 le = le->Flink;
3487             }
3488 
3489             ExReleaseResourceLite(&Vcb->tree_lock);
3490         }
3491     }
3492 
3493     ZwClose(Vcb->balance.thread);
3494     Vcb->balance.thread = NULL;
3495 
3496     KeSetEvent(&Vcb->balance.finished, 0, FALSE);
3497 }
3498 
3499 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3500     NTSTATUS Status;
3501     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3502     UINT8 i;
3503 
3504     if (length < sizeof(btrfs_start_balance) || !data)
3505         return STATUS_INVALID_PARAMETER;
3506 
3507     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3508         return STATUS_PRIVILEGE_NOT_HELD;
3509 
3510     if (Vcb->locked) {
3511         WARN("cannot start balance while locked\n");
3512         return STATUS_DEVICE_NOT_READY;
3513     }
3514 
3515     if (Vcb->scrub.thread) {
3516         WARN("cannot start balance while scrub running\n");
3517         return STATUS_DEVICE_NOT_READY;
3518     }
3519 
3520     if (Vcb->balance.thread) {
3521         WARN("balance already running\n");
3522         return STATUS_DEVICE_NOT_READY;
3523     }
3524 
3525     if (Vcb->readonly)
3526         return STATUS_MEDIA_WRITE_PROTECTED;
3527 
3528     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3529         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3530         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3531         return STATUS_SUCCESS;
3532 
3533     for (i = 0; i < 3; i++) {
3534         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3535             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3536                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3537                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3538 
3539                 if (bsb->opts[i].profiles == 0)
3540                     return STATUS_INVALID_PARAMETER;
3541             }
3542 
3543             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3544                 if (bsb->opts[i].devid == 0)
3545                     return STATUS_INVALID_PARAMETER;
3546             }
3547 
3548             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3549                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3550                     return STATUS_INVALID_PARAMETER;
3551             }
3552 
3553             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3554                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3555                     return STATUS_INVALID_PARAMETER;
3556             }
3557 
3558             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3559                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3560                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3561 
3562                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3563                     return STATUS_INVALID_PARAMETER;
3564             }
3565 
3566             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3567                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3568                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3569 
3570                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3571                     return STATUS_INVALID_PARAMETER;
3572             }
3573 
3574             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3575                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3576                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3577 
3578                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3579                     return STATUS_INVALID_PARAMETER;
3580             }
3581 
3582             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3583                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3584                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3585                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3586                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3587                     return STATUS_INVALID_PARAMETER;
3588             }
3589         }
3590     }
3591 
3592     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3593     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3594     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3595 
3596     Vcb->balance.paused = FALSE;
3597     Vcb->balance.removing = FALSE;
3598     Vcb->balance.shrinking = FALSE;
3599     Vcb->balance.status = STATUS_SUCCESS;
3600     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3601 
3602     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3603     if (!NT_SUCCESS(Status)) {
3604         ERR("PsCreateSystemThread returned %08x\n", Status);
3605         return Status;
3606     }
3607 
3608     return STATUS_SUCCESS;
3609 }
3610 
3611 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3612     KEY searchkey;
3613     traverse_ptr tp;
3614     NTSTATUS Status;
3615     BALANCE_ITEM* bi;
3616     int i;
3617 
3618     searchkey.obj_id = BALANCE_ITEM_ID;
3619     searchkey.obj_type = TYPE_TEMP_ITEM;
3620     searchkey.offset = 0;
3621 
3622     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
3623     if (!NT_SUCCESS(Status)) {
3624         ERR("find_item returned %08x\n", Status);
3625         return Status;
3626     }
3627 
3628     if (keycmp(tp.item->key, searchkey)) {
3629         TRACE("no balance item found\n");
3630         return STATUS_NOT_FOUND;
3631     }
3632 
3633     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3634         WARN("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3635              tp.item->size, sizeof(BALANCE_ITEM));
3636         return STATUS_INTERNAL_ERROR;
3637     }
3638 
3639     bi = (BALANCE_ITEM*)tp.item->data;
3640 
3641     if (bi->flags & BALANCE_FLAGS_DATA)
3642         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3643 
3644     if (bi->flags & BALANCE_FLAGS_METADATA)
3645         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3646 
3647     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3648         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3649 
3650     // do the heuristics that Linux driver does
3651 
3652     for (i = 0; i < 3; i++) {
3653         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3654             // if converting, don't redo chunks already done
3655 
3656             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3657                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3658 
3659             // don't balance chunks more than 90% filled - presumably these
3660             // have already been done
3661 
3662             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3663                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3664             ) {
3665                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3666                 Vcb->balance.opts[i].usage_start = 0;
3667                 Vcb->balance.opts[i].usage_end = 90;
3668             }
3669         }
3670     }
3671 
3672     if (Vcb->readonly || Vcb->options.skip_balance)
3673         Vcb->balance.paused = TRUE;
3674     else
3675         Vcb->balance.paused = FALSE;
3676 
3677     Vcb->balance.removing = FALSE;
3678     Vcb->balance.shrinking = FALSE;
3679     Vcb->balance.status = STATUS_SUCCESS;
3680     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3681 
3682     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3683     if (!NT_SUCCESS(Status)) {
3684         ERR("PsCreateSystemThread returned %08x\n", Status);
3685         return Status;
3686     }
3687 
3688     return STATUS_SUCCESS;
3689 }
3690 
3691 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3692     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3693 
3694     if (length < sizeof(btrfs_query_balance) || !data)
3695         return STATUS_INVALID_PARAMETER;
3696 
3697     if (!Vcb->balance.thread) {
3698         bqb->status = BTRFS_BALANCE_STOPPED;
3699 
3700         if (!NT_SUCCESS(Vcb->balance.status)) {
3701             bqb->status |= BTRFS_BALANCE_ERROR;
3702             bqb->error = Vcb->balance.status;
3703         }
3704 
3705         return STATUS_SUCCESS;
3706     }
3707 
3708     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3709 
3710     if (Vcb->balance.removing)
3711         bqb->status |= BTRFS_BALANCE_REMOVAL;
3712 
3713     if (Vcb->balance.shrinking)
3714         bqb->status |= BTRFS_BALANCE_SHRINKING;
3715 
3716     if (!NT_SUCCESS(Vcb->balance.status))
3717         bqb->status |= BTRFS_BALANCE_ERROR;
3718 
3719     bqb->chunks_left = Vcb->balance.chunks_left;
3720     bqb->total_chunks = Vcb->balance.total_chunks;
3721     bqb->error = Vcb->balance.status;
3722     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3723     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3724     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3725 
3726     return STATUS_SUCCESS;
3727 }
3728 
3729 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3730     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3731         return STATUS_PRIVILEGE_NOT_HELD;
3732 
3733     if (!Vcb->balance.thread)
3734         return STATUS_DEVICE_NOT_READY;
3735 
3736     if (Vcb->balance.paused)
3737         return STATUS_DEVICE_NOT_READY;
3738 
3739     Vcb->balance.paused = TRUE;
3740     KeClearEvent(&Vcb->balance.event);
3741 
3742     return STATUS_SUCCESS;
3743 }
3744 
3745 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3746     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3747         return STATUS_PRIVILEGE_NOT_HELD;
3748 
3749     if (!Vcb->balance.thread)
3750         return STATUS_DEVICE_NOT_READY;
3751 
3752     if (!Vcb->balance.paused)
3753         return STATUS_DEVICE_NOT_READY;
3754 
3755     if (Vcb->readonly)
3756         return STATUS_MEDIA_WRITE_PROTECTED;
3757 
3758     Vcb->balance.paused = FALSE;
3759     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3760 
3761     return STATUS_SUCCESS;
3762 }
3763 
3764 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3765     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3766         return STATUS_PRIVILEGE_NOT_HELD;
3767 
3768     if (!Vcb->balance.thread)
3769         return STATUS_DEVICE_NOT_READY;
3770 
3771     Vcb->balance.paused = FALSE;
3772     Vcb->balance.stopping = TRUE;
3773     Vcb->balance.status = STATUS_SUCCESS;
3774     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3775 
3776     return STATUS_SUCCESS;
3777 }
3778 
3779 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3780     UINT64 devid;
3781     LIST_ENTRY* le;
3782     device* dev = NULL;
3783     NTSTATUS Status;
3784     int i;
3785     UINT64 num_rw_devices;
3786 
3787     TRACE("(%p, %p, %x)\n", Vcb, data, length);
3788 
3789     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3790         return STATUS_PRIVILEGE_NOT_HELD;
3791 
3792     if (length < sizeof(UINT64))
3793         return STATUS_INVALID_PARAMETER;
3794 
3795     devid = *(UINT64*)data;
3796 
3797     ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
3798 
3799     if (Vcb->readonly) {
3800         ExReleaseResourceLite(&Vcb->tree_lock);
3801         return STATUS_MEDIA_WRITE_PROTECTED;
3802     }
3803 
3804     num_rw_devices = 0;
3805 
3806     le = Vcb->devices.Flink;
3807     while (le != &Vcb->devices) {
3808         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3809 
3810         if (dev2->devitem.dev_id == devid)
3811             dev = dev2;
3812 
3813         if (!dev2->readonly)
3814             num_rw_devices++;
3815 
3816         le = le->Flink;
3817     }
3818 
3819     if (!dev) {
3820         ExReleaseResourceLite(&Vcb->tree_lock);
3821         WARN("device %llx not found\n", devid);
3822         return STATUS_NOT_FOUND;
3823     }
3824 
3825     if (!dev->readonly) {
3826         if (num_rw_devices == 1) {
3827             ExReleaseResourceLite(&Vcb->tree_lock);
3828             WARN("not removing last non-readonly device\n");
3829             return STATUS_INVALID_PARAMETER;
3830         }
3831 
3832         if (num_rw_devices == 4 &&
3833             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3834              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3835         ) {
3836             ExReleaseResourceLite(&Vcb->tree_lock);
3837             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3838             return STATUS_CANNOT_DELETE;
3839         }
3840 
3841         if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3842             ExReleaseResourceLite(&Vcb->tree_lock);
3843             ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3844             return STATUS_CANNOT_DELETE;
3845         }
3846 
3847         if (num_rw_devices == 2 &&
3848             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3849              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3850         ) {
3851             ExReleaseResourceLite(&Vcb->tree_lock);
3852             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3853             return STATUS_CANNOT_DELETE;
3854         }
3855     }
3856 
3857     ExReleaseResourceLite(&Vcb->tree_lock);
3858 
3859     if (Vcb->balance.thread) {
3860         WARN("balance already running\n");
3861         return STATUS_DEVICE_NOT_READY;
3862     }
3863 
3864     dev->reloc = TRUE;
3865 
3866     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3867 
3868     for (i = 0; i < 3; i++) {
3869         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3870         Vcb->balance.opts[i].devid = devid;
3871     }
3872 
3873     Vcb->balance.paused = FALSE;
3874     Vcb->balance.removing = TRUE;
3875     Vcb->balance.shrinking = FALSE;
3876     Vcb->balance.status = STATUS_SUCCESS;
3877     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3878 
3879     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3880     if (!NT_SUCCESS(Status)) {
3881         ERR("PsCreateSystemThread returned %08x\n", Status);
3882         dev->reloc = FALSE;
3883         return Status;
3884     }
3885 
3886     return STATUS_SUCCESS;
3887 }
3888