xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 02e84521)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
23     UINT64 address;
24     UINT64 new_address;
25     tree_header* data;
26     EXTENT_ITEM* ei;
27     tree* t;
28     BOOL system;
29     LIST_ENTRY refs;
30     LIST_ENTRY list_entry;
31 } metadata_reloc;
32 
33 typedef struct {
34     UINT8 type;
35     UINT64 hash;
36 
37     union {
38         TREE_BLOCK_REF tbr;
39         SHARED_BLOCK_REF sbr;
40     };
41 
42     metadata_reloc* parent;
43     BOOL top;
44     LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46 
47 typedef struct {
48     UINT64 address;
49     UINT64 size;
50     UINT64 new_address;
51     chunk* newchunk;
52     EXTENT_ITEM* ei;
53     LIST_ENTRY refs;
54     LIST_ENTRY list_entry;
55 } data_reloc;
56 
57 typedef struct {
58     UINT8 type;
59     UINT64 hash;
60 
61     union {
62         EXTENT_DATA_REF edr;
63         SHARED_DATA_REF sdr;
64     };
65 
66     metadata_reloc* parent;
67     LIST_ENTRY list_entry;
68 } data_reloc_ref;
69 
70 #ifndef __REACTOS__
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74 #endif
75 
76 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
77 
78 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
79                                    BOOL skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
80     NTSTATUS Status;
81     metadata_reloc* mr;
82     EXTENT_ITEM* ei;
83     UINT16 len;
84     UINT64 inline_rc;
85     UINT8* ptr;
86 
87     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
88     if (!mr) {
89         ERR("out of memory\n");
90         return STATUS_INSUFFICIENT_RESOURCES;
91     }
92 
93     mr->address = tp->item->key.obj_id;
94     mr->data = NULL;
95     mr->ei = (EXTENT_ITEM*)tp->item->data;
96     mr->system = FALSE;
97     InitializeListHead(&mr->refs);
98 
99     Status = delete_tree_item(Vcb, tp);
100     if (!NT_SUCCESS(Status)) {
101         ERR("delete_tree_item returned %08x\n", Status);
102         ExFreePool(mr);
103         return Status;
104     }
105 
106     if (!c)
107         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
108 
109     if (c) {
110         acquire_chunk_lock(c, Vcb);
111 
112         c->used -= Vcb->superblock.node_size;
113 
114         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
115 
116         release_chunk_lock(c, Vcb);
117     }
118 
119     ei = (EXTENT_ITEM*)tp->item->data;
120     inline_rc = 0;
121 
122     len = tp->item->size - sizeof(EXTENT_ITEM);
123     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
124     if (!skinny) {
125         len -= sizeof(EXTENT_ITEM2);
126         ptr += sizeof(EXTENT_ITEM2);
127     }
128 
129     while (len > 0) {
130         UINT8 secttype = *ptr;
131         UINT16 sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
132         metadata_reloc_ref* ref;
133 
134         len--;
135 
136         if (sectlen > len) {
137             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
138             return STATUS_INTERNAL_ERROR;
139         }
140 
141         if (sectlen == 0) {
142             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
143             return STATUS_INTERNAL_ERROR;
144         }
145 
146         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
147         if (!ref) {
148             ERR("out of memory\n");
149             return STATUS_INSUFFICIENT_RESOURCES;
150         }
151 
152         if (secttype == TYPE_TREE_BLOCK_REF) {
153             ref->type = TYPE_TREE_BLOCK_REF;
154             RtlCopyMemory(&ref->tbr, ptr + sizeof(UINT8), sizeof(TREE_BLOCK_REF));
155             inline_rc++;
156         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
157             ref->type = TYPE_SHARED_BLOCK_REF;
158             RtlCopyMemory(&ref->sbr, ptr + sizeof(UINT8), sizeof(SHARED_BLOCK_REF));
159             inline_rc++;
160         } else {
161             ERR("unexpected tree type %x\n", secttype);
162             ExFreePool(ref);
163             return STATUS_INTERNAL_ERROR;
164         }
165 
166         ref->parent = NULL;
167         ref->top = FALSE;
168         InsertTailList(&mr->refs, &ref->list_entry);
169 
170         len -= sectlen;
171         ptr += sizeof(UINT8) + sectlen;
172     }
173 
174     if (inline_rc < ei->refcount) { // look for non-inline entries
175         traverse_ptr tp2 = *tp, next_tp;
176 
177         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
178             tp2 = next_tp;
179 
180             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
181                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
182                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
183                     if (!ref) {
184                         ERR("out of memory\n");
185                         return STATUS_INSUFFICIENT_RESOURCES;
186                     }
187 
188                     ref->type = TYPE_TREE_BLOCK_REF;
189                     ref->tbr.offset = tp2.item->key.offset;
190                     ref->parent = NULL;
191                     ref->top = FALSE;
192                     InsertTailList(&mr->refs, &ref->list_entry);
193 
194                     Status = delete_tree_item(Vcb, &tp2);
195                     if (!NT_SUCCESS(Status)) {
196                         ERR("delete_tree_item returned %08x\n", Status);
197                         return Status;
198                     }
199                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
200                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
201                     if (!ref) {
202                         ERR("out of memory\n");
203                         return STATUS_INSUFFICIENT_RESOURCES;
204                     }
205 
206                     ref->type = TYPE_SHARED_BLOCK_REF;
207                     ref->sbr.offset = tp2.item->key.offset;
208                     ref->parent = NULL;
209                     ref->top = FALSE;
210                     InsertTailList(&mr->refs, &ref->list_entry);
211 
212                     Status = delete_tree_item(Vcb, &tp2);
213                     if (!NT_SUCCESS(Status)) {
214                         ERR("delete_tree_item returned %08x\n", Status);
215                         return Status;
216                     }
217                 }
218             } else
219                 break;
220         }
221     }
222 
223     InsertTailList(items, &mr->list_entry);
224 
225     if (mr2)
226         *mr2 = mr;
227 
228     return STATUS_SUCCESS;
229 }
230 
231 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
232                                           UINT64 address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
233     LIST_ENTRY* le;
234     KEY searchkey;
235     traverse_ptr tp;
236     BOOL skinny = FALSE;
237     NTSTATUS Status;
238 
239     le = items->Flink;
240     while (le != items) {
241         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
242 
243         if (mr->address == address) {
244             *mr2 = mr;
245             return STATUS_SUCCESS;
246         }
247 
248         le = le->Flink;
249     }
250 
251     searchkey.obj_id = address;
252     searchkey.obj_type = TYPE_METADATA_ITEM;
253     searchkey.offset = 0xffffffffffffffff;
254 
255     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
256     if (!NT_SUCCESS(Status)) {
257         ERR("find_item returned %08x\n", Status);
258         return Status;
259     }
260 
261     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
262         skinny = TRUE;
263     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
264              tp.item->size >= sizeof(EXTENT_ITEM)) {
265         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
266 
267         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
268             ERR("EXTENT_ITEM for %llx found, but tree flag not set\n", address);
269             return STATUS_INTERNAL_ERROR;
270         }
271     } else {
272         ERR("could not find valid EXTENT_ITEM for address %llx\n", address);
273         return STATUS_INTERNAL_ERROR;
274     }
275 
276     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
277     if (!NT_SUCCESS(Status)) {
278         ERR("add_metadata_reloc returned %08x\n", Status);
279         return Status;
280     }
281 
282     return STATUS_SUCCESS;
283 }
284 
285 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
286     LIST_ENTRY newlist, *le;
287 
288     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
289         return;
290 
291     // insertion sort
292 
293     InitializeListHead(&newlist);
294 
295     while (!IsListEmpty(&mr->refs)) {
296         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
297         BOOL inserted = FALSE;
298 
299         if (ref->type == TYPE_TREE_BLOCK_REF)
300             ref->hash = ref->tbr.offset;
301         else if (ref->type == TYPE_SHARED_BLOCK_REF)
302             ref->hash = ref->parent->new_address;
303 
304         le = newlist.Flink;
305         while (le != &newlist) {
306             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
307 
308             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
309                 InsertHeadList(le->Blink, &ref->list_entry);
310                 inserted = TRUE;
311                 break;
312             }
313 
314             le = le->Flink;
315         }
316 
317         if (!inserted)
318             InsertTailList(&newlist, &ref->list_entry);
319     }
320 
321     newlist.Flink->Blink = &mr->refs;
322     newlist.Blink->Flink = &mr->refs;
323     mr->refs.Flink = newlist.Flink;
324     mr->refs.Blink = newlist.Blink;
325 }
326 
327 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
328     NTSTATUS Status;
329     LIST_ENTRY* le;
330     UINT64 rc = 0;
331     UINT16 inline_len;
332     BOOL all_inline = TRUE;
333     metadata_reloc_ref* first_noninline = NULL;
334     EXTENT_ITEM* ei;
335     UINT8* ptr;
336 
337     inline_len = sizeof(EXTENT_ITEM);
338     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
339         inline_len += sizeof(EXTENT_ITEM2);
340 
341     sort_metadata_reloc_refs(mr);
342 
343     le = mr->refs.Flink;
344     while (le != &mr->refs) {
345         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
346         UINT16 extlen = 0;
347 
348         rc++;
349 
350         if (ref->type == TYPE_TREE_BLOCK_REF)
351             extlen += sizeof(TREE_BLOCK_REF);
352         else if (ref->type == TYPE_SHARED_BLOCK_REF)
353             extlen += sizeof(SHARED_BLOCK_REF);
354 
355         if (all_inline) {
356             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
357                 all_inline = FALSE;
358                 first_noninline = ref;
359             } else
360                 inline_len += extlen + 1;
361         }
362 
363         le = le->Flink;
364     }
365 
366     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
367     if (!ei) {
368         ERR("out of memory\n");
369         return STATUS_INSUFFICIENT_RESOURCES;
370     }
371 
372     ei->refcount = rc;
373     ei->generation = mr->ei->generation;
374     ei->flags = mr->ei->flags;
375     ptr = (UINT8*)&ei[1];
376 
377     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
378         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
379 
380         ei2->firstitem = *(KEY*)&mr->data[1];
381         ei2->level = mr->data->level;
382 
383         ptr += sizeof(EXTENT_ITEM2);
384     }
385 
386     le = mr->refs.Flink;
387     while (le != &mr->refs) {
388         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
389 
390         if (ref == first_noninline)
391             break;
392 
393         *ptr = ref->type;
394         ptr++;
395 
396         if (ref->type == TYPE_TREE_BLOCK_REF) {
397             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
398 
399             tbr->offset = ref->tbr.offset;
400 
401             ptr += sizeof(TREE_BLOCK_REF);
402         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
403             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
404 
405             sbr->offset = ref->parent->new_address;
406 
407             ptr += sizeof(SHARED_BLOCK_REF);
408         }
409 
410         le = le->Flink;
411     }
412 
413     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
414         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
415     else
416         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
417 
418     if (!NT_SUCCESS(Status)) {
419         ERR("insert_tree_item returned %08x\n", Status);
420         ExFreePool(ei);
421         return Status;
422     }
423 
424     if (!all_inline) {
425         le = &first_noninline->list_entry;
426 
427         while (le != &mr->refs) {
428             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
429 
430             if (ref->type == TYPE_TREE_BLOCK_REF) {
431                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
432                 if (!NT_SUCCESS(Status)) {
433                     ERR("insert_tree_item returned %08x\n", Status);
434                     return Status;
435                 }
436             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
437                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
438                 if (!NT_SUCCESS(Status)) {
439                     ERR("insert_tree_item returned %08x\n", Status);
440                     return Status;
441                 }
442             }
443 
444             le = le->Flink;
445         }
446     }
447 
448     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
449         if (mr->data->level > 0) {
450             UINT16 i;
451             internal_node* in = (internal_node*)&mr->data[1];
452 
453             for (i = 0; i < mr->data->num_items; i++) {
454                 UINT64 sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
455 
456                 if (sbrrc > 0) {
457                     SHARED_BLOCK_REF sbr;
458 
459                     sbr.offset = mr->new_address;
460 
461                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
462                     if (!NT_SUCCESS(Status)) {
463                         ERR("increase_extent_refcount returned %08x\n", Status);
464                         return Status;
465                     }
466 
467                     sbr.offset = mr->address;
468 
469                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
470                                                       sbr.offset, FALSE, NULL);
471                     if (!NT_SUCCESS(Status)) {
472                         ERR("decrease_extent_refcount returned %08x\n", Status);
473                         return Status;
474                     }
475                 }
476             }
477         } else {
478             UINT16 i;
479             leaf_node* ln = (leaf_node*)&mr->data[1];
480 
481             for (i = 0; i < mr->data->num_items; i++) {
482                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
483                     EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
484 
485                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
486                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
487 
488                         if (ed2->size > 0) { // not sparse
489                             UINT32 sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
490 
491                             if (sdrrc > 0) {
492                                 SHARED_DATA_REF sdr;
493                                 chunk* c;
494 
495                                 sdr.offset = mr->new_address;
496                                 sdr.count = sdrrc;
497 
498                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
499                                 if (!NT_SUCCESS(Status)) {
500                                     ERR("increase_extent_refcount returned %08x\n", Status);
501                                     return Status;
502                                 }
503 
504                                 sdr.offset = mr->address;
505 
506                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
507                                                                   sdr.offset, FALSE, NULL);
508                                 if (!NT_SUCCESS(Status)) {
509                                     ERR("decrease_extent_refcount returned %08x\n", Status);
510                                     return Status;
511                                 }
512 
513                                 c = get_chunk_from_address(Vcb, ed2->address);
514 
515                                 if (c) {
516                                     // check changed_extents
517 
518                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, TRUE);
519 
520                                     le = c->changed_extents.Flink;
521 
522                                     while (le != &c->changed_extents) {
523                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
524 
525                                         if (ce->address == ed2->address) {
526                                             LIST_ENTRY* le2;
527 
528                                             le2 = ce->refs.Flink;
529                                             while (le2 != &ce->refs) {
530                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
531 
532                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
533                                                     cer->sdr.offset = mr->new_address;
534                                                     break;
535                                                 }
536 
537                                                 le2 = le2->Flink;
538                                             }
539 
540                                             le2 = ce->old_refs.Flink;
541                                             while (le2 != &ce->old_refs) {
542                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
543 
544                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
545                                                     cer->sdr.offset = mr->new_address;
546                                                     break;
547                                                 }
548 
549                                                 le2 = le2->Flink;
550                                             }
551 
552                                             break;
553                                         }
554 
555                                         le = le->Flink;
556                                     }
557 
558                                     ExReleaseResourceLite(&c->changed_extents_lock);
559                                 }
560                             }
561                         }
562                     }
563                 }
564             }
565         }
566     }
567 
568     return STATUS_SUCCESS;
569 }
570 
571 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
572                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
573     LIST_ENTRY tree_writes, *le;
574     NTSTATUS Status;
575     traverse_ptr tp;
576     UINT8 level, max_level = 0;
577     chunk* newchunk = NULL;
578 
579     InitializeListHead(&tree_writes);
580 
581     le = items->Flink;
582     while (le != items) {
583         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
584         LIST_ENTRY* le2;
585         chunk* pc;
586 
587         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
588         if (!mr->data) {
589             ERR("out of memory\n");
590             return STATUS_INSUFFICIENT_RESOURCES;
591         }
592 
593         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, TRUE, (UINT8*)mr->data,
594                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, FALSE, NormalPagePriority);
595         if (!NT_SUCCESS(Status)) {
596             ERR("read_data returned %08x\n", Status);
597             return Status;
598         }
599 
600         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
601             mr->system = TRUE;
602 
603         if (data_items && mr->data->level == 0) {
604             le2 = data_items->Flink;
605             while (le2 != data_items) {
606                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
607                 leaf_node* ln = (leaf_node*)&mr->data[1];
608                 UINT16 i;
609 
610                 for (i = 0; i < mr->data->num_items; i++) {
611                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
612                         EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
613 
614                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
615                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
616 
617                             if (ed2->address == dr->address)
618                                 ed2->address = dr->new_address;
619                         }
620                     }
621                 }
622 
623                 le2 = le2->Flink;
624             }
625         }
626 
627         if (mr->data->level > max_level)
628             max_level = mr->data->level;
629 
630         le2 = mr->refs.Flink;
631         while (le2 != &mr->refs) {
632             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
633 
634             if (ref->type == TYPE_TREE_BLOCK_REF) {
635                 KEY* firstitem;
636                 root* r = NULL;
637                 LIST_ENTRY* le3;
638                 tree* t;
639 
640                 firstitem = (KEY*)&mr->data[1];
641 
642                 le3 = Vcb->roots.Flink;
643                 while (le3 != &Vcb->roots) {
644                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
645 
646                     if (r2->id == ref->tbr.offset) {
647                         r = r2;
648                         break;
649                     }
650 
651                     le3 = le3->Flink;
652                 }
653 
654                 if (!r) {
655                     ERR("could not find subvol with id %llx\n", ref->tbr.offset);
656                     return STATUS_INTERNAL_ERROR;
657                 }
658 
659                 Status = find_item_to_level(Vcb, r, &tp, firstitem, FALSE, mr->data->level + 1, NULL);
660                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
661                     ERR("find_item_to_level returned %08x\n", Status);
662                     return Status;
663                 }
664 
665                 t = tp.tree;
666                 while (t && t->header.level < mr->data->level + 1) {
667                     t = t->parent;
668                 }
669 
670                 if (!t)
671                     ref->top = TRUE;
672                 else {
673                     metadata_reloc* mr2;
674 
675                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
676                     if (!NT_SUCCESS(Status)) {
677                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
678                         return Status;
679                     }
680 
681                     ref->parent = mr2;
682                 }
683             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
684                 metadata_reloc* mr2;
685 
686                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
687                 if (!NT_SUCCESS(Status)) {
688                     ERR("add_metadata_reloc_parent returned %08x\n", Status);
689                     return Status;
690                 }
691 
692                 ref->parent = mr2;
693             }
694 
695             le2 = le2->Flink;
696         }
697 
698         le = le->Flink;
699     }
700 
701     le = items->Flink;
702     while (le != items) {
703         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
704         LIST_ENTRY* le2;
705         UINT32 hash;
706 
707         mr->t = NULL;
708 
709         hash = calc_crc32c(0xffffffff, (UINT8*)&mr->address, sizeof(UINT64));
710 
711         le2 = Vcb->trees_ptrs[hash >> 24];
712 
713         if (le2) {
714             while (le2 != &Vcb->trees_hash) {
715                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
716 
717                 if (t->header.address == mr->address) {
718                     mr->t = t;
719                     break;
720                 } else if (t->hash > hash)
721                     break;
722 
723                 le2 = le2->Flink;
724             }
725         }
726 
727         le = le->Flink;
728     }
729 
730     for (level = 0; level <= max_level; level++) {
731         le = items->Flink;
732         while (le != items) {
733             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
734 
735             if (mr->data->level == level) {
736                 BOOL done = FALSE;
737                 LIST_ENTRY* le2;
738                 tree_write* tw;
739                 UINT64 flags;
740                 tree* t3;
741 
742                 if (mr->system)
743                     flags = Vcb->system_flags;
744                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
745                     flags = Vcb->data_flags;
746                 else
747                     flags = Vcb->metadata_flags;
748 
749                 if (newchunk) {
750                     acquire_chunk_lock(newchunk, Vcb);
751 
752                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
753                         newchunk->used += Vcb->superblock.node_size;
754                         space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
755                         done = TRUE;
756                     }
757 
758                     release_chunk_lock(newchunk, Vcb);
759                 }
760 
761                 if (!done) {
762                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
763 
764                     le2 = Vcb->chunks.Flink;
765                     while (le2 != &Vcb->chunks) {
766                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
767 
768                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
769                             acquire_chunk_lock(c2, Vcb);
770 
771                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
772                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
773                                     c2->used += Vcb->superblock.node_size;
774                                     space_list_subtract(c2, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
775                                     release_chunk_lock(c2, Vcb);
776                                     newchunk = c2;
777                                     done = TRUE;
778                                     break;
779                                 }
780                             }
781 
782                             release_chunk_lock(c2, Vcb);
783                         }
784 
785                         le2 = le2->Flink;
786                     }
787 
788                     // allocate new chunk if necessary
789                     if (!done) {
790                         Status = alloc_chunk(Vcb, flags, &newchunk, FALSE);
791 
792                         if (!NT_SUCCESS(Status)) {
793                             ERR("alloc_chunk returned %08x\n", Status);
794                             ExReleaseResourceLite(&Vcb->chunk_lock);
795                             goto end;
796                         }
797 
798                         acquire_chunk_lock(newchunk, Vcb);
799 
800                         newchunk->balance_num = Vcb->balance.balance_num;
801 
802                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
803                             release_chunk_lock(newchunk, Vcb);
804                             ExReleaseResourceLite(&Vcb->chunk_lock);
805                             ERR("could not find address in new chunk\n");
806                             Status = STATUS_DISK_FULL;
807                             goto end;
808                         } else {
809                             newchunk->used += Vcb->superblock.node_size;
810                             space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
811                         }
812 
813                         release_chunk_lock(newchunk, Vcb);
814                     }
815 
816                     ExReleaseResourceLite(&Vcb->chunk_lock);
817                 }
818 
819                 // update parents
820                 le2 = mr->refs.Flink;
821                 while (le2 != &mr->refs) {
822                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
823 
824                     if (ref->parent) {
825                         UINT16 i;
826                         internal_node* in = (internal_node*)&ref->parent->data[1];
827 
828                         for (i = 0; i < ref->parent->data->num_items; i++) {
829                             if (in[i].address == mr->address) {
830                                 in[i].address = mr->new_address;
831                                 break;
832                             }
833                         }
834 
835                         if (ref->parent->t) {
836                             LIST_ENTRY* le3;
837 
838                             le3 = ref->parent->t->itemlist.Flink;
839                             while (le3 != &ref->parent->t->itemlist) {
840                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
841 
842                                 if (!td->inserted && td->treeholder.address == mr->address)
843                                     td->treeholder.address = mr->new_address;
844 
845                                 le3 = le3->Flink;
846                             }
847                         }
848                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
849                         LIST_ENTRY* le3;
850                         root* r = NULL;
851 
852                         // alter ROOT_ITEM
853 
854                         le3 = Vcb->roots.Flink;
855                         while (le3 != &Vcb->roots) {
856                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
857 
858                             if (r2->id == ref->tbr.offset) {
859                                 r = r2;
860                                 break;
861                             }
862 
863                             le3 = le3->Flink;
864                         }
865 
866                         if (r) {
867                             r->treeholder.address = mr->new_address;
868 
869                             if (r == Vcb->root_root)
870                                 Vcb->superblock.root_tree_addr = mr->new_address;
871                             else if (r == Vcb->chunk_root)
872                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
873                             else if (r->root_item.block_number == mr->address) {
874                                 KEY searchkey;
875                                 ROOT_ITEM* ri;
876 
877                                 r->root_item.block_number = mr->new_address;
878 
879                                 searchkey.obj_id = r->id;
880                                 searchkey.obj_type = TYPE_ROOT_ITEM;
881                                 searchkey.offset = 0xffffffffffffffff;
882 
883                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
884                                 if (!NT_SUCCESS(Status)) {
885                                     ERR("find_item returned %08x\n", Status);
886                                     goto end;
887                                 }
888 
889                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
890                                     ERR("could not find ROOT_ITEM for tree %llx\n", searchkey.obj_id);
891                                     Status = STATUS_INTERNAL_ERROR;
892                                     goto end;
893                                 }
894 
895                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
896                                 if (!ri) {
897                                     ERR("out of memory\n");
898                                     Status = STATUS_INSUFFICIENT_RESOURCES;
899                                     goto end;
900                                 }
901 
902                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
903 
904                                 Status = delete_tree_item(Vcb, &tp);
905                                 if (!NT_SUCCESS(Status)) {
906                                     ERR("delete_tree_item returned %08x\n", Status);
907                                     goto end;
908                                 }
909 
910                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
911                                 if (!NT_SUCCESS(Status)) {
912                                     ERR("insert_tree_item returned %08x\n", Status);
913                                     goto end;
914                                 }
915                             }
916                         }
917                     }
918 
919                     le2 = le2->Flink;
920                 }
921 
922                 mr->data->address = mr->new_address;
923 
924                 t3 = mr->t;
925 
926                 while (t3) {
927                     UINT8 h;
928                     BOOL inserted;
929                     tree* t4 = NULL;
930 
931                     // check if tree loaded more than once
932                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
933                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
934 
935                         if (nt->header.address == t3->header.address)
936                             t4 = nt;
937                     }
938 
939                     t3->header.address = mr->new_address;
940 
941                     h = t3->hash >> 24;
942 
943                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
944                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
945                             Vcb->trees_ptrs[h] = NULL;
946                         else {
947                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
948 
949                             if (t2->hash >> 24 == h)
950                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
951                             else
952                                 Vcb->trees_ptrs[h] = NULL;
953                         }
954                     }
955 
956                     RemoveEntryList(&t3->list_entry_hash);
957 
958                     t3->hash = calc_crc32c(0xffffffff, (UINT8*)&t3->header.address, sizeof(UINT64));
959                     h = t3->hash >> 24;
960 
961                     if (!Vcb->trees_ptrs[h]) {
962                         UINT8 h2 = h;
963 
964                         le2 = Vcb->trees_hash.Flink;
965 
966                         if (h2 > 0) {
967                             h2--;
968                             do {
969                                 if (Vcb->trees_ptrs[h2]) {
970                                     le2 = Vcb->trees_ptrs[h2];
971                                     break;
972                                 }
973 
974                                 h2--;
975                             } while (h2 > 0);
976                         }
977                     } else
978                         le2 = Vcb->trees_ptrs[h];
979 
980                     inserted = FALSE;
981                     while (le2 != &Vcb->trees_hash) {
982                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
983 
984                         if (t2->hash >= t3->hash) {
985                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
986                             inserted = TRUE;
987                             break;
988                         }
989 
990                         le2 = le2->Flink;
991                     }
992 
993                     if (!inserted)
994                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
995 
996                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
997                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
998 
999                     if (data_items && level == 0) {
1000                         le2 = data_items->Flink;
1001 
1002                         while (le2 != data_items) {
1003                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1004                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1005 
1006                             while (le3 != &t3->itemlist) {
1007                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1008 
1009                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1010                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1011 
1012                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1013                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1014 
1015                                         if (ed2->address == dr->address)
1016                                             ed2->address = dr->new_address;
1017                                     }
1018                                 }
1019 
1020                                 le3 = le3->Flink;
1021                             }
1022 
1023                             le2 = le2->Flink;
1024                         }
1025                     }
1026 
1027                     t3 = t4;
1028                 }
1029 
1030                 *((UINT32*)mr->data) = ~calc_crc32c(0xffffffff, (UINT8*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1031 
1032                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1033                 if (!tw) {
1034                     ERR("out of memory\n");
1035                     Status = STATUS_INSUFFICIENT_RESOURCES;
1036                     goto end;
1037                 }
1038 
1039                 tw->address = mr->new_address;
1040                 tw->length = Vcb->superblock.node_size;
1041                 tw->data = (UINT8*)mr->data;
1042 
1043                 if (IsListEmpty(&tree_writes))
1044                     InsertTailList(&tree_writes, &tw->list_entry);
1045                 else {
1046                     BOOL inserted = FALSE;
1047 
1048                     le2 = tree_writes.Flink;
1049                     while (le2 != &tree_writes) {
1050                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1051 
1052                         if (tw2->address > tw->address) {
1053                             InsertHeadList(le2->Blink, &tw->list_entry);
1054                             inserted = TRUE;
1055                             break;
1056                         }
1057 
1058                         le2 = le2->Flink;
1059                     }
1060 
1061                     if (!inserted)
1062                         InsertTailList(&tree_writes, &tw->list_entry);
1063                 }
1064             }
1065 
1066             le = le->Flink;
1067         }
1068     }
1069 
1070     Status = do_tree_writes(Vcb, &tree_writes, TRUE);
1071     if (!NT_SUCCESS(Status)) {
1072         ERR("do_tree_writes returned %08x\n", Status);
1073         goto end;
1074     }
1075 
1076     le = items->Flink;
1077     while (le != items) {
1078         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1079 
1080         Status = add_metadata_reloc_extent_item(Vcb, mr);
1081         if (!NT_SUCCESS(Status)) {
1082             ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1083             goto end;
1084         }
1085 
1086         le = le->Flink;
1087     }
1088 
1089     Status = STATUS_SUCCESS;
1090 
1091 end:
1092     while (!IsListEmpty(&tree_writes)) {
1093         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1094         ExFreePool(tw);
1095     }
1096 
1097     return Status;
1098 }
1099 
1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1101     KEY searchkey;
1102     traverse_ptr tp;
1103     NTSTATUS Status;
1104     BOOL b;
1105     LIST_ENTRY items, rollback;
1106     UINT32 loaded = 0;
1107 
1108     TRACE("chunk %llx\n", c->offset);
1109 
1110     InitializeListHead(&rollback);
1111     InitializeListHead(&items);
1112 
1113     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1114 
1115     searchkey.obj_id = c->offset;
1116     searchkey.obj_type = TYPE_METADATA_ITEM;
1117     searchkey.offset = 0xffffffffffffffff;
1118 
1119     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1120     if (!NT_SUCCESS(Status)) {
1121         ERR("find_item returned %08x\n", Status);
1122         goto end;
1123     }
1124 
1125     do {
1126         traverse_ptr next_tp;
1127 
1128         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129             break;
1130 
1131         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132             BOOL tree = FALSE, skinny = FALSE;
1133 
1134             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135                 tree = TRUE;
1136                 skinny = TRUE;
1137             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1139                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140 
1141                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1142                     tree = TRUE;
1143             }
1144 
1145             if (tree) {
1146                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147 
1148                 if (!NT_SUCCESS(Status)) {
1149                     ERR("add_metadata_reloc returned %08x\n", Status);
1150                     goto end;
1151                 }
1152 
1153                 loaded++;
1154 
1155                 if (loaded >= 64) // only do 64 at a time
1156                     break;
1157             }
1158         }
1159 
1160         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1161 
1162         if (b)
1163             tp = next_tp;
1164     } while (b);
1165 
1166     if (IsListEmpty(&items)) {
1167         *changed = FALSE;
1168         Status = STATUS_SUCCESS;
1169         goto end;
1170     } else
1171         *changed = TRUE;
1172 
1173     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1174     if (!NT_SUCCESS(Status)) {
1175         ERR("write_metadata_items returned %08x\n", Status);
1176         goto end;
1177     }
1178 
1179     Status = STATUS_SUCCESS;
1180 
1181     Vcb->need_write = TRUE;
1182 
1183 end:
1184     if (NT_SUCCESS(Status)) {
1185         Status = do_write(Vcb, NULL);
1186         if (!NT_SUCCESS(Status))
1187             ERR("do_write returned %08x\n", Status);
1188     }
1189 
1190     if (NT_SUCCESS(Status))
1191         clear_rollback(&rollback);
1192     else
1193         do_rollback(Vcb, &rollback);
1194 
1195     free_trees(Vcb);
1196 
1197     ExReleaseResourceLite(&Vcb->tree_lock);
1198 
1199     while (!IsListEmpty(&items)) {
1200         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1201 
1202         while (!IsListEmpty(&mr->refs)) {
1203             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1204 
1205             ExFreePool(ref);
1206         }
1207 
1208         ExFreePool(mr);
1209     }
1210 
1211     return Status;
1212 }
1213 
1214 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1215                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1216     NTSTATUS Status;
1217     LIST_ENTRY* le;
1218     KEY searchkey;
1219     traverse_ptr tp;
1220     root* r = NULL;
1221     metadata_reloc* mr;
1222     UINT64 last_tree = 0;
1223     data_reloc_ref* ref;
1224 
1225     le = Vcb->roots.Flink;
1226     while (le != &Vcb->roots) {
1227         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1228 
1229         if (r2->id == edr->root) {
1230             r = r2;
1231             break;
1232         }
1233 
1234         le = le->Flink;
1235     }
1236 
1237     if (!r) {
1238         ERR("could not find subvol %llx\n", edr->count);
1239         return STATUS_INTERNAL_ERROR;
1240     }
1241 
1242     searchkey.obj_id = edr->objid;
1243     searchkey.obj_type = TYPE_EXTENT_DATA;
1244     searchkey.offset = 0;
1245 
1246     Status = find_item(Vcb, r, &tp, &searchkey, FALSE, NULL);
1247     if (!NT_SUCCESS(Status)) {
1248         ERR("find_item returned %08x\n", Status);
1249         return Status;
1250     }
1251 
1252     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1253         traverse_ptr tp2;
1254 
1255         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1256             tp = tp2;
1257         else {
1258             ERR("could not find EXTENT_DATA for inode %llx in root %llx\n", searchkey.obj_id, r->id);
1259             return STATUS_INTERNAL_ERROR;
1260         }
1261     }
1262 
1263     ref = NULL;
1264 
1265     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1266         traverse_ptr tp2;
1267 
1268         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1269             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1270 
1271             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1272                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1273 
1274                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1275                     if (ref && last_tree == tp.tree->header.address)
1276                         ref->edr.count++;
1277                     else {
1278                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1279                         if (!ref) {
1280                             ERR("out of memory\n");
1281                             return STATUS_INSUFFICIENT_RESOURCES;
1282                         }
1283 
1284                         ref->type = TYPE_EXTENT_DATA_REF;
1285                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1286                         ref->edr.count = 1;
1287 
1288                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1289                         if (!NT_SUCCESS(Status)) {
1290                             ERR("add_metadata_reloc_parent returned %08x\n", Status);
1291                             ExFreePool(ref);
1292                             return Status;
1293                         }
1294 
1295                         last_tree = tp.tree->header.address;
1296                         ref->parent = mr;
1297 
1298                         InsertTailList(&dr->refs, &ref->list_entry);
1299                     }
1300                 }
1301             }
1302         }
1303 
1304         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1305             tp = tp2;
1306         else
1307             break;
1308     }
1309 
1310     return STATUS_SUCCESS;
1311 }
1312 
1313 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1314                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1315     NTSTATUS Status;
1316     data_reloc* dr;
1317     EXTENT_ITEM* ei;
1318     UINT16 len;
1319     UINT64 inline_rc;
1320     UINT8* ptr;
1321 
1322     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1323     if (!dr) {
1324         ERR("out of memory\n");
1325         return STATUS_INSUFFICIENT_RESOURCES;
1326     }
1327 
1328     dr->address = tp->item->key.obj_id;
1329     dr->size = tp->item->key.offset;
1330     dr->ei = (EXTENT_ITEM*)tp->item->data;
1331     InitializeListHead(&dr->refs);
1332 
1333     Status = delete_tree_item(Vcb, tp);
1334     if (!NT_SUCCESS(Status)) {
1335         ERR("delete_tree_item returned %08x\n", Status);
1336         return Status;
1337     }
1338 
1339     if (!c)
1340         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1341 
1342     if (c) {
1343         acquire_chunk_lock(c, Vcb);
1344 
1345         c->used -= tp->item->key.offset;
1346 
1347         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1348 
1349         release_chunk_lock(c, Vcb);
1350     }
1351 
1352     ei = (EXTENT_ITEM*)tp->item->data;
1353     inline_rc = 0;
1354 
1355     len = tp->item->size - sizeof(EXTENT_ITEM);
1356     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
1357 
1358     while (len > 0) {
1359         UINT8 secttype = *ptr;
1360         UINT16 sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1361 
1362         len--;
1363 
1364         if (sectlen > len) {
1365             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1366             return STATUS_INTERNAL_ERROR;
1367         }
1368 
1369         if (sectlen == 0) {
1370             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1371             return STATUS_INTERNAL_ERROR;
1372         }
1373 
1374         if (secttype == TYPE_EXTENT_DATA_REF) {
1375             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(UINT8));
1376 
1377             inline_rc += edr->count;
1378 
1379             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1380             if (!NT_SUCCESS(Status)) {
1381                 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1382                 return Status;
1383             }
1384         } else if (secttype == TYPE_SHARED_DATA_REF) {
1385             metadata_reloc* mr;
1386             data_reloc_ref* ref;
1387 
1388             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1389             if (!ref) {
1390                 ERR("out of memory\n");
1391                 return STATUS_INSUFFICIENT_RESOURCES;
1392             }
1393 
1394             ref->type = TYPE_SHARED_DATA_REF;
1395             RtlCopyMemory(&ref->sdr, ptr + sizeof(UINT8), sizeof(SHARED_DATA_REF));
1396             inline_rc += ref->sdr.count;
1397 
1398             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1399             if (!NT_SUCCESS(Status)) {
1400                 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1401                 ExFreePool(ref);
1402                 return Status;
1403             }
1404 
1405             ref->parent = mr;
1406 
1407             InsertTailList(&dr->refs, &ref->list_entry);
1408         } else {
1409             ERR("unexpected tree type %x\n", secttype);
1410             return STATUS_INTERNAL_ERROR;
1411         }
1412 
1413 
1414         len -= sectlen;
1415         ptr += sizeof(UINT8) + sectlen;
1416     }
1417 
1418     if (inline_rc < ei->refcount) { // look for non-inline entries
1419         traverse_ptr tp2 = *tp, next_tp;
1420 
1421         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
1422             tp2 = next_tp;
1423 
1424             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1425                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1426                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1427                     if (!NT_SUCCESS(Status)) {
1428                         ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1429                         return Status;
1430                     }
1431 
1432                     Status = delete_tree_item(Vcb, &tp2);
1433                     if (!NT_SUCCESS(Status)) {
1434                         ERR("delete_tree_item returned %08x\n", Status);
1435                         return Status;
1436                     }
1437                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(UINT32)) {
1438                     metadata_reloc* mr;
1439                     data_reloc_ref* ref;
1440 
1441                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1442                     if (!ref) {
1443                         ERR("out of memory\n");
1444                         return STATUS_INSUFFICIENT_RESOURCES;
1445                     }
1446 
1447                     ref->type = TYPE_SHARED_DATA_REF;
1448                     ref->sdr.offset = tp2.item->key.offset;
1449                     ref->sdr.count = *((UINT32*)tp2.item->data);
1450 
1451                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1452                     if (!NT_SUCCESS(Status)) {
1453                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
1454                         ExFreePool(ref);
1455                         return Status;
1456                     }
1457 
1458                     ref->parent = mr;
1459                     InsertTailList(&dr->refs, &ref->list_entry);
1460 
1461                     Status = delete_tree_item(Vcb, &tp2);
1462                     if (!NT_SUCCESS(Status)) {
1463                         ERR("delete_tree_item returned %08x\n", Status);
1464                         return Status;
1465                     }
1466                 }
1467             } else
1468                 break;
1469         }
1470     }
1471 
1472     InsertTailList(items, &dr->list_entry);
1473 
1474     return STATUS_SUCCESS;
1475 }
1476 
1477 static void sort_data_reloc_refs(data_reloc* dr) {
1478     LIST_ENTRY newlist, *le;
1479 
1480     if (IsListEmpty(&dr->refs))
1481         return;
1482 
1483     // insertion sort
1484 
1485     InitializeListHead(&newlist);
1486 
1487     while (!IsListEmpty(&dr->refs)) {
1488         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1489         BOOL inserted = FALSE;
1490 
1491         if (ref->type == TYPE_EXTENT_DATA_REF)
1492             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1493         else if (ref->type == TYPE_SHARED_DATA_REF)
1494             ref->hash = ref->parent->new_address;
1495 
1496         le = newlist.Flink;
1497         while (le != &newlist) {
1498             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1499 
1500             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1501                 InsertHeadList(le->Blink, &ref->list_entry);
1502                 inserted = TRUE;
1503                 break;
1504             }
1505 
1506             le = le->Flink;
1507         }
1508 
1509         if (!inserted)
1510             InsertTailList(&newlist, &ref->list_entry);
1511     }
1512 
1513     le = newlist.Flink;
1514     while (le != &newlist) {
1515         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1516 
1517         if (le->Flink != &newlist) {
1518             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1519 
1520             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1521                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1522                 RemoveEntryList(&ref2->list_entry);
1523                 ref->edr.count += ref2->edr.count;
1524                 ExFreePool(ref2);
1525                 continue;
1526             }
1527         }
1528 
1529         le = le->Flink;
1530     }
1531 
1532     newlist.Flink->Blink = &dr->refs;
1533     newlist.Blink->Flink = &dr->refs;
1534     dr->refs.Flink = newlist.Flink;
1535     dr->refs.Blink = newlist.Blink;
1536 }
1537 
1538 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1539     NTSTATUS Status;
1540     LIST_ENTRY* le;
1541     UINT64 rc = 0;
1542     UINT16 inline_len;
1543     BOOL all_inline = TRUE;
1544     data_reloc_ref* first_noninline = NULL;
1545     EXTENT_ITEM* ei;
1546     UINT8* ptr;
1547 
1548     inline_len = sizeof(EXTENT_ITEM);
1549 
1550     sort_data_reloc_refs(dr);
1551 
1552     le = dr->refs.Flink;
1553     while (le != &dr->refs) {
1554         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1555         UINT16 extlen = 0;
1556 
1557         if (ref->type == TYPE_EXTENT_DATA_REF) {
1558             extlen += sizeof(EXTENT_DATA_REF);
1559             rc += ref->edr.count;
1560         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1561             extlen += sizeof(SHARED_DATA_REF);
1562             rc++;
1563         }
1564 
1565         if (all_inline) {
1566             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1567                 all_inline = FALSE;
1568                 first_noninline = ref;
1569             } else
1570                 inline_len += extlen + 1;
1571         }
1572 
1573         le = le->Flink;
1574     }
1575 
1576     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1577     if (!ei) {
1578         ERR("out of memory\n");
1579         return STATUS_INSUFFICIENT_RESOURCES;
1580     }
1581 
1582     ei->refcount = rc;
1583     ei->generation = dr->ei->generation;
1584     ei->flags = dr->ei->flags;
1585     ptr = (UINT8*)&ei[1];
1586 
1587     le = dr->refs.Flink;
1588     while (le != &dr->refs) {
1589         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1590 
1591         if (ref == first_noninline)
1592             break;
1593 
1594         *ptr = ref->type;
1595         ptr++;
1596 
1597         if (ref->type == TYPE_EXTENT_DATA_REF) {
1598             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1599 
1600             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1601 
1602             ptr += sizeof(EXTENT_DATA_REF);
1603         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1604             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1605 
1606             sdr->offset = ref->parent->new_address;
1607             sdr->count = ref->sdr.count;
1608 
1609             ptr += sizeof(SHARED_DATA_REF);
1610         }
1611 
1612         le = le->Flink;
1613     }
1614 
1615     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1616     if (!NT_SUCCESS(Status)) {
1617         ERR("insert_tree_item returned %08x\n", Status);
1618         return Status;
1619     }
1620 
1621     if (!all_inline) {
1622         le = &first_noninline->list_entry;
1623 
1624         while (le != &dr->refs) {
1625             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1626 
1627             if (ref->type == TYPE_EXTENT_DATA_REF) {
1628                 EXTENT_DATA_REF* edr;
1629 
1630                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1631                 if (!edr) {
1632                     ERR("out of memory\n");
1633                     return STATUS_INSUFFICIENT_RESOURCES;
1634                 }
1635 
1636                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1637 
1638                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1639                 if (!NT_SUCCESS(Status)) {
1640                     ERR("insert_tree_item returned %08x\n", Status);
1641                     return Status;
1642                 }
1643             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1644                 UINT32* sdr;
1645 
1646                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(UINT32), ALLOC_TAG);
1647                 if (!sdr) {
1648                     ERR("out of memory\n");
1649                     return STATUS_INSUFFICIENT_RESOURCES;
1650                 }
1651 
1652                 *sdr = ref->sdr.count;
1653 
1654                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(UINT32), NULL, NULL);
1655                 if (!NT_SUCCESS(Status)) {
1656                     ERR("insert_tree_item returned %08x\n", Status);
1657                     return Status;
1658                 }
1659             }
1660 
1661             le = le->Flink;
1662         }
1663     }
1664 
1665     return STATUS_SUCCESS;
1666 }
1667 
1668 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1669     KEY searchkey;
1670     traverse_ptr tp;
1671     NTSTATUS Status;
1672     BOOL b;
1673     LIST_ENTRY items, metadata_items, rollback, *le;
1674     UINT64 loaded = 0, num_loaded = 0;
1675     chunk* newchunk = NULL;
1676     UINT8* data = NULL;
1677 
1678     TRACE("chunk %llx\n", c->offset);
1679 
1680     InitializeListHead(&rollback);
1681     InitializeListHead(&items);
1682     InitializeListHead(&metadata_items);
1683 
1684     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1685 
1686     searchkey.obj_id = c->offset;
1687     searchkey.obj_type = TYPE_EXTENT_ITEM;
1688     searchkey.offset = 0xffffffffffffffff;
1689 
1690     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1691     if (!NT_SUCCESS(Status)) {
1692         ERR("find_item returned %08x\n", Status);
1693         goto end;
1694     }
1695 
1696     do {
1697         traverse_ptr next_tp;
1698 
1699         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1700             break;
1701 
1702         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1703             BOOL tree = FALSE;
1704 
1705             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1706                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1707 
1708                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1709                     tree = TRUE;
1710             }
1711 
1712             if (!tree) {
1713                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1714 
1715                 if (!NT_SUCCESS(Status)) {
1716                     ERR("add_data_reloc returned %08x\n", Status);
1717                     goto end;
1718                 }
1719 
1720                 loaded += tp.item->key.offset;
1721                 num_loaded++;
1722 
1723                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1724                     break;
1725             }
1726         }
1727 
1728         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1729 
1730         if (b)
1731             tp = next_tp;
1732     } while (b);
1733 
1734     if (IsListEmpty(&items)) {
1735         *changed = FALSE;
1736         Status = STATUS_SUCCESS;
1737         goto end;
1738     } else
1739         *changed = TRUE;
1740 
1741     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1742     if (!data) {
1743         ERR("out of memory\n");
1744         Status = STATUS_INSUFFICIENT_RESOURCES;
1745         goto end;
1746     }
1747 
1748     le = items.Flink;
1749     while (le != &items) {
1750         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1751         BOOL done = FALSE;
1752         LIST_ENTRY* le2;
1753         UINT32* csum;
1754         RTL_BITMAP bmp;
1755         ULONG* bmparr;
1756         ULONG runlength, index, lastoff;
1757 
1758         if (newchunk) {
1759             acquire_chunk_lock(newchunk, Vcb);
1760 
1761             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1762                 newchunk->used += dr->size;
1763                 space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1764                 done = TRUE;
1765             }
1766 
1767             release_chunk_lock(newchunk, Vcb);
1768         }
1769 
1770         if (!done) {
1771             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
1772 
1773             le2 = Vcb->chunks.Flink;
1774             while (le2 != &Vcb->chunks) {
1775                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1776 
1777                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1778                     acquire_chunk_lock(c2, Vcb);
1779 
1780                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1781                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1782                             c2->used += dr->size;
1783                             space_list_subtract(c2, FALSE, dr->new_address, dr->size, &rollback);
1784                             release_chunk_lock(c2, Vcb);
1785                             newchunk = c2;
1786                             done = TRUE;
1787                             break;
1788                         }
1789                     }
1790 
1791                     release_chunk_lock(c2, Vcb);
1792                 }
1793 
1794                 le2 = le2->Flink;
1795             }
1796 
1797             // allocate new chunk if necessary
1798             if (!done) {
1799                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, FALSE);
1800 
1801                 if (!NT_SUCCESS(Status)) {
1802                     ERR("alloc_chunk returned %08x\n", Status);
1803                     ExReleaseResourceLite(&Vcb->chunk_lock);
1804                     goto end;
1805                 }
1806 
1807                 acquire_chunk_lock(newchunk, Vcb);
1808 
1809                 newchunk->balance_num = Vcb->balance.balance_num;
1810 
1811                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1812                     release_chunk_lock(newchunk, Vcb);
1813                     ExReleaseResourceLite(&Vcb->chunk_lock);
1814                     ERR("could not find address in new chunk\n");
1815                     Status = STATUS_DISK_FULL;
1816                     goto end;
1817                 } else {
1818                     newchunk->used += dr->size;
1819                     space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1820                 }
1821 
1822                 release_chunk_lock(newchunk, Vcb);
1823             }
1824 
1825             ExReleaseResourceLite(&Vcb->chunk_lock);
1826         }
1827 
1828         dr->newchunk = newchunk;
1829 
1830         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align((dr->size / Vcb->superblock.sector_size) + 1, sizeof(ULONG)), ALLOC_TAG);
1831         if (!bmparr) {
1832             ERR("out of memory\n");
1833             Status = STATUS_INSUFFICIENT_RESOURCES;
1834             goto end;
1835         }
1836 
1837         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(UINT32) / Vcb->superblock.sector_size), ALLOC_TAG);
1838         if (!csum) {
1839             ERR("out of memory\n");
1840             ExFreePool(bmparr);
1841             Status = STATUS_INSUFFICIENT_RESOURCES;
1842             goto end;
1843         }
1844 
1845         RtlInitializeBitMap(&bmp, bmparr, (ULONG)(dr->size / Vcb->superblock.sector_size));
1846         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1847 
1848         searchkey.obj_id = EXTENT_CSUM_ID;
1849         searchkey.obj_type = TYPE_EXTENT_CSUM;
1850         searchkey.offset = dr->address;
1851 
1852         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, FALSE, NULL);
1853         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1854             ERR("find_item returned %08x\n", Status);
1855             ExFreePool(csum);
1856             ExFreePool(bmparr);
1857             goto end;
1858         }
1859 
1860         if (Status != STATUS_NOT_FOUND) {
1861             do {
1862                 traverse_ptr next_tp;
1863 
1864                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1865                     if (tp.item->key.offset >= dr->address + dr->size)
1866                         break;
1867                     else if (tp.item->size >= sizeof(UINT32) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)) >= dr->address) {
1868                         UINT64 cs = max(dr->address, tp.item->key.offset);
1869                         UINT64 ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)));
1870 
1871                         RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1872                                       tp.item->data + ((cs - tp.item->key.offset) * sizeof(UINT32) / Vcb->superblock.sector_size),
1873                                       (ULONG)((ce - cs) * sizeof(UINT32) / Vcb->superblock.sector_size));
1874 
1875                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1876 
1877                         if (ce == dr->address + dr->size)
1878                             break;
1879                     }
1880                 }
1881 
1882                 if (find_next_item(Vcb, &tp, &next_tp, FALSE, NULL))
1883                     tp = next_tp;
1884                 else
1885                     break;
1886             } while (TRUE);
1887         }
1888 
1889         lastoff = 0;
1890         runlength = RtlFindFirstRunClear(&bmp, &index);
1891 
1892         while (runlength != 0) {
1893             if (index > lastoff) {
1894                 ULONG off = lastoff;
1895                 ULONG size = index - lastoff;
1896 
1897                 // handle no csum run
1898                 do {
1899                     ULONG rl;
1900 
1901                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1902                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1903                     else
1904                         rl = size;
1905 
1906                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1907                                        c, NULL, NULL, 0, FALSE, NormalPagePriority);
1908                     if (!NT_SUCCESS(Status)) {
1909                         ERR("read_data returned %08x\n", Status);
1910                         ExFreePool(csum);
1911                         ExFreePool(bmparr);
1912                         goto end;
1913                     }
1914 
1915                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1916                                                  NULL, newchunk, FALSE, 0, NormalPagePriority);
1917                     if (!NT_SUCCESS(Status)) {
1918                         ERR("write_data_complete returned %08x\n", Status);
1919                         ExFreePool(csum);
1920                         ExFreePool(bmparr);
1921                         goto end;
1922                     }
1923 
1924                     size -= rl;
1925                     off += rl;
1926                 } while (size > 0);
1927             }
1928 
1929             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1930             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1931 
1932             // handle csum run
1933             do {
1934                 ULONG rl;
1935 
1936                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1937                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1938                 else
1939                     rl = runlength;
1940 
1941                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], FALSE, data,
1942                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1943                 if (!NT_SUCCESS(Status)) {
1944                     ERR("read_data returned %08x\n", Status);
1945                     ExFreePool(csum);
1946                     ExFreePool(bmparr);
1947                     goto end;
1948                 }
1949 
1950                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1951                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1952                 if (!NT_SUCCESS(Status)) {
1953                     ERR("write_data_complete returned %08x\n", Status);
1954                     ExFreePool(csum);
1955                     ExFreePool(bmparr);
1956                     goto end;
1957                 }
1958 
1959                 runlength -= rl;
1960                 index += rl;
1961             } while (runlength > 0);
1962 
1963             lastoff = index;
1964             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1965         }
1966 
1967         ExFreePool(csum);
1968         ExFreePool(bmparr);
1969 
1970         // handle final nocsum run
1971         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1972             ULONG off = lastoff;
1973             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1974 
1975             do {
1976                 ULONG rl;
1977 
1978                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1979                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1980                 else
1981                     rl = size;
1982 
1983                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1984                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1985                 if (!NT_SUCCESS(Status)) {
1986                     ERR("read_data returned %08x\n", Status);
1987                     goto end;
1988                 }
1989 
1990                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1991                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1992                 if (!NT_SUCCESS(Status)) {
1993                     ERR("write_data_complete returned %08x\n", Status);
1994                     goto end;
1995                 }
1996 
1997                 size -= rl;
1998                 off += rl;
1999             } while (size > 0);
2000         }
2001 
2002         le = le->Flink;
2003     }
2004 
2005     ExFreePool(data);
2006     data = NULL;
2007 
2008     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2009     if (!NT_SUCCESS(Status)) {
2010         ERR("write_metadata_items returned %08x\n", Status);
2011         goto end;
2012     }
2013 
2014     le = items.Flink;
2015     while (le != &items) {
2016         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2017 
2018         Status = add_data_reloc_extent_item(Vcb, dr);
2019         if (!NT_SUCCESS(Status)) {
2020             ERR("add_data_reloc_extent_item returned %08x\n", Status);
2021             goto end;
2022         }
2023 
2024         le = le->Flink;
2025     }
2026 
2027     le = c->changed_extents.Flink;
2028     while (le != &c->changed_extents) {
2029         LIST_ENTRY *le2, *le3;
2030         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2031 
2032         le3 = le->Flink;
2033 
2034         le2 = items.Flink;
2035         while (le2 != &items) {
2036             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2037 
2038             if (ce->address == dr->address) {
2039                 ce->address = dr->new_address;
2040                 RemoveEntryList(&ce->list_entry);
2041                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2042                 break;
2043             }
2044 
2045             le2 = le2->Flink;
2046         }
2047 
2048         le = le3;
2049     }
2050 
2051     Status = STATUS_SUCCESS;
2052 
2053     Vcb->need_write = TRUE;
2054 
2055 end:
2056     if (NT_SUCCESS(Status)) {
2057         // update extents in cache inodes before we flush
2058         le = Vcb->chunks.Flink;
2059         while (le != &Vcb->chunks) {
2060             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2061 
2062             if (c2->cache) {
2063                 LIST_ENTRY* le2;
2064 
2065                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, TRUE);
2066 
2067                 le2 = c2->cache->extents.Flink;
2068                 while (le2 != &c2->cache->extents) {
2069                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2070 
2071                     if (!ext->ignore) {
2072                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2073                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2074 
2075                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2076                                 LIST_ENTRY* le3 = items.Flink;
2077                                 while (le3 != &items) {
2078                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2079 
2080                                     if (ed2->address == dr->address) {
2081                                         ed2->address = dr->new_address;
2082                                         break;
2083                                     }
2084 
2085                                     le3 = le3->Flink;
2086                                 }
2087                             }
2088                         }
2089                     }
2090 
2091                     le2 = le2->Flink;
2092                 }
2093 
2094                 ExReleaseResourceLite(c2->cache->Header.Resource);
2095             }
2096 
2097             le = le->Flink;
2098         }
2099 
2100         Status = do_write(Vcb, NULL);
2101         if (!NT_SUCCESS(Status))
2102             ERR("do_write returned %08x\n", Status);
2103     }
2104 
2105     if (NT_SUCCESS(Status)) {
2106         clear_rollback(&rollback);
2107 
2108         // update open FCBs
2109         // FIXME - speed this up(?)
2110 
2111         acquire_fcb_lock_shared(Vcb);
2112 
2113         le = Vcb->all_fcbs.Flink;
2114         while (le != &Vcb->all_fcbs) {
2115             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2116             LIST_ENTRY* le2;
2117 
2118             ExAcquireResourceExclusiveLite(fcb->Header.Resource, TRUE);
2119 
2120             le2 = fcb->extents.Flink;
2121             while (le2 != &fcb->extents) {
2122                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2123 
2124                 if (!ext->ignore) {
2125                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2126                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2127 
2128                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2129                             LIST_ENTRY* le3 = items.Flink;
2130                             while (le3 != &items) {
2131                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2132 
2133                                 if (ed2->address == dr->address) {
2134                                     ed2->address = dr->new_address;
2135                                     break;
2136                                 }
2137 
2138                                 le3 = le3->Flink;
2139                             }
2140                         }
2141                     }
2142                 }
2143 
2144                 le2 = le2->Flink;
2145             }
2146 
2147             ExReleaseResourceLite(fcb->Header.Resource);
2148 
2149             le = le->Flink;
2150         }
2151 
2152         release_fcb_lock(Vcb);
2153     } else
2154         do_rollback(Vcb, &rollback);
2155 
2156     free_trees(Vcb);
2157 
2158     ExReleaseResourceLite(&Vcb->tree_lock);
2159 
2160     if (data)
2161         ExFreePool(data);
2162 
2163     while (!IsListEmpty(&items)) {
2164         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2165 
2166         while (!IsListEmpty(&dr->refs)) {
2167             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2168 
2169             ExFreePool(ref);
2170         }
2171 
2172         ExFreePool(dr);
2173     }
2174 
2175     while (!IsListEmpty(&metadata_items)) {
2176         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2177 
2178         while (!IsListEmpty(&mr->refs)) {
2179             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2180 
2181             ExFreePool(ref);
2182         }
2183 
2184         ExFreePool(mr);
2185     }
2186 
2187     return Status;
2188 }
2189 
2190 static __inline UINT64 get_chunk_dup_type(chunk* c) {
2191     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2192         return BLOCK_FLAG_RAID0;
2193     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2194         return BLOCK_FLAG_RAID1;
2195     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2196         return BLOCK_FLAG_DUPLICATE;
2197     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2198         return BLOCK_FLAG_RAID10;
2199     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2200         return BLOCK_FLAG_RAID5;
2201     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2202         return BLOCK_FLAG_RAID6;
2203     else
2204         return BLOCK_FLAG_SINGLE;
2205 }
2206 
2207 static BOOL should_balance_chunk(device_extension* Vcb, UINT8 sort, chunk* c) {
2208     btrfs_balance_opts* opts;
2209 
2210     opts = &Vcb->balance.opts[sort];
2211 
2212     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2213         return FALSE;
2214 
2215     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2216         UINT64 type = get_chunk_dup_type(c);
2217 
2218         if (!(type & opts->profiles))
2219             return FALSE;
2220     }
2221 
2222     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2223         UINT16 i;
2224         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2225         BOOL b = FALSE;
2226 
2227         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2228             if (cis[i].dev_id == opts->devid) {
2229                 b = TRUE;
2230                 break;
2231             }
2232         }
2233 
2234         if (!b)
2235             return FALSE;
2236     }
2237 
2238     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2239         UINT16 i, factor;
2240         UINT64 physsize;
2241         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2242         BOOL b = FALSE;
2243 
2244         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2245             factor = c->chunk_item->num_stripes;
2246         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2247             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2248         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2249             factor = c->chunk_item->num_stripes - 1;
2250         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2251             factor = c->chunk_item->num_stripes - 2;
2252         else // SINGLE, DUPLICATE, RAID1
2253             factor = 1;
2254 
2255         physsize = c->chunk_item->size / factor;
2256 
2257         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2258             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2259                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2260                 b = TRUE;
2261                 break;
2262             }
2263         }
2264 
2265         if (!b)
2266             return FALSE;
2267     }
2268 
2269     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2270         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2271             return FALSE;
2272     }
2273 
2274     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2275         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2276             return FALSE;
2277     }
2278 
2279     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2280         UINT64 usage = c->used * 100 / c->chunk_item->size;
2281 
2282         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2283         if (c->used > 0 && usage == 0)
2284             usage = 1;
2285 
2286         if (usage < opts->usage_start || usage > opts->usage_end)
2287             return FALSE;
2288     }
2289 
2290     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2291         UINT64 type = get_chunk_dup_type(c);
2292 
2293         if (type == opts->convert)
2294             return FALSE;
2295     }
2296 
2297     return TRUE;
2298 }
2299 
2300 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2301     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2302         args->profiles = opts->profiles;
2303         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2304     }
2305 
2306     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2307         if (args->usage_start == 0) {
2308             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2309             args->usage_start = opts->usage_start;
2310             args->usage_end = opts->usage_end;
2311         } else {
2312             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2313             args->usage = opts->usage_end;
2314         }
2315     }
2316 
2317     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2318         args->devid = opts->devid;
2319         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2320     }
2321 
2322     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2323         args->drange_start = opts->drange_start;
2324         args->drange_end = opts->drange_end;
2325         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2326     }
2327 
2328     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2329         args->vrange_start = opts->vrange_start;
2330         args->vrange_end = opts->vrange_end;
2331         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2332     }
2333 
2334     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2335         args->convert = opts->convert;
2336         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2337 
2338         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2339             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2340     }
2341 
2342     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2343         if (args->limit_start == 0) {
2344             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2345             args->limit_start = (UINT32)opts->limit_start;
2346             args->limit_end = (UINT32)opts->limit_end;
2347         } else {
2348             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2349             args->limit = opts->limit_end;
2350         }
2351     }
2352 
2353     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2354         args->stripes_start = opts->stripes_start;
2355         args->stripes_end = opts->stripes_end;
2356         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2357     }
2358 }
2359 
2360 static NTSTATUS add_balance_item(device_extension* Vcb) {
2361     KEY searchkey;
2362     traverse_ptr tp;
2363     NTSTATUS Status;
2364     BALANCE_ITEM* bi;
2365 
2366     searchkey.obj_id = BALANCE_ITEM_ID;
2367     searchkey.obj_type = TYPE_TEMP_ITEM;
2368     searchkey.offset = 0;
2369 
2370     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2371 
2372     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2373     if (!NT_SUCCESS(Status)) {
2374         ERR("find_item returned %08x\n", Status);
2375         goto end;
2376     }
2377 
2378     if (!keycmp(tp.item->key, searchkey)) {
2379         Status = delete_tree_item(Vcb, &tp);
2380         if (!NT_SUCCESS(Status)) {
2381             ERR("delete_tree_item returned %08x\n", Status);
2382             goto end;
2383         }
2384     }
2385 
2386     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2387     if (!bi) {
2388         ERR("out of memory\n");
2389         Status = STATUS_INSUFFICIENT_RESOURCES;
2390         goto end;
2391     }
2392 
2393     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2394 
2395     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2396         bi->flags |= BALANCE_FLAGS_DATA;
2397         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2398     }
2399 
2400     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2401         bi->flags |= BALANCE_FLAGS_METADATA;
2402         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2403     }
2404 
2405     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2406         bi->flags |= BALANCE_FLAGS_SYSTEM;
2407         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2408     }
2409 
2410     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2411     if (!NT_SUCCESS(Status)) {
2412         ERR("insert_tree_item returned %08x\n", Status);
2413         ExFreePool(bi);
2414         goto end;
2415     }
2416 
2417     Status = STATUS_SUCCESS;
2418 
2419 end:
2420     if (NT_SUCCESS(Status)) {
2421         Status = do_write(Vcb, NULL);
2422         if (!NT_SUCCESS(Status))
2423             ERR("do_write returned %08x\n", Status);
2424     }
2425 
2426     free_trees(Vcb);
2427 
2428     ExReleaseResourceLite(&Vcb->tree_lock);
2429 
2430     return Status;
2431 }
2432 
2433 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2434     KEY searchkey;
2435     traverse_ptr tp;
2436     NTSTATUS Status;
2437 
2438     searchkey.obj_id = BALANCE_ITEM_ID;
2439     searchkey.obj_type = TYPE_TEMP_ITEM;
2440     searchkey.offset = 0;
2441 
2442     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2443 
2444     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2445     if (!NT_SUCCESS(Status)) {
2446         ERR("find_item returned %08x\n", Status);
2447         goto end;
2448     }
2449 
2450     if (!keycmp(tp.item->key, searchkey)) {
2451         Status = delete_tree_item(Vcb, &tp);
2452         if (!NT_SUCCESS(Status)) {
2453             ERR("delete_tree_item returned %08x\n", Status);
2454             goto end;
2455         }
2456 
2457         Status = do_write(Vcb, NULL);
2458         if (!NT_SUCCESS(Status)) {
2459             ERR("do_write returned %08x\n", Status);
2460             goto end;
2461         }
2462 
2463         free_trees(Vcb);
2464     }
2465 
2466     Status = STATUS_SUCCESS;
2467 
2468 end:
2469     ExReleaseResourceLite(&Vcb->tree_lock);
2470 
2471     return Status;
2472 }
2473 
2474 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2475     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2476 
2477     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2478         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2479         opts->profiles = args->profiles;
2480     }
2481 
2482     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2483         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2484 
2485         opts->usage_start = 0;
2486         opts->usage_end = (UINT8)args->usage;
2487     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2488         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2489 
2490         opts->usage_start = (UINT8)args->usage_start;
2491         opts->usage_end = (UINT8)args->usage_end;
2492     }
2493 
2494     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2495         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2496         opts->devid = args->devid;
2497     }
2498 
2499     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2500         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2501         opts->drange_start = args->drange_start;
2502         opts->drange_end = args->drange_end;
2503     }
2504 
2505     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2506         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2507         opts->vrange_start = args->vrange_start;
2508         opts->vrange_end = args->vrange_end;
2509     }
2510 
2511     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2512         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2513 
2514         opts->limit_start = 0;
2515         opts->limit_end = args->limit;
2516     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2517         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2518 
2519         opts->limit_start = args->limit_start;
2520         opts->limit_end = args->limit_end;
2521     }
2522 
2523     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2524         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2525 
2526         opts->stripes_start = (UINT16)args->stripes_start;
2527         opts->stripes_end = (UINT16)args->stripes_end;
2528     }
2529 
2530     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2531         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2532         opts->convert = args->convert;
2533 
2534         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2535             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2536     }
2537 }
2538 
2539 static NTSTATUS remove_superblocks(device* dev) {
2540     NTSTATUS Status;
2541     superblock* sb;
2542     int i = 0;
2543 
2544     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2545     if (!sb) {
2546         ERR("out of memory\n");
2547         return STATUS_INSUFFICIENT_RESOURCES;
2548     }
2549 
2550     RtlZeroMemory(sb, sizeof(superblock));
2551 
2552     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2553         Status = write_data_phys(dev->devobj, superblock_addrs[i], sb, sizeof(superblock));
2554 
2555         if (!NT_SUCCESS(Status)) {
2556             ExFreePool(sb);
2557             return Status;
2558         }
2559 
2560         i++;
2561     }
2562 
2563     ExFreePool(sb);
2564 
2565     return STATUS_SUCCESS;
2566 }
2567 
2568 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2569     KEY searchkey;
2570     traverse_ptr tp;
2571     NTSTATUS Status;
2572     LIST_ENTRY* le;
2573     volume_device_extension* vde;
2574 
2575     if (Vcb->need_write) {
2576         Status = do_write(Vcb, NULL);
2577 
2578         if (!NT_SUCCESS(Status))
2579             ERR("do_write returned %08x\n", Status);
2580     } else
2581         Status = STATUS_SUCCESS;
2582 
2583     free_trees(Vcb);
2584 
2585     if (!NT_SUCCESS(Status))
2586         return Status;
2587 
2588     // remove entry in chunk tree
2589 
2590     searchkey.obj_id = 1;
2591     searchkey.obj_type = TYPE_DEV_ITEM;
2592     searchkey.offset = dev->devitem.dev_id;
2593 
2594     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, FALSE, NULL);
2595     if (!NT_SUCCESS(Status)) {
2596         ERR("find_item returned %08x\n", Status);
2597         return Status;
2598     }
2599 
2600     if (!keycmp(searchkey, tp.item->key)) {
2601         Status = delete_tree_item(Vcb, &tp);
2602 
2603         if (!NT_SUCCESS(Status)) {
2604             ERR("delete_tree_item returned %08x\n", Status);
2605             return Status;
2606         }
2607     }
2608 
2609     // remove stats entry in device tree
2610 
2611     searchkey.obj_id = 0;
2612     searchkey.obj_type = TYPE_DEV_STATS;
2613     searchkey.offset = dev->devitem.dev_id;
2614 
2615     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2616     if (!NT_SUCCESS(Status)) {
2617         ERR("find_item returned %08x\n", Status);
2618         return Status;
2619     }
2620 
2621     if (!keycmp(searchkey, tp.item->key)) {
2622         Status = delete_tree_item(Vcb, &tp);
2623 
2624         if (!NT_SUCCESS(Status)) {
2625             ERR("delete_tree_item returned %08x\n", Status);
2626             return Status;
2627         }
2628     }
2629 
2630     // update superblock
2631 
2632     Vcb->superblock.num_devices--;
2633     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2634     Vcb->devices_loaded--;
2635 
2636     RemoveEntryList(&dev->list_entry);
2637 
2638     // flush
2639 
2640     Status = do_write(Vcb, NULL);
2641     if (!NT_SUCCESS(Status))
2642         ERR("do_write returned %08x\n", Status);
2643 
2644     free_trees(Vcb);
2645 
2646     if (!NT_SUCCESS(Status))
2647         return Status;
2648 
2649     if (!dev->readonly && dev->devobj) {
2650         Status = remove_superblocks(dev);
2651         if (!NT_SUCCESS(Status))
2652             WARN("remove_superblocks returned %08x\n", Status);
2653     }
2654 
2655     // remove entry in volume list
2656 
2657     vde = Vcb->vde;
2658 
2659     if (dev->devobj) {
2660         pdo_device_extension* pdode = vde->pdode;
2661 
2662         ExAcquireResourceExclusiveLite(&pdode->child_lock, TRUE);
2663 
2664         le = pdode->children.Flink;
2665         while (le != &pdode->children) {
2666             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2667 
2668             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2669                 PFILE_OBJECT FileObject;
2670                 PDEVICE_OBJECT mountmgr;
2671                 UNICODE_STRING mmdevpath;
2672 
2673                 pdode->children_loaded--;
2674 
2675                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2676                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2677                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2678                     if (!NT_SUCCESS(Status))
2679                         ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2680                     else {
2681                         MOUNTDEV_NAME mdn;
2682 
2683                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), TRUE, NULL);
2684                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2685                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2686                         else {
2687                             MOUNTDEV_NAME* mdn2;
2688                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2689 
2690                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2691                             if (!mdn2)
2692                                 ERR("out of memory\n");
2693                             else {
2694                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, TRUE, NULL);
2695                                 if (!NT_SUCCESS(Status))
2696                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2697                                 else {
2698                                     UNICODE_STRING name;
2699 
2700                                     name.Buffer = mdn2->Name;
2701                                     name.Length = name.MaximumLength = mdn2->NameLength;
2702 
2703                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2704                                     if (!NT_SUCCESS(Status))
2705                                         WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2706                                 }
2707 
2708                                 ExFreePool(mdn2);
2709                             }
2710                         }
2711 
2712                         ObDereferenceObject(FileObject);
2713                     }
2714                 }
2715 
2716                 ExFreePool(vc->pnp_name.Buffer);
2717                 RemoveEntryList(&vc->list_entry);
2718                 ExFreePool(vc);
2719 
2720                 ObDereferenceObject(vc->fileobj);
2721 
2722                 break;
2723             }
2724 
2725             le = le->Flink;
2726         }
2727 
2728         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2729             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2730 
2731             le = pdode->children.Flink;
2732             while (le != &pdode->children) {
2733                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2734 
2735                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2736                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2737                     break;
2738                 }
2739 
2740                 le = le->Flink;
2741             }
2742         }
2743 
2744         pdode->num_children = Vcb->superblock.num_devices;
2745 
2746         ExReleaseResourceLite(&pdode->child_lock);
2747 
2748         // free dev
2749 
2750         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2751             trim_whole_device(dev);
2752     }
2753 
2754     while (!IsListEmpty(&dev->space)) {
2755         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2756         space* s = CONTAINING_RECORD(le2, space, list_entry);
2757 
2758         ExFreePool(s);
2759     }
2760 
2761     ExFreePool(dev);
2762 
2763     if (Vcb->trim) {
2764         Vcb->trim = FALSE;
2765 
2766         le = Vcb->devices.Flink;
2767         while (le != &Vcb->devices) {
2768             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2769 
2770             if (dev2->trim) {
2771                 Vcb->trim = TRUE;
2772                 break;
2773             }
2774 
2775             le = le->Flink;
2776         }
2777     }
2778 
2779     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2780 
2781     return STATUS_SUCCESS;
2782 }
2783 
2784 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2785     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2786     DEVICE_DATA_SET_RANGE* ranges;
2787     ULONG datalen, i;
2788     KEY searchkey;
2789     traverse_ptr tp;
2790     NTSTATUS Status;
2791     BOOL b;
2792     UINT64 lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2793     LIST_ENTRY* le;
2794 
2795     dev->num_trim_entries = 0;
2796 
2797     searchkey.obj_id = dev->devitem.dev_id;
2798     searchkey.obj_type = TYPE_DEV_EXTENT;
2799     searchkey.offset = 0;
2800 
2801     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2802     if (!NT_SUCCESS(Status)) {
2803         ERR("find_item returned %08x\n", Status);
2804         return;
2805     }
2806 
2807     do {
2808         traverse_ptr next_tp;
2809 
2810         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2811             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2812                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2813 
2814                 if (tp.item->key.offset > lastoff)
2815                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2816 
2817                 lastoff = tp.item->key.offset + de->length;
2818             } else {
2819                 ERR("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2820                 return;
2821             }
2822         }
2823 
2824         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
2825 
2826         if (b) {
2827             tp = next_tp;
2828             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2829                 break;
2830         }
2831     } while (b);
2832 
2833     if (lastoff < dev->devitem.num_bytes)
2834         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2835 
2836     if (dev->num_trim_entries == 0)
2837         return;
2838 
2839     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2840 
2841     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2842     if (!dmdsa) {
2843         ERR("out of memory\n");
2844         goto end;
2845     }
2846 
2847     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2848     dmdsa->Action = DeviceDsmAction_Trim;
2849     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2850     dmdsa->ParameterBlockOffset = 0;
2851     dmdsa->ParameterBlockLength = 0;
2852     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64));
2853     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2854 
2855     ranges = (DEVICE_DATA_SET_RANGE*)((UINT8*)dmdsa + dmdsa->DataSetRangesOffset);
2856 
2857     i = 0;
2858     le = dev->trim_list.Flink;
2859     while (le != &dev->trim_list) {
2860         space* s = CONTAINING_RECORD(le, space, list_entry);
2861 
2862         ranges[i].StartingOffset = s->address;
2863         ranges[i].LengthInBytes = s->size;
2864         i++;
2865 
2866         le = le->Flink;
2867     }
2868 
2869     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, TRUE, NULL);
2870     if (!NT_SUCCESS(Status))
2871         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2872 
2873     ExFreePool(dmdsa);
2874 
2875 end:
2876     while (!IsListEmpty(&dev->trim_list)) {
2877         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2878         ExFreePool(s);
2879     }
2880 
2881     dev->num_trim_entries = 0;
2882 }
2883 
2884 static NTSTATUS try_consolidation(device_extension* Vcb, UINT64 flags, chunk** newchunk) {
2885     NTSTATUS Status;
2886     BOOL changed;
2887     LIST_ENTRY* le;
2888     chunk* rc;
2889 
2890     // FIXME - allow with metadata chunks?
2891 
2892     while (TRUE) {
2893         rc = NULL;
2894 
2895         ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
2896 
2897         ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
2898 
2899         // choose the least-used chunk we haven't looked at yet
2900         le = Vcb->chunks.Flink;
2901         while (le != &Vcb->chunks) {
2902             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2903 
2904             // FIXME - skip full-size chunks over e.g. 90% full?
2905             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2906                 rc = c;
2907 
2908             le = le->Flink;
2909         }
2910 
2911         ExReleaseResourceLite(&Vcb->chunk_lock);
2912 
2913         if (!rc) {
2914             ExReleaseResourceLite(&Vcb->tree_lock);
2915             break;
2916         }
2917 
2918         if (rc->list_entry_balance.Flink) {
2919             RemoveEntryList(&rc->list_entry_balance);
2920             Vcb->balance.chunks_left--;
2921         }
2922 
2923         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2924         rc->reloc = TRUE;
2925 
2926         ExReleaseResourceLite(&Vcb->tree_lock);
2927 
2928         do {
2929             changed = FALSE;
2930 
2931             Status = balance_data_chunk(Vcb, rc, &changed);
2932             if (!NT_SUCCESS(Status)) {
2933                 ERR("balance_data_chunk returned %08x\n", Status);
2934                 Vcb->balance.status = Status;
2935                 rc->list_entry_balance.Flink = NULL;
2936                 rc->reloc = FALSE;
2937                 return Status;
2938             }
2939 
2940             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
2941 
2942             if (Vcb->readonly)
2943                 Vcb->balance.stopping = TRUE;
2944 
2945             if (Vcb->balance.stopping)
2946                 return STATUS_SUCCESS;
2947         } while (changed);
2948 
2949         rc->list_entry_balance.Flink = NULL;
2950 
2951         rc->changed = TRUE;
2952         rc->space_changed = TRUE;
2953         rc->balance_num = Vcb->balance.balance_num;
2954 
2955         Status = do_write(Vcb, NULL);
2956         if (!NT_SUCCESS(Status)) {
2957             ERR("do_write returned %08x\n", Status);
2958             return Status;
2959         }
2960 
2961         free_trees(Vcb);
2962     }
2963 
2964     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
2965 
2966     Status = alloc_chunk(Vcb, flags, &rc, TRUE);
2967 
2968     ExReleaseResourceLite(&Vcb->chunk_lock);
2969 
2970     if (NT_SUCCESS(Status)) {
2971         *newchunk = rc;
2972         return Status;
2973     } else {
2974         ERR("alloc_chunk returned %08x\n", Status);
2975         return Status;
2976     }
2977 }
2978 
2979 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2980     LIST_ENTRY* le;
2981 
2982     while (!IsListEmpty(&dev->space)) {
2983         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
2984 
2985         ExFreePool(s);
2986     }
2987 
2988     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
2989 
2990     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
2991 
2992     le = Vcb->chunks.Flink;
2993     while (le != &Vcb->chunks) {
2994         UINT16 n;
2995         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2996         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2997 
2998         for (n = 0; n < c->chunk_item->num_stripes; n++) {
2999             UINT64 stripe_size = 0;
3000 
3001             if (cis[n].dev_id == dev->devitem.dev_id) {
3002                 if (stripe_size == 0) {
3003                     UINT16 factor;
3004 
3005                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3006                         factor = c->chunk_item->num_stripes;
3007                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3008                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3009                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3010                         factor = c->chunk_item->num_stripes - 1;
3011                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3012                         factor = c->chunk_item->num_stripes - 2;
3013                     else // SINGLE, DUP, RAID1
3014                         factor = 1;
3015 
3016                     stripe_size = c->chunk_item->size / factor;
3017                 }
3018 
3019                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3020             }
3021         }
3022 
3023         le = le->Flink;
3024     }
3025 
3026     return STATUS_SUCCESS;
3027 }
3028 
3029 _Function_class_(KSTART_ROUTINE)
3030 #ifndef __REACTOS__
3031 void balance_thread(void* context) {
3032 #else
3033 void NTAPI balance_thread(void* context) {
3034 #endif
3035     device_extension* Vcb = (device_extension*)context;
3036     LIST_ENTRY chunks;
3037     LIST_ENTRY* le;
3038     UINT64 num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3039     UINT64 old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3040     NTSTATUS Status;
3041 
3042     Vcb->balance.balance_num++;
3043 
3044     Vcb->balance.stopping = FALSE;
3045     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, FALSE);
3046 
3047     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3048         old_data_flags = Vcb->data_flags;
3049         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3050 
3051         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3052     }
3053 
3054     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3055         old_metadata_flags = Vcb->metadata_flags;
3056         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3057     }
3058 
3059     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3060         old_system_flags = Vcb->system_flags;
3061         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3062     }
3063 
3064     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3065         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3066             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3067         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3068             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3069     }
3070 
3071     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3072     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3073 
3074     InitializeListHead(&chunks);
3075 
3076     // FIXME - what are we supposed to do with limit_start?
3077 
3078     if (!Vcb->readonly) {
3079         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3080             Status = add_balance_item(Vcb);
3081             if (!NT_SUCCESS(Status)) {
3082                 ERR("add_balance_item returned %08x\n", Status);
3083                 Vcb->balance.status = Status;
3084                 goto end;
3085             }
3086         } else {
3087             if (Vcb->need_write) {
3088                 Status = do_write(Vcb, NULL);
3089 
3090                 free_trees(Vcb);
3091 
3092                 if (!NT_SUCCESS(Status)) {
3093                     ERR("do_write returned %08x\n", Status);
3094                     Vcb->balance.status = Status;
3095                     goto end;
3096                 }
3097             }
3098         }
3099     }
3100 
3101     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3102 
3103     if (Vcb->balance.stopping)
3104         goto end;
3105 
3106     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3107 
3108     le = Vcb->chunks.Flink;
3109     while (le != &Vcb->chunks) {
3110         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3111         UINT8 sort;
3112 
3113         acquire_chunk_lock(c, Vcb);
3114 
3115         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3116             sort = BALANCE_OPTS_DATA;
3117         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3118             sort = BALANCE_OPTS_METADATA;
3119         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3120             sort = BALANCE_OPTS_SYSTEM;
3121         else {
3122             ERR("unexpected chunk type %llx\n", c->chunk_item->type);
3123             release_chunk_lock(c, Vcb);
3124             break;
3125         }
3126 
3127         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3128             should_balance_chunk(Vcb, sort, c)) {
3129             InsertTailList(&chunks, &c->list_entry_balance);
3130 
3131             num_chunks[sort]++;
3132             Vcb->balance.total_chunks++;
3133             Vcb->balance.chunks_left++;
3134         } else if (sort == BALANCE_OPTS_METADATA)
3135             okay_metadata_chunks++;
3136         else if (sort == BALANCE_OPTS_DATA)
3137             okay_data_chunks++;
3138         else if (sort == BALANCE_OPTS_SYSTEM)
3139             okay_system_chunks++;
3140 
3141         if (!c->cache_loaded) {
3142             Status = load_cache_chunk(Vcb, c, NULL);
3143 
3144             if (!NT_SUCCESS(Status)) {
3145                 ERR("load_cache_chunk returned %08x\n", Status);
3146                 Vcb->balance.status = Status;
3147                 release_chunk_lock(c, Vcb);
3148                 ExReleaseResourceLite(&Vcb->chunk_lock);
3149                 goto end;
3150             }
3151         }
3152 
3153         release_chunk_lock(c, Vcb);
3154 
3155         le = le->Flink;
3156     }
3157 
3158     ExReleaseResourceLite(&Vcb->chunk_lock);
3159 
3160     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3161     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3162         BOOL consolidated = FALSE;
3163         chunk* c;
3164 
3165         if (okay_metadata_chunks == 0) {
3166             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3167 
3168             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, TRUE);
3169             if (NT_SUCCESS(Status))
3170                 c->balance_num = Vcb->balance.balance_num;
3171             else if (Status != STATUS_DISK_FULL || consolidated) {
3172                 ERR("alloc_chunk returned %08x\n", Status);
3173                 ExReleaseResourceLite(&Vcb->chunk_lock);
3174                 Vcb->balance.status = Status;
3175                 goto end;
3176             }
3177 
3178             ExReleaseResourceLite(&Vcb->chunk_lock);
3179 
3180             if (Status == STATUS_DISK_FULL) {
3181                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3182                 if (!NT_SUCCESS(Status)) {
3183                     ERR("try_consolidation returned %08x\n", Status);
3184                     Vcb->balance.status = Status;
3185                     goto end;
3186                 } else
3187                     c->balance_num = Vcb->balance.balance_num;
3188 
3189                 consolidated = TRUE;
3190 
3191                 if (Vcb->balance.stopping)
3192                     goto end;
3193             }
3194         }
3195 
3196         if (okay_data_chunks == 0) {
3197             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3198 
3199             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, TRUE);
3200             if (NT_SUCCESS(Status))
3201                 c->balance_num = Vcb->balance.balance_num;
3202             else if (Status != STATUS_DISK_FULL || consolidated) {
3203                 ERR("alloc_chunk returned %08x\n", Status);
3204                 ExReleaseResourceLite(&Vcb->chunk_lock);
3205                 Vcb->balance.status = Status;
3206                 goto end;
3207             }
3208 
3209             ExReleaseResourceLite(&Vcb->chunk_lock);
3210 
3211             if (Status == STATUS_DISK_FULL) {
3212                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3213                 if (!NT_SUCCESS(Status)) {
3214                     ERR("try_consolidation returned %08x\n", Status);
3215                     Vcb->balance.status = Status;
3216                     goto end;
3217                 } else
3218                     c->balance_num = Vcb->balance.balance_num;
3219 
3220                 consolidated = TRUE;
3221 
3222                 if (Vcb->balance.stopping)
3223                     goto end;
3224             }
3225         }
3226 
3227         if (okay_system_chunks == 0) {
3228             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3229 
3230             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, TRUE);
3231             if (NT_SUCCESS(Status))
3232                 c->balance_num = Vcb->balance.balance_num;
3233             else if (Status != STATUS_DISK_FULL || consolidated) {
3234                 ERR("alloc_chunk returned %08x\n", Status);
3235                 ExReleaseResourceLite(&Vcb->chunk_lock);
3236                 Vcb->balance.status = Status;
3237                 goto end;
3238             }
3239 
3240             ExReleaseResourceLite(&Vcb->chunk_lock);
3241 
3242             if (Status == STATUS_DISK_FULL) {
3243                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3244                 if (!NT_SUCCESS(Status)) {
3245                     ERR("try_consolidation returned %08x\n", Status);
3246                     Vcb->balance.status = Status;
3247                     goto end;
3248                 } else
3249                     c->balance_num = Vcb->balance.balance_num;
3250 
3251                 consolidated = TRUE;
3252 
3253                 if (Vcb->balance.stopping)
3254                     goto end;
3255             }
3256         }
3257     }
3258 
3259     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3260 
3261     le = chunks.Flink;
3262     while (le != &chunks) {
3263         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3264 
3265         c->reloc = TRUE;
3266 
3267         le = le->Flink;
3268     }
3269 
3270     ExReleaseResourceLite(&Vcb->chunk_lock);
3271 
3272     // do data chunks before metadata
3273     le = chunks.Flink;
3274     while (le != &chunks) {
3275         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3276         LIST_ENTRY* le2 = le->Flink;
3277 
3278         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3279             BOOL changed;
3280 
3281             do {
3282                 changed = FALSE;
3283 
3284                 Status = balance_data_chunk(Vcb, c, &changed);
3285                 if (!NT_SUCCESS(Status)) {
3286                     ERR("balance_data_chunk returned %08x\n", Status);
3287                     Vcb->balance.status = Status;
3288                     goto end;
3289                 }
3290 
3291                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3292 
3293                 if (Vcb->readonly)
3294                     Vcb->balance.stopping = TRUE;
3295 
3296                 if (Vcb->balance.stopping)
3297                     break;
3298             } while (changed);
3299 
3300             c->changed = TRUE;
3301             c->space_changed = TRUE;
3302         }
3303 
3304         if (Vcb->balance.stopping)
3305             goto end;
3306 
3307         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3308             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3309             RemoveEntryList(&c->list_entry_balance);
3310             c->list_entry_balance.Flink = NULL;
3311 
3312             Vcb->balance.chunks_left--;
3313         }
3314 
3315         le = le2;
3316     }
3317 
3318     // do metadata chunks
3319     while (!IsListEmpty(&chunks)) {
3320         chunk* c;
3321         BOOL changed;
3322 
3323         le = RemoveHeadList(&chunks);
3324         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3325 
3326         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3327             do {
3328                 Status = balance_metadata_chunk(Vcb, c, &changed);
3329                 if (!NT_SUCCESS(Status)) {
3330                     ERR("balance_metadata_chunk returned %08x\n", Status);
3331                     Vcb->balance.status = Status;
3332                     goto end;
3333                 }
3334 
3335                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3336 
3337                 if (Vcb->readonly)
3338                     Vcb->balance.stopping = TRUE;
3339 
3340                 if (Vcb->balance.stopping)
3341                     break;
3342             } while (changed);
3343 
3344             c->changed = TRUE;
3345             c->space_changed = TRUE;
3346         }
3347 
3348         if (Vcb->balance.stopping)
3349             break;
3350 
3351         c->list_entry_balance.Flink = NULL;
3352 
3353         Vcb->balance.chunks_left--;
3354     }
3355 
3356 end:
3357     if (!Vcb->readonly) {
3358         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3359             le = chunks.Flink;
3360             while (le != &chunks) {
3361                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3362                 c->reloc = FALSE;
3363 
3364                 le = le->Flink;
3365                 c->list_entry_balance.Flink = NULL;
3366             }
3367 
3368             if (old_data_flags != 0)
3369                 Vcb->data_flags = old_data_flags;
3370 
3371             if (old_metadata_flags != 0)
3372                 Vcb->metadata_flags = old_metadata_flags;
3373 
3374             if (old_system_flags != 0)
3375                 Vcb->system_flags = old_system_flags;
3376         }
3377 
3378         if (Vcb->balance.removing) {
3379             device* dev = NULL;
3380 
3381             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3382 
3383             le = Vcb->devices.Flink;
3384             while (le != &Vcb->devices) {
3385                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3386 
3387                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3388                     dev = dev2;
3389                     break;
3390                 }
3391 
3392                 le = le->Flink;
3393             }
3394 
3395             if (dev) {
3396                 if (Vcb->balance.chunks_left == 0) {
3397                     Status = finish_removing_device(Vcb, dev);
3398 
3399                     if (!NT_SUCCESS(Status)) {
3400                         ERR("finish_removing_device returned %08x\n", Status);
3401                         dev->reloc = FALSE;
3402                     }
3403                 } else
3404                     dev->reloc = FALSE;
3405             }
3406 
3407             ExReleaseResourceLite(&Vcb->tree_lock);
3408         } else if (Vcb->balance.shrinking) {
3409             device* dev = NULL;
3410 
3411             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3412 
3413             le = Vcb->devices.Flink;
3414             while (le != &Vcb->devices) {
3415                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3416 
3417                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3418                     dev = dev2;
3419                     break;
3420                 }
3421 
3422                 le = le->Flink;
3423             }
3424 
3425             if (!dev) {
3426                 ERR("could not find device %llx\n", Vcb->balance.opts[0].devid);
3427                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3428             }
3429 
3430             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3431                 if (dev) {
3432                     Status = regenerate_space_list(Vcb, dev);
3433                     if (!NT_SUCCESS(Status))
3434                         WARN("regenerate_space_list returned %08x\n", Status);
3435                 }
3436             } else {
3437                 UINT64 old_size;
3438 
3439                 old_size = dev->devitem.num_bytes;
3440                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3441 
3442                 Status = update_dev_item(Vcb, dev, NULL);
3443                 if (!NT_SUCCESS(Status)) {
3444                     ERR("update_dev_item returned %08x\n", Status);
3445                     dev->devitem.num_bytes = old_size;
3446                     Vcb->balance.status = Status;
3447 
3448                     Status = regenerate_space_list(Vcb, dev);
3449                     if (!NT_SUCCESS(Status))
3450                         WARN("regenerate_space_list returned %08x\n", Status);
3451                 } else {
3452                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3453 
3454                     Status = do_write(Vcb, NULL);
3455                     if (!NT_SUCCESS(Status))
3456                         ERR("do_write returned %08x\n", Status);
3457 
3458                     free_trees(Vcb);
3459                 }
3460             }
3461 
3462             ExReleaseResourceLite(&Vcb->tree_lock);
3463 
3464             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3465                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3466         } else {
3467             Status = remove_balance_item(Vcb);
3468             if (!NT_SUCCESS(Status)) {
3469                 ERR("remove_balance_item returned %08x\n", Status);
3470                 goto end;
3471             }
3472         }
3473 
3474         if (Vcb->trim && !Vcb->options.no_trim) {
3475             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3476 
3477             le = Vcb->devices.Flink;
3478             while (le != &Vcb->devices) {
3479                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3480 
3481                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3482                     trim_unalloc_space(Vcb, dev2);
3483 
3484                 le = le->Flink;
3485             }
3486 
3487             ExReleaseResourceLite(&Vcb->tree_lock);
3488         }
3489     }
3490 
3491     ZwClose(Vcb->balance.thread);
3492     Vcb->balance.thread = NULL;
3493 
3494     KeSetEvent(&Vcb->balance.finished, 0, FALSE);
3495 }
3496 
3497 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3498     NTSTATUS Status;
3499     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3500     UINT8 i;
3501 
3502     if (length < sizeof(btrfs_start_balance) || !data)
3503         return STATUS_INVALID_PARAMETER;
3504 
3505     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3506         return STATUS_PRIVILEGE_NOT_HELD;
3507 
3508     if (Vcb->locked) {
3509         WARN("cannot start balance while locked\n");
3510         return STATUS_DEVICE_NOT_READY;
3511     }
3512 
3513     if (Vcb->scrub.thread) {
3514         WARN("cannot start balance while scrub running\n");
3515         return STATUS_DEVICE_NOT_READY;
3516     }
3517 
3518     if (Vcb->balance.thread) {
3519         WARN("balance already running\n");
3520         return STATUS_DEVICE_NOT_READY;
3521     }
3522 
3523     if (Vcb->readonly)
3524         return STATUS_MEDIA_WRITE_PROTECTED;
3525 
3526     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3527         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3528         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3529         return STATUS_SUCCESS;
3530 
3531     for (i = 0; i < 3; i++) {
3532         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3533             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3534                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3535                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3536 
3537                 if (bsb->opts[i].profiles == 0)
3538                     return STATUS_INVALID_PARAMETER;
3539             }
3540 
3541             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3542                 if (bsb->opts[i].devid == 0)
3543                     return STATUS_INVALID_PARAMETER;
3544             }
3545 
3546             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3547                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3548                     return STATUS_INVALID_PARAMETER;
3549             }
3550 
3551             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3552                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3553                     return STATUS_INVALID_PARAMETER;
3554             }
3555 
3556             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3557                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3558                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3559 
3560                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3561                     return STATUS_INVALID_PARAMETER;
3562             }
3563 
3564             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3565                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3566                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3567 
3568                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3569                     return STATUS_INVALID_PARAMETER;
3570             }
3571 
3572             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3573                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3574                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3575 
3576                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3577                     return STATUS_INVALID_PARAMETER;
3578             }
3579 
3580             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3581                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3582                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3583                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3584                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3585                     return STATUS_INVALID_PARAMETER;
3586             }
3587         }
3588     }
3589 
3590     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3591     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3592     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3593 
3594     Vcb->balance.paused = FALSE;
3595     Vcb->balance.removing = FALSE;
3596     Vcb->balance.shrinking = FALSE;
3597     Vcb->balance.status = STATUS_SUCCESS;
3598     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3599 
3600     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3601     if (!NT_SUCCESS(Status)) {
3602         ERR("PsCreateSystemThread returned %08x\n", Status);
3603         return Status;
3604     }
3605 
3606     return STATUS_SUCCESS;
3607 }
3608 
3609 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3610     KEY searchkey;
3611     traverse_ptr tp;
3612     NTSTATUS Status;
3613     BALANCE_ITEM* bi;
3614     int i;
3615 
3616     searchkey.obj_id = BALANCE_ITEM_ID;
3617     searchkey.obj_type = TYPE_TEMP_ITEM;
3618     searchkey.offset = 0;
3619 
3620     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
3621     if (!NT_SUCCESS(Status)) {
3622         ERR("find_item returned %08x\n", Status);
3623         return Status;
3624     }
3625 
3626     if (keycmp(tp.item->key, searchkey)) {
3627         TRACE("no balance item found\n");
3628         return STATUS_NOT_FOUND;
3629     }
3630 
3631     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3632         WARN("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3633              tp.item->size, sizeof(BALANCE_ITEM));
3634         return STATUS_INTERNAL_ERROR;
3635     }
3636 
3637     bi = (BALANCE_ITEM*)tp.item->data;
3638 
3639     if (bi->flags & BALANCE_FLAGS_DATA)
3640         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3641 
3642     if (bi->flags & BALANCE_FLAGS_METADATA)
3643         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3644 
3645     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3646         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3647 
3648     // do the heuristics that Linux driver does
3649 
3650     for (i = 0; i < 3; i++) {
3651         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3652             // if converting, don't redo chunks already done
3653 
3654             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3655                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3656 
3657             // don't balance chunks more than 90% filled - presumably these
3658             // have already been done
3659 
3660             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3661                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3662             ) {
3663                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3664                 Vcb->balance.opts[i].usage_start = 0;
3665                 Vcb->balance.opts[i].usage_end = 90;
3666             }
3667         }
3668     }
3669 
3670     if (Vcb->readonly || Vcb->options.skip_balance)
3671         Vcb->balance.paused = TRUE;
3672     else
3673         Vcb->balance.paused = FALSE;
3674 
3675     Vcb->balance.removing = FALSE;
3676     Vcb->balance.shrinking = FALSE;
3677     Vcb->balance.status = STATUS_SUCCESS;
3678     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3679 
3680     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3681     if (!NT_SUCCESS(Status)) {
3682         ERR("PsCreateSystemThread returned %08x\n", Status);
3683         return Status;
3684     }
3685 
3686     return STATUS_SUCCESS;
3687 }
3688 
3689 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3690     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3691 
3692     if (length < sizeof(btrfs_query_balance) || !data)
3693         return STATUS_INVALID_PARAMETER;
3694 
3695     if (!Vcb->balance.thread) {
3696         bqb->status = BTRFS_BALANCE_STOPPED;
3697 
3698         if (!NT_SUCCESS(Vcb->balance.status)) {
3699             bqb->status |= BTRFS_BALANCE_ERROR;
3700             bqb->error = Vcb->balance.status;
3701         }
3702 
3703         return STATUS_SUCCESS;
3704     }
3705 
3706     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3707 
3708     if (Vcb->balance.removing)
3709         bqb->status |= BTRFS_BALANCE_REMOVAL;
3710 
3711     if (Vcb->balance.shrinking)
3712         bqb->status |= BTRFS_BALANCE_SHRINKING;
3713 
3714     if (!NT_SUCCESS(Vcb->balance.status))
3715         bqb->status |= BTRFS_BALANCE_ERROR;
3716 
3717     bqb->chunks_left = Vcb->balance.chunks_left;
3718     bqb->total_chunks = Vcb->balance.total_chunks;
3719     bqb->error = Vcb->balance.status;
3720     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3721     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3722     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3723 
3724     return STATUS_SUCCESS;
3725 }
3726 
3727 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3728     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3729         return STATUS_PRIVILEGE_NOT_HELD;
3730 
3731     if (!Vcb->balance.thread)
3732         return STATUS_DEVICE_NOT_READY;
3733 
3734     if (Vcb->balance.paused)
3735         return STATUS_DEVICE_NOT_READY;
3736 
3737     Vcb->balance.paused = TRUE;
3738     KeClearEvent(&Vcb->balance.event);
3739 
3740     return STATUS_SUCCESS;
3741 }
3742 
3743 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3744     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3745         return STATUS_PRIVILEGE_NOT_HELD;
3746 
3747     if (!Vcb->balance.thread)
3748         return STATUS_DEVICE_NOT_READY;
3749 
3750     if (!Vcb->balance.paused)
3751         return STATUS_DEVICE_NOT_READY;
3752 
3753     if (Vcb->readonly)
3754         return STATUS_MEDIA_WRITE_PROTECTED;
3755 
3756     Vcb->balance.paused = FALSE;
3757     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3758 
3759     return STATUS_SUCCESS;
3760 }
3761 
3762 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3763     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3764         return STATUS_PRIVILEGE_NOT_HELD;
3765 
3766     if (!Vcb->balance.thread)
3767         return STATUS_DEVICE_NOT_READY;
3768 
3769     Vcb->balance.paused = FALSE;
3770     Vcb->balance.stopping = TRUE;
3771     Vcb->balance.status = STATUS_SUCCESS;
3772     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3773 
3774     return STATUS_SUCCESS;
3775 }
3776 
3777 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3778     UINT64 devid;
3779     LIST_ENTRY* le;
3780     device* dev = NULL;
3781     NTSTATUS Status;
3782     int i;
3783     UINT64 num_rw_devices;
3784 
3785     TRACE("(%p, %p, %x)\n", Vcb, data, length);
3786 
3787     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3788         return STATUS_PRIVILEGE_NOT_HELD;
3789 
3790     if (length < sizeof(UINT64))
3791         return STATUS_INVALID_PARAMETER;
3792 
3793     devid = *(UINT64*)data;
3794 
3795     ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
3796 
3797     if (Vcb->readonly) {
3798         ExReleaseResourceLite(&Vcb->tree_lock);
3799         return STATUS_MEDIA_WRITE_PROTECTED;
3800     }
3801 
3802     num_rw_devices = 0;
3803 
3804     le = Vcb->devices.Flink;
3805     while (le != &Vcb->devices) {
3806         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3807 
3808         if (dev2->devitem.dev_id == devid)
3809             dev = dev2;
3810 
3811         if (!dev2->readonly)
3812             num_rw_devices++;
3813 
3814         le = le->Flink;
3815     }
3816 
3817     if (!dev) {
3818         ExReleaseResourceLite(&Vcb->tree_lock);
3819         WARN("device %llx not found\n", devid);
3820         return STATUS_NOT_FOUND;
3821     }
3822 
3823     if (!dev->readonly) {
3824         if (num_rw_devices == 1) {
3825             ExReleaseResourceLite(&Vcb->tree_lock);
3826             WARN("not removing last non-readonly device\n");
3827             return STATUS_INVALID_PARAMETER;
3828         }
3829 
3830         if (num_rw_devices == 4 &&
3831             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3832              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3833         ) {
3834             ExReleaseResourceLite(&Vcb->tree_lock);
3835             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3836             return STATUS_CANNOT_DELETE;
3837         }
3838 
3839         if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3840             ExReleaseResourceLite(&Vcb->tree_lock);
3841             ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3842             return STATUS_CANNOT_DELETE;
3843         }
3844 
3845         if (num_rw_devices == 2 &&
3846             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3847              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3848         ) {
3849             ExReleaseResourceLite(&Vcb->tree_lock);
3850             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3851             return STATUS_CANNOT_DELETE;
3852         }
3853     }
3854 
3855     ExReleaseResourceLite(&Vcb->tree_lock);
3856 
3857     if (Vcb->balance.thread) {
3858         WARN("balance already running\n");
3859         return STATUS_DEVICE_NOT_READY;
3860     }
3861 
3862     dev->reloc = TRUE;
3863 
3864     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3865 
3866     for (i = 0; i < 3; i++) {
3867         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3868         Vcb->balance.opts[i].devid = devid;
3869     }
3870 
3871     Vcb->balance.paused = FALSE;
3872     Vcb->balance.removing = TRUE;
3873     Vcb->balance.shrinking = FALSE;
3874     Vcb->balance.status = STATUS_SUCCESS;
3875     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3876 
3877     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3878     if (!NT_SUCCESS(Status)) {
3879         ERR("PsCreateSystemThread returned %08x\n", Status);
3880         dev->reloc = FALSE;
3881         return Status;
3882     }
3883 
3884     return STATUS_SUCCESS;
3885 }
3886