xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 7eead935)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include <ntddstor.h>
21 
22 typedef struct {
23     UINT64 address;
24     UINT64 new_address;
25     tree_header* data;
26     EXTENT_ITEM* ei;
27     tree* t;
28     BOOL system;
29     LIST_ENTRY refs;
30     LIST_ENTRY list_entry;
31 } metadata_reloc;
32 
33 typedef struct {
34     UINT8 type;
35     UINT64 hash;
36 
37     union {
38         TREE_BLOCK_REF tbr;
39         SHARED_BLOCK_REF sbr;
40     };
41 
42     metadata_reloc* parent;
43     BOOL top;
44     LIST_ENTRY list_entry;
45 } metadata_reloc_ref;
46 
47 typedef struct {
48     UINT64 address;
49     UINT64 size;
50     UINT64 new_address;
51     chunk* newchunk;
52     EXTENT_ITEM* ei;
53     LIST_ENTRY refs;
54     LIST_ENTRY list_entry;
55 } data_reloc;
56 
57 typedef struct {
58     UINT8 type;
59     UINT64 hash;
60 
61     union {
62         EXTENT_DATA_REF edr;
63         SHARED_DATA_REF sdr;
64     };
65 
66     metadata_reloc* parent;
67     LIST_ENTRY list_entry;
68 } data_reloc_ref;
69 
70 #ifndef __REACTOS__
71 #ifndef _MSC_VER // not in mingw yet
72 #define DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED 0x80000000
73 #endif
74 #endif
75 
76 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
77 
78 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
79                                    BOOL skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
80     NTSTATUS Status;
81     metadata_reloc* mr;
82     EXTENT_ITEM* ei;
83     UINT16 len;
84     UINT64 inline_rc;
85     UINT8* ptr;
86 
87     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
88     if (!mr) {
89         ERR("out of memory\n");
90         return STATUS_INSUFFICIENT_RESOURCES;
91     }
92 
93     mr->address = tp->item->key.obj_id;
94     mr->data = NULL;
95     mr->ei = (EXTENT_ITEM*)tp->item->data;
96     mr->system = FALSE;
97     InitializeListHead(&mr->refs);
98 
99     Status = delete_tree_item(Vcb, tp);
100     if (!NT_SUCCESS(Status)) {
101         ERR("delete_tree_item returned %08x\n", Status);
102         ExFreePool(mr);
103         return Status;
104     }
105 
106     if (!c)
107         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
108 
109     if (c) {
110         acquire_chunk_lock(c, Vcb);
111 
112         c->used -= Vcb->superblock.node_size;
113 
114         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
115 
116         release_chunk_lock(c, Vcb);
117     }
118 
119     ei = (EXTENT_ITEM*)tp->item->data;
120     inline_rc = 0;
121 
122     len = tp->item->size - sizeof(EXTENT_ITEM);
123     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
124     if (!skinny) {
125         len -= sizeof(EXTENT_ITEM2);
126         ptr += sizeof(EXTENT_ITEM2);
127     }
128 
129     while (len > 0) {
130         UINT8 secttype = *ptr;
131         UINT16 sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
132         metadata_reloc_ref* ref;
133 
134         len--;
135 
136         if (sectlen > len) {
137             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
138             return STATUS_INTERNAL_ERROR;
139         }
140 
141         if (sectlen == 0) {
142             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
143             return STATUS_INTERNAL_ERROR;
144         }
145 
146         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
147         if (!ref) {
148             ERR("out of memory\n");
149             return STATUS_INSUFFICIENT_RESOURCES;
150         }
151 
152         if (secttype == TYPE_TREE_BLOCK_REF) {
153             ref->type = TYPE_TREE_BLOCK_REF;
154             RtlCopyMemory(&ref->tbr, ptr + sizeof(UINT8), sizeof(TREE_BLOCK_REF));
155             inline_rc++;
156         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
157             ref->type = TYPE_SHARED_BLOCK_REF;
158             RtlCopyMemory(&ref->sbr, ptr + sizeof(UINT8), sizeof(SHARED_BLOCK_REF));
159             inline_rc++;
160         } else {
161             ERR("unexpected tree type %x\n", secttype);
162             ExFreePool(ref);
163             return STATUS_INTERNAL_ERROR;
164         }
165 
166         ref->parent = NULL;
167         ref->top = FALSE;
168         InsertTailList(&mr->refs, &ref->list_entry);
169 
170         len -= sectlen;
171         ptr += sizeof(UINT8) + sectlen;
172     }
173 
174     if (inline_rc < ei->refcount) { // look for non-inline entries
175         traverse_ptr tp2 = *tp, next_tp;
176 
177         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
178             tp2 = next_tp;
179 
180             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
181                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
182                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
183                     if (!ref) {
184                         ERR("out of memory\n");
185                         return STATUS_INSUFFICIENT_RESOURCES;
186                     }
187 
188                     ref->type = TYPE_TREE_BLOCK_REF;
189                     ref->tbr.offset = tp2.item->key.offset;
190                     ref->parent = NULL;
191                     ref->top = FALSE;
192                     InsertTailList(&mr->refs, &ref->list_entry);
193 
194                     Status = delete_tree_item(Vcb, &tp2);
195                     if (!NT_SUCCESS(Status)) {
196                         ERR("delete_tree_item returned %08x\n", Status);
197                         return Status;
198                     }
199                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
200                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
201                     if (!ref) {
202                         ERR("out of memory\n");
203                         return STATUS_INSUFFICIENT_RESOURCES;
204                     }
205 
206                     ref->type = TYPE_SHARED_BLOCK_REF;
207                     ref->sbr.offset = tp2.item->key.offset;
208                     ref->parent = NULL;
209                     ref->top = FALSE;
210                     InsertTailList(&mr->refs, &ref->list_entry);
211 
212                     Status = delete_tree_item(Vcb, &tp2);
213                     if (!NT_SUCCESS(Status)) {
214                         ERR("delete_tree_item returned %08x\n", Status);
215                         return Status;
216                     }
217                 }
218             } else
219                 break;
220         }
221     }
222 
223     InsertTailList(items, &mr->list_entry);
224 
225     if (mr2)
226         *mr2 = mr;
227 
228     return STATUS_SUCCESS;
229 }
230 
231 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
232                                           UINT64 address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
233     LIST_ENTRY* le;
234     KEY searchkey;
235     traverse_ptr tp;
236     BOOL skinny = FALSE;
237     NTSTATUS Status;
238 
239     le = items->Flink;
240     while (le != items) {
241         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
242 
243         if (mr->address == address) {
244             *mr2 = mr;
245             return STATUS_SUCCESS;
246         }
247 
248         le = le->Flink;
249     }
250 
251     searchkey.obj_id = address;
252     searchkey.obj_type = TYPE_METADATA_ITEM;
253     searchkey.offset = 0xffffffffffffffff;
254 
255     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
256     if (!NT_SUCCESS(Status)) {
257         ERR("find_item returned %08x\n", Status);
258         return Status;
259     }
260 
261     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
262         skinny = TRUE;
263     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
264              tp.item->size >= sizeof(EXTENT_ITEM)) {
265         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
266 
267         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
268             ERR("EXTENT_ITEM for %llx found, but tree flag not set\n", address);
269             return STATUS_INTERNAL_ERROR;
270         }
271     } else {
272         ERR("could not find valid EXTENT_ITEM for address %llx\n", address);
273         return STATUS_INTERNAL_ERROR;
274     }
275 
276     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
277     if (!NT_SUCCESS(Status)) {
278         ERR("add_metadata_reloc returned %08x\n", Status);
279         return Status;
280     }
281 
282     return STATUS_SUCCESS;
283 }
284 
285 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
286     LIST_ENTRY newlist, *le;
287 
288     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
289         return;
290 
291     // insertion sort
292 
293     InitializeListHead(&newlist);
294 
295     while (!IsListEmpty(&mr->refs)) {
296         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
297         BOOL inserted = FALSE;
298 
299         if (ref->type == TYPE_TREE_BLOCK_REF)
300             ref->hash = ref->tbr.offset;
301         else if (ref->type == TYPE_SHARED_BLOCK_REF)
302             ref->hash = ref->parent->new_address;
303 
304         le = newlist.Flink;
305         while (le != &newlist) {
306             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
307 
308             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
309                 InsertHeadList(le->Blink, &ref->list_entry);
310                 inserted = TRUE;
311                 break;
312             }
313 
314             le = le->Flink;
315         }
316 
317         if (!inserted)
318             InsertTailList(&newlist, &ref->list_entry);
319     }
320 
321     newlist.Flink->Blink = &mr->refs;
322     newlist.Blink->Flink = &mr->refs;
323     mr->refs.Flink = newlist.Flink;
324     mr->refs.Blink = newlist.Blink;
325 }
326 
327 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
328     NTSTATUS Status;
329     LIST_ENTRY* le;
330     UINT64 rc = 0;
331     UINT16 inline_len;
332     BOOL all_inline = TRUE;
333     metadata_reloc_ref* first_noninline = NULL;
334     EXTENT_ITEM* ei;
335     UINT8* ptr;
336 
337     inline_len = sizeof(EXTENT_ITEM);
338     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
339         inline_len += sizeof(EXTENT_ITEM2);
340 
341     sort_metadata_reloc_refs(mr);
342 
343     le = mr->refs.Flink;
344     while (le != &mr->refs) {
345         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
346         UINT16 extlen = 0;
347 
348         rc++;
349 
350         if (ref->type == TYPE_TREE_BLOCK_REF)
351             extlen += sizeof(TREE_BLOCK_REF);
352         else if (ref->type == TYPE_SHARED_BLOCK_REF)
353             extlen += sizeof(SHARED_BLOCK_REF);
354 
355         if (all_inline) {
356             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
357                 all_inline = FALSE;
358                 first_noninline = ref;
359             } else
360                 inline_len += extlen + 1;
361         }
362 
363         le = le->Flink;
364     }
365 
366     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
367     if (!ei) {
368         ERR("out of memory\n");
369         return STATUS_INSUFFICIENT_RESOURCES;
370     }
371 
372     ei->refcount = rc;
373     ei->generation = mr->ei->generation;
374     ei->flags = mr->ei->flags;
375     ptr = (UINT8*)&ei[1];
376 
377     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
378         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
379 
380         ei2->firstitem = *(KEY*)&mr->data[1];
381         ei2->level = mr->data->level;
382 
383         ptr += sizeof(EXTENT_ITEM2);
384     }
385 
386     le = mr->refs.Flink;
387     while (le != &mr->refs) {
388         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
389 
390         if (ref == first_noninline)
391             break;
392 
393         *ptr = ref->type;
394         ptr++;
395 
396         if (ref->type == TYPE_TREE_BLOCK_REF) {
397             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
398 
399             tbr->offset = ref->tbr.offset;
400 
401             ptr += sizeof(TREE_BLOCK_REF);
402         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
403             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
404 
405             sbr->offset = ref->parent->new_address;
406 
407             ptr += sizeof(SHARED_BLOCK_REF);
408         }
409 
410         le = le->Flink;
411     }
412 
413     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
414         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
415     else
416         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
417 
418     if (!NT_SUCCESS(Status)) {
419         ERR("insert_tree_item returned %08x\n", Status);
420         ExFreePool(ei);
421         return Status;
422     }
423 
424     if (!all_inline) {
425         le = &first_noninline->list_entry;
426 
427         while (le != &mr->refs) {
428             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
429 
430             if (ref->type == TYPE_TREE_BLOCK_REF) {
431                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
432                 if (!NT_SUCCESS(Status)) {
433                     ERR("insert_tree_item returned %08x\n", Status);
434                     return Status;
435                 }
436             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
437                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
438                 if (!NT_SUCCESS(Status)) {
439                     ERR("insert_tree_item returned %08x\n", Status);
440                     return Status;
441                 }
442             }
443 
444             le = le->Flink;
445         }
446     }
447 
448     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
449         if (mr->data->level > 0) {
450             UINT16 i;
451             internal_node* in = (internal_node*)&mr->data[1];
452 
453             for (i = 0; i < mr->data->num_items; i++) {
454                 UINT64 sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
455 
456                 if (sbrrc > 0) {
457                     SHARED_BLOCK_REF sbr;
458 
459                     sbr.offset = mr->new_address;
460 
461                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
462                     if (!NT_SUCCESS(Status)) {
463                         ERR("increase_extent_refcount returned %08x\n", Status);
464                         return Status;
465                     }
466 
467                     sbr.offset = mr->address;
468 
469                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
470                                                       sbr.offset, FALSE, NULL);
471                     if (!NT_SUCCESS(Status)) {
472                         ERR("decrease_extent_refcount returned %08x\n", Status);
473                         return Status;
474                     }
475                 }
476             }
477         } else {
478             UINT16 i;
479             leaf_node* ln = (leaf_node*)&mr->data[1];
480 
481             for (i = 0; i < mr->data->num_items; i++) {
482                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
483                     EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
484 
485                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
486                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
487 
488                         if (ed2->size > 0) { // not sparse
489                             UINT32 sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
490 
491                             if (sdrrc > 0) {
492                                 SHARED_DATA_REF sdr;
493                                 chunk* c;
494 
495                                 sdr.offset = mr->new_address;
496                                 sdr.count = sdrrc;
497 
498                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
499                                 if (!NT_SUCCESS(Status)) {
500                                     ERR("increase_extent_refcount returned %08x\n", Status);
501                                     return Status;
502                                 }
503 
504                                 sdr.offset = mr->address;
505 
506                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
507                                                                   sdr.offset, FALSE, NULL);
508                                 if (!NT_SUCCESS(Status)) {
509                                     ERR("decrease_extent_refcount returned %08x\n", Status);
510                                     return Status;
511                                 }
512 
513                                 c = get_chunk_from_address(Vcb, ed2->address);
514 
515                                 if (c) {
516                                     // check changed_extents
517 
518                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, TRUE);
519 
520                                     le = c->changed_extents.Flink;
521 
522                                     while (le != &c->changed_extents) {
523                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
524 
525                                         if (ce->address == ed2->address) {
526                                             LIST_ENTRY* le2;
527 
528                                             le2 = ce->refs.Flink;
529                                             while (le2 != &ce->refs) {
530                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
531 
532                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
533                                                     cer->sdr.offset = mr->new_address;
534                                                     break;
535                                                 }
536 
537                                                 le2 = le2->Flink;
538                                             }
539 
540                                             le2 = ce->old_refs.Flink;
541                                             while (le2 != &ce->old_refs) {
542                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
543 
544                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
545                                                     cer->sdr.offset = mr->new_address;
546                                                     break;
547                                                 }
548 
549                                                 le2 = le2->Flink;
550                                             }
551 
552                                             break;
553                                         }
554 
555                                         le = le->Flink;
556                                     }
557 
558                                     ExReleaseResourceLite(&c->changed_extents_lock);
559                                 }
560                             }
561                         }
562                     }
563                 }
564             }
565         }
566     }
567 
568     return STATUS_SUCCESS;
569 }
570 
571 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
572                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
573     LIST_ENTRY tree_writes, *le;
574     NTSTATUS Status;
575     traverse_ptr tp;
576     UINT8 level, max_level = 0;
577     chunk* newchunk = NULL;
578 
579     InitializeListHead(&tree_writes);
580 
581     le = items->Flink;
582     while (le != items) {
583         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
584         LIST_ENTRY* le2;
585         chunk* pc;
586 
587         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
588         if (!mr->data) {
589             ERR("out of memory\n");
590             return STATUS_INSUFFICIENT_RESOURCES;
591         }
592 
593         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, TRUE, (UINT8*)mr->data,
594                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, FALSE, NormalPagePriority);
595         if (!NT_SUCCESS(Status)) {
596             ERR("read_data returned %08x\n", Status);
597             return Status;
598         }
599 
600         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
601             mr->system = TRUE;
602 
603         if (data_items && mr->data->level == 0) {
604             le2 = data_items->Flink;
605             while (le2 != data_items) {
606                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
607                 leaf_node* ln = (leaf_node*)&mr->data[1];
608                 UINT16 i;
609 
610                 for (i = 0; i < mr->data->num_items; i++) {
611                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
612                         EXTENT_DATA* ed = (EXTENT_DATA*)((UINT8*)mr->data + sizeof(tree_header) + ln[i].offset);
613 
614                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
615                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
616 
617                             if (ed2->address == dr->address)
618                                 ed2->address = dr->new_address;
619                         }
620                     }
621                 }
622 
623                 le2 = le2->Flink;
624             }
625         }
626 
627         if (mr->data->level > max_level)
628             max_level = mr->data->level;
629 
630         le2 = mr->refs.Flink;
631         while (le2 != &mr->refs) {
632             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
633 
634             if (ref->type == TYPE_TREE_BLOCK_REF) {
635                 KEY* firstitem;
636                 root* r = NULL;
637                 LIST_ENTRY* le3;
638                 tree* t;
639 
640                 firstitem = (KEY*)&mr->data[1];
641 
642                 le3 = Vcb->roots.Flink;
643                 while (le3 != &Vcb->roots) {
644                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
645 
646                     if (r2->id == ref->tbr.offset) {
647                         r = r2;
648                         break;
649                     }
650 
651                     le3 = le3->Flink;
652                 }
653 
654                 if (!r) {
655                     ERR("could not find subvol with id %llx\n", ref->tbr.offset);
656                     return STATUS_INTERNAL_ERROR;
657                 }
658 
659                 Status = find_item_to_level(Vcb, r, &tp, firstitem, FALSE, mr->data->level + 1, NULL);
660                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
661                     ERR("find_item_to_level returned %08x\n", Status);
662                     return Status;
663                 }
664 
665                 t = tp.tree;
666                 while (t && t->header.level < mr->data->level + 1) {
667                     t = t->parent;
668                 }
669 
670                 if (!t)
671                     ref->top = TRUE;
672                 else {
673                     metadata_reloc* mr2;
674 
675                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
676                     if (!NT_SUCCESS(Status)) {
677                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
678                         return Status;
679                     }
680 
681                     ref->parent = mr2;
682                 }
683             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
684                 metadata_reloc* mr2;
685 
686                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
687                 if (!NT_SUCCESS(Status)) {
688                     ERR("add_metadata_reloc_parent returned %08x\n", Status);
689                     return Status;
690                 }
691 
692                 ref->parent = mr2;
693             }
694 
695             le2 = le2->Flink;
696         }
697 
698         le = le->Flink;
699     }
700 
701     le = items->Flink;
702     while (le != items) {
703         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
704         LIST_ENTRY* le2;
705         UINT32 hash;
706 
707         mr->t = NULL;
708 
709         hash = calc_crc32c(0xffffffff, (UINT8*)&mr->address, sizeof(UINT64));
710 
711         le2 = Vcb->trees_ptrs[hash >> 24];
712 
713         if (le2) {
714             while (le2 != &Vcb->trees_hash) {
715                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
716 
717                 if (t->header.address == mr->address) {
718                     mr->t = t;
719                     break;
720                 } else if (t->hash > hash)
721                     break;
722 
723                 le2 = le2->Flink;
724             }
725         }
726 
727         le = le->Flink;
728     }
729 
730     for (level = 0; level <= max_level; level++) {
731         le = items->Flink;
732         while (le != items) {
733             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
734 
735             if (mr->data->level == level) {
736                 BOOL done = FALSE;
737                 LIST_ENTRY* le2;
738                 tree_write* tw;
739                 UINT64 flags;
740                 tree* t3;
741 
742                 if (mr->system)
743                     flags = Vcb->system_flags;
744                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
745                     flags = Vcb->data_flags;
746                 else
747                     flags = Vcb->metadata_flags;
748 
749                 if (newchunk) {
750                     acquire_chunk_lock(newchunk, Vcb);
751 
752                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
753                         newchunk->used += Vcb->superblock.node_size;
754                         space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
755                         done = TRUE;
756                     }
757 
758                     release_chunk_lock(newchunk, Vcb);
759                 }
760 
761                 if (!done) {
762                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
763 
764                     le2 = Vcb->chunks.Flink;
765                     while (le2 != &Vcb->chunks) {
766                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
767 
768                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
769                             acquire_chunk_lock(c2, Vcb);
770 
771                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
772                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
773                                     c2->used += Vcb->superblock.node_size;
774                                     space_list_subtract(c2, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
775                                     release_chunk_lock(c2, Vcb);
776                                     newchunk = c2;
777                                     done = TRUE;
778                                     break;
779                                 }
780                             }
781 
782                             release_chunk_lock(c2, Vcb);
783                         }
784 
785                         le2 = le2->Flink;
786                     }
787 
788                     // allocate new chunk if necessary
789                     if (!done) {
790                         Status = alloc_chunk(Vcb, flags, &newchunk, FALSE);
791 
792                         if (!NT_SUCCESS(Status)) {
793                             ERR("alloc_chunk returned %08x\n", Status);
794                             ExReleaseResourceLite(&Vcb->chunk_lock);
795                             goto end;
796                         }
797 
798                         acquire_chunk_lock(newchunk, Vcb);
799 
800                         newchunk->balance_num = Vcb->balance.balance_num;
801 
802                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
803                             release_chunk_lock(newchunk, Vcb);
804                             ExReleaseResourceLite(&Vcb->chunk_lock);
805                             ERR("could not find address in new chunk\n");
806                             Status = STATUS_DISK_FULL;
807                             goto end;
808                         } else {
809                             newchunk->used += Vcb->superblock.node_size;
810                             space_list_subtract(newchunk, FALSE, mr->new_address, Vcb->superblock.node_size, rollback);
811                         }
812 
813                         release_chunk_lock(newchunk, Vcb);
814                     }
815 
816                     ExReleaseResourceLite(&Vcb->chunk_lock);
817                 }
818 
819                 // update parents
820                 le2 = mr->refs.Flink;
821                 while (le2 != &mr->refs) {
822                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
823 
824                     if (ref->parent) {
825                         UINT16 i;
826                         internal_node* in = (internal_node*)&ref->parent->data[1];
827 
828                         for (i = 0; i < ref->parent->data->num_items; i++) {
829                             if (in[i].address == mr->address) {
830                                 in[i].address = mr->new_address;
831                                 break;
832                             }
833                         }
834 
835                         if (ref->parent->t) {
836                             LIST_ENTRY* le3;
837 
838                             le3 = ref->parent->t->itemlist.Flink;
839                             while (le3 != &ref->parent->t->itemlist) {
840                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
841 
842                                 if (!td->inserted && td->treeholder.address == mr->address)
843                                     td->treeholder.address = mr->new_address;
844 
845                                 le3 = le3->Flink;
846                             }
847                         }
848                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
849                         LIST_ENTRY* le3;
850                         root* r = NULL;
851 
852                         // alter ROOT_ITEM
853 
854                         le3 = Vcb->roots.Flink;
855                         while (le3 != &Vcb->roots) {
856                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
857 
858                             if (r2->id == ref->tbr.offset) {
859                                 r = r2;
860                                 break;
861                             }
862 
863                             le3 = le3->Flink;
864                         }
865 
866                         if (r) {
867                             r->treeholder.address = mr->new_address;
868 
869                             if (r == Vcb->root_root)
870                                 Vcb->superblock.root_tree_addr = mr->new_address;
871                             else if (r == Vcb->chunk_root)
872                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
873                             else if (r->root_item.block_number == mr->address) {
874                                 KEY searchkey;
875                                 ROOT_ITEM* ri;
876 
877                                 r->root_item.block_number = mr->new_address;
878 
879                                 searchkey.obj_id = r->id;
880                                 searchkey.obj_type = TYPE_ROOT_ITEM;
881                                 searchkey.offset = 0xffffffffffffffff;
882 
883                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
884                                 if (!NT_SUCCESS(Status)) {
885                                     ERR("find_item returned %08x\n", Status);
886                                     goto end;
887                                 }
888 
889                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
890                                     ERR("could not find ROOT_ITEM for tree %llx\n", searchkey.obj_id);
891                                     Status = STATUS_INTERNAL_ERROR;
892                                     goto end;
893                                 }
894 
895                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
896                                 if (!ri) {
897                                     ERR("out of memory\n");
898                                     Status = STATUS_INSUFFICIENT_RESOURCES;
899                                     goto end;
900                                 }
901 
902                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
903 
904                                 Status = delete_tree_item(Vcb, &tp);
905                                 if (!NT_SUCCESS(Status)) {
906                                     ERR("delete_tree_item returned %08x\n", Status);
907                                     goto end;
908                                 }
909 
910                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
911                                 if (!NT_SUCCESS(Status)) {
912                                     ERR("insert_tree_item returned %08x\n", Status);
913                                     goto end;
914                                 }
915                             }
916                         }
917                     }
918 
919                     le2 = le2->Flink;
920                 }
921 
922                 mr->data->address = mr->new_address;
923 
924                 t3 = mr->t;
925 
926                 while (t3) {
927                     UINT8 h;
928                     BOOL inserted;
929                     tree* t4 = NULL;
930 
931                     // check if tree loaded more than once
932                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
933                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
934 
935                         if (nt->header.address == t3->header.address)
936                             t4 = nt;
937                     }
938 
939                     t3->header.address = mr->new_address;
940 
941                     h = t3->hash >> 24;
942 
943                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
944                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
945                             Vcb->trees_ptrs[h] = NULL;
946                         else {
947                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
948 
949                             if (t2->hash >> 24 == h)
950                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
951                             else
952                                 Vcb->trees_ptrs[h] = NULL;
953                         }
954                     }
955 
956                     RemoveEntryList(&t3->list_entry_hash);
957 
958                     t3->hash = calc_crc32c(0xffffffff, (UINT8*)&t3->header.address, sizeof(UINT64));
959                     h = t3->hash >> 24;
960 
961                     if (!Vcb->trees_ptrs[h]) {
962                         UINT8 h2 = h;
963 
964                         le2 = Vcb->trees_hash.Flink;
965 
966                         if (h2 > 0) {
967                             h2--;
968                             do {
969                                 if (Vcb->trees_ptrs[h2]) {
970                                     le2 = Vcb->trees_ptrs[h2];
971                                     break;
972                                 }
973 
974                                 h2--;
975                             } while (h2 > 0);
976                         }
977                     } else
978                         le2 = Vcb->trees_ptrs[h];
979 
980                     inserted = FALSE;
981                     while (le2 != &Vcb->trees_hash) {
982                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
983 
984                         if (t2->hash >= t3->hash) {
985                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
986                             inserted = TRUE;
987                             break;
988                         }
989 
990                         le2 = le2->Flink;
991                     }
992 
993                     if (!inserted)
994                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
995 
996                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
997                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
998 
999                     if (data_items && level == 0) {
1000                         le2 = data_items->Flink;
1001 
1002                         while (le2 != data_items) {
1003                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
1004                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1005 
1006                             while (le3 != &t3->itemlist) {
1007                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1008 
1009                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1010                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1011 
1012                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1013                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1014 
1015                                         if (ed2->address == dr->address)
1016                                             ed2->address = dr->new_address;
1017                                     }
1018                                 }
1019 
1020                                 le3 = le3->Flink;
1021                             }
1022 
1023                             le2 = le2->Flink;
1024                         }
1025                     }
1026 
1027                     t3 = t4;
1028                 }
1029 
1030                 *((UINT32*)mr->data) = ~calc_crc32c(0xffffffff, (UINT8*)&mr->data->fs_uuid, Vcb->superblock.node_size - sizeof(mr->data->csum));
1031 
1032                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1033                 if (!tw) {
1034                     ERR("out of memory\n");
1035                     Status = STATUS_INSUFFICIENT_RESOURCES;
1036                     goto end;
1037                 }
1038 
1039                 tw->address = mr->new_address;
1040                 tw->length = Vcb->superblock.node_size;
1041                 tw->data = (UINT8*)mr->data;
1042 
1043                 if (IsListEmpty(&tree_writes))
1044                     InsertTailList(&tree_writes, &tw->list_entry);
1045                 else {
1046                     BOOL inserted = FALSE;
1047 
1048                     le2 = tree_writes.Flink;
1049                     while (le2 != &tree_writes) {
1050                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1051 
1052                         if (tw2->address > tw->address) {
1053                             InsertHeadList(le2->Blink, &tw->list_entry);
1054                             inserted = TRUE;
1055                             break;
1056                         }
1057 
1058                         le2 = le2->Flink;
1059                     }
1060 
1061                     if (!inserted)
1062                         InsertTailList(&tree_writes, &tw->list_entry);
1063                 }
1064             }
1065 
1066             le = le->Flink;
1067         }
1068     }
1069 
1070     Status = do_tree_writes(Vcb, &tree_writes, TRUE);
1071     if (!NT_SUCCESS(Status)) {
1072         ERR("do_tree_writes returned %08x\n", Status);
1073         goto end;
1074     }
1075 
1076     le = items->Flink;
1077     while (le != items) {
1078         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1079 
1080         Status = add_metadata_reloc_extent_item(Vcb, mr);
1081         if (!NT_SUCCESS(Status)) {
1082             ERR("add_metadata_reloc_extent_item returned %08x\n", Status);
1083             goto end;
1084         }
1085 
1086         le = le->Flink;
1087     }
1088 
1089     Status = STATUS_SUCCESS;
1090 
1091 end:
1092     while (!IsListEmpty(&tree_writes)) {
1093         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1094         ExFreePool(tw);
1095     }
1096 
1097     return Status;
1098 }
1099 
1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1101     KEY searchkey;
1102     traverse_ptr tp;
1103     NTSTATUS Status;
1104     BOOL b;
1105     LIST_ENTRY items, rollback;
1106     UINT32 loaded = 0;
1107 
1108     TRACE("chunk %llx\n", c->offset);
1109 
1110     InitializeListHead(&rollback);
1111     InitializeListHead(&items);
1112 
1113     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1114 
1115     searchkey.obj_id = c->offset;
1116     searchkey.obj_type = TYPE_METADATA_ITEM;
1117     searchkey.offset = 0xffffffffffffffff;
1118 
1119     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1120     if (!NT_SUCCESS(Status)) {
1121         ERR("find_item returned %08x\n", Status);
1122         goto end;
1123     }
1124 
1125     do {
1126         traverse_ptr next_tp;
1127 
1128         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129             break;
1130 
1131         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132             BOOL tree = FALSE, skinny = FALSE;
1133 
1134             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135                 tree = TRUE;
1136                 skinny = TRUE;
1137             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1139                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140 
1141                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1142                     tree = TRUE;
1143             }
1144 
1145             if (tree) {
1146                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147 
1148                 if (!NT_SUCCESS(Status)) {
1149                     ERR("add_metadata_reloc returned %08x\n", Status);
1150                     goto end;
1151                 }
1152 
1153                 loaded++;
1154 
1155                 if (loaded >= 64) // only do 64 at a time
1156                     break;
1157             }
1158         }
1159 
1160         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1161 
1162         if (b)
1163             tp = next_tp;
1164     } while (b);
1165 
1166     if (IsListEmpty(&items)) {
1167         *changed = FALSE;
1168         Status = STATUS_SUCCESS;
1169         goto end;
1170     } else
1171         *changed = TRUE;
1172 
1173     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1174     if (!NT_SUCCESS(Status)) {
1175         ERR("write_metadata_items returned %08x\n", Status);
1176         goto end;
1177     }
1178 
1179     Status = STATUS_SUCCESS;
1180 
1181     Vcb->need_write = TRUE;
1182 
1183 end:
1184     if (NT_SUCCESS(Status)) {
1185         Status = do_write(Vcb, NULL);
1186         if (!NT_SUCCESS(Status))
1187             ERR("do_write returned %08x\n", Status);
1188     }
1189 
1190     if (NT_SUCCESS(Status))
1191         clear_rollback(&rollback);
1192     else
1193         do_rollback(Vcb, &rollback);
1194 
1195     free_trees(Vcb);
1196 
1197     ExReleaseResourceLite(&Vcb->tree_lock);
1198 
1199     while (!IsListEmpty(&items)) {
1200         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1201 
1202         while (!IsListEmpty(&mr->refs)) {
1203             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1204 
1205             ExFreePool(ref);
1206         }
1207 
1208         ExFreePool(mr);
1209     }
1210 
1211     return Status;
1212 }
1213 
1214 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1215                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1216     NTSTATUS Status;
1217     LIST_ENTRY* le;
1218     KEY searchkey;
1219     traverse_ptr tp;
1220     root* r = NULL;
1221     metadata_reloc* mr;
1222     UINT64 last_tree = 0;
1223     data_reloc_ref* ref;
1224 
1225     le = Vcb->roots.Flink;
1226     while (le != &Vcb->roots) {
1227         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1228 
1229         if (r2->id == edr->root) {
1230             r = r2;
1231             break;
1232         }
1233 
1234         le = le->Flink;
1235     }
1236 
1237     if (!r) {
1238         ERR("could not find subvol %llx\n", edr->count);
1239         return STATUS_INTERNAL_ERROR;
1240     }
1241 
1242     searchkey.obj_id = edr->objid;
1243     searchkey.obj_type = TYPE_EXTENT_DATA;
1244     searchkey.offset = 0;
1245 
1246     Status = find_item(Vcb, r, &tp, &searchkey, FALSE, NULL);
1247     if (!NT_SUCCESS(Status)) {
1248         ERR("find_item returned %08x\n", Status);
1249         return Status;
1250     }
1251 
1252     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1253         traverse_ptr tp2;
1254 
1255         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1256             tp = tp2;
1257         else {
1258             ERR("could not find EXTENT_DATA for inode %llx in root %llx\n", searchkey.obj_id, r->id);
1259             return STATUS_INTERNAL_ERROR;
1260         }
1261     }
1262 
1263     ref = NULL;
1264 
1265     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1266         traverse_ptr tp2;
1267 
1268         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1269             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1270 
1271             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1272                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1273 
1274                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1275                     if (ref && last_tree == tp.tree->header.address)
1276                         ref->edr.count++;
1277                     else {
1278                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1279                         if (!ref) {
1280                             ERR("out of memory\n");
1281                             return STATUS_INSUFFICIENT_RESOURCES;
1282                         }
1283 
1284                         ref->type = TYPE_EXTENT_DATA_REF;
1285                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1286                         ref->edr.count = 1;
1287 
1288                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1289                         if (!NT_SUCCESS(Status)) {
1290                             ERR("add_metadata_reloc_parent returned %08x\n", Status);
1291                             ExFreePool(ref);
1292                             return Status;
1293                         }
1294 
1295                         last_tree = tp.tree->header.address;
1296                         ref->parent = mr;
1297 
1298                         InsertTailList(&dr->refs, &ref->list_entry);
1299                     }
1300                 }
1301             }
1302         }
1303 
1304         if (find_next_item(Vcb, &tp, &tp2, FALSE, NULL))
1305             tp = tp2;
1306         else
1307             break;
1308     }
1309 
1310     return STATUS_SUCCESS;
1311 }
1312 
1313 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1314                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1315     NTSTATUS Status;
1316     data_reloc* dr;
1317     EXTENT_ITEM* ei;
1318     UINT16 len;
1319     UINT64 inline_rc;
1320     UINT8* ptr;
1321 
1322     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1323     if (!dr) {
1324         ERR("out of memory\n");
1325         return STATUS_INSUFFICIENT_RESOURCES;
1326     }
1327 
1328     dr->address = tp->item->key.obj_id;
1329     dr->size = tp->item->key.offset;
1330     dr->ei = (EXTENT_ITEM*)tp->item->data;
1331     InitializeListHead(&dr->refs);
1332 
1333     Status = delete_tree_item(Vcb, tp);
1334     if (!NT_SUCCESS(Status)) {
1335         ERR("delete_tree_item returned %08x\n", Status);
1336         return Status;
1337     }
1338 
1339     if (!c)
1340         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1341 
1342     if (c) {
1343         acquire_chunk_lock(c, Vcb);
1344 
1345         c->used -= tp->item->key.offset;
1346 
1347         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1348 
1349         release_chunk_lock(c, Vcb);
1350     }
1351 
1352     ei = (EXTENT_ITEM*)tp->item->data;
1353     inline_rc = 0;
1354 
1355     len = tp->item->size - sizeof(EXTENT_ITEM);
1356     ptr = (UINT8*)tp->item->data + sizeof(EXTENT_ITEM);
1357 
1358     while (len > 0) {
1359         UINT8 secttype = *ptr;
1360         UINT16 sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1361 
1362         len--;
1363 
1364         if (sectlen > len) {
1365             ERR("(%llx,%x,%llx): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1366             return STATUS_INTERNAL_ERROR;
1367         }
1368 
1369         if (sectlen == 0) {
1370             ERR("(%llx,%x,%llx): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1371             return STATUS_INTERNAL_ERROR;
1372         }
1373 
1374         if (secttype == TYPE_EXTENT_DATA_REF) {
1375             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(UINT8));
1376 
1377             inline_rc += edr->count;
1378 
1379             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1380             if (!NT_SUCCESS(Status)) {
1381                 ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1382                 return Status;
1383             }
1384         } else if (secttype == TYPE_SHARED_DATA_REF) {
1385             metadata_reloc* mr;
1386             data_reloc_ref* ref;
1387 
1388             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1389             if (!ref) {
1390                 ERR("out of memory\n");
1391                 return STATUS_INSUFFICIENT_RESOURCES;
1392             }
1393 
1394             ref->type = TYPE_SHARED_DATA_REF;
1395             RtlCopyMemory(&ref->sdr, ptr + sizeof(UINT8), sizeof(SHARED_DATA_REF));
1396             inline_rc += ref->sdr.count;
1397 
1398             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1399             if (!NT_SUCCESS(Status)) {
1400                 ERR("add_metadata_reloc_parent returned %08x\n", Status);
1401                 ExFreePool(ref);
1402                 return Status;
1403             }
1404 
1405             ref->parent = mr;
1406 
1407             InsertTailList(&dr->refs, &ref->list_entry);
1408         } else {
1409             ERR("unexpected tree type %x\n", secttype);
1410             return STATUS_INTERNAL_ERROR;
1411         }
1412 
1413 
1414         len -= sectlen;
1415         ptr += sizeof(UINT8) + sectlen;
1416     }
1417 
1418     if (inline_rc < ei->refcount) { // look for non-inline entries
1419         traverse_ptr tp2 = *tp, next_tp;
1420 
1421         while (find_next_item(Vcb, &tp2, &next_tp, FALSE, NULL)) {
1422             tp2 = next_tp;
1423 
1424             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1425                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1426                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1427                     if (!NT_SUCCESS(Status)) {
1428                         ERR("data_reloc_add_tree_edr returned %08x\n", Status);
1429                         return Status;
1430                     }
1431 
1432                     Status = delete_tree_item(Vcb, &tp2);
1433                     if (!NT_SUCCESS(Status)) {
1434                         ERR("delete_tree_item returned %08x\n", Status);
1435                         return Status;
1436                     }
1437                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(UINT32)) {
1438                     metadata_reloc* mr;
1439                     data_reloc_ref* ref;
1440 
1441                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1442                     if (!ref) {
1443                         ERR("out of memory\n");
1444                         return STATUS_INSUFFICIENT_RESOURCES;
1445                     }
1446 
1447                     ref->type = TYPE_SHARED_DATA_REF;
1448                     ref->sdr.offset = tp2.item->key.offset;
1449                     ref->sdr.count = *((UINT32*)tp2.item->data);
1450 
1451                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1452                     if (!NT_SUCCESS(Status)) {
1453                         ERR("add_metadata_reloc_parent returned %08x\n", Status);
1454                         ExFreePool(ref);
1455                         return Status;
1456                     }
1457 
1458                     ref->parent = mr;
1459                     InsertTailList(&dr->refs, &ref->list_entry);
1460 
1461                     Status = delete_tree_item(Vcb, &tp2);
1462                     if (!NT_SUCCESS(Status)) {
1463                         ERR("delete_tree_item returned %08x\n", Status);
1464                         return Status;
1465                     }
1466                 }
1467             } else
1468                 break;
1469         }
1470     }
1471 
1472     InsertTailList(items, &dr->list_entry);
1473 
1474     return STATUS_SUCCESS;
1475 }
1476 
1477 static void sort_data_reloc_refs(data_reloc* dr) {
1478     LIST_ENTRY newlist, *le;
1479 
1480     if (IsListEmpty(&dr->refs))
1481         return;
1482 
1483     // insertion sort
1484 
1485     InitializeListHead(&newlist);
1486 
1487     while (!IsListEmpty(&dr->refs)) {
1488         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1489         BOOL inserted = FALSE;
1490 
1491         if (ref->type == TYPE_EXTENT_DATA_REF)
1492             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1493         else if (ref->type == TYPE_SHARED_DATA_REF)
1494             ref->hash = ref->parent->new_address;
1495 
1496         le = newlist.Flink;
1497         while (le != &newlist) {
1498             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1499 
1500             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1501                 InsertHeadList(le->Blink, &ref->list_entry);
1502                 inserted = TRUE;
1503                 break;
1504             }
1505 
1506             le = le->Flink;
1507         }
1508 
1509         if (!inserted)
1510             InsertTailList(&newlist, &ref->list_entry);
1511     }
1512 
1513     le = newlist.Flink;
1514     while (le != &newlist) {
1515         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1516 
1517         if (le->Flink != &newlist) {
1518             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1519 
1520             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1521                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1522                 RemoveEntryList(&ref2->list_entry);
1523                 ref->edr.count += ref2->edr.count;
1524                 ExFreePool(ref2);
1525                 continue;
1526             }
1527         }
1528 
1529         le = le->Flink;
1530     }
1531 
1532     newlist.Flink->Blink = &dr->refs;
1533     newlist.Blink->Flink = &dr->refs;
1534     dr->refs.Flink = newlist.Flink;
1535     dr->refs.Blink = newlist.Blink;
1536 }
1537 
1538 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1539     NTSTATUS Status;
1540     LIST_ENTRY* le;
1541     UINT64 rc = 0;
1542     UINT16 inline_len;
1543     BOOL all_inline = TRUE;
1544     data_reloc_ref* first_noninline = NULL;
1545     EXTENT_ITEM* ei;
1546     UINT8* ptr;
1547 
1548     inline_len = sizeof(EXTENT_ITEM);
1549 
1550     sort_data_reloc_refs(dr);
1551 
1552     le = dr->refs.Flink;
1553     while (le != &dr->refs) {
1554         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1555         UINT16 extlen = 0;
1556 
1557         if (ref->type == TYPE_EXTENT_DATA_REF) {
1558             extlen += sizeof(EXTENT_DATA_REF);
1559             rc += ref->edr.count;
1560         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1561             extlen += sizeof(SHARED_DATA_REF);
1562             rc++;
1563         }
1564 
1565         if (all_inline) {
1566             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1567                 all_inline = FALSE;
1568                 first_noninline = ref;
1569             } else
1570                 inline_len += extlen + 1;
1571         }
1572 
1573         le = le->Flink;
1574     }
1575 
1576     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1577     if (!ei) {
1578         ERR("out of memory\n");
1579         return STATUS_INSUFFICIENT_RESOURCES;
1580     }
1581 
1582     ei->refcount = rc;
1583     ei->generation = dr->ei->generation;
1584     ei->flags = dr->ei->flags;
1585     ptr = (UINT8*)&ei[1];
1586 
1587     le = dr->refs.Flink;
1588     while (le != &dr->refs) {
1589         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1590 
1591         if (ref == first_noninline)
1592             break;
1593 
1594         *ptr = ref->type;
1595         ptr++;
1596 
1597         if (ref->type == TYPE_EXTENT_DATA_REF) {
1598             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1599 
1600             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1601 
1602             ptr += sizeof(EXTENT_DATA_REF);
1603         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1604             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1605 
1606             sdr->offset = ref->parent->new_address;
1607             sdr->count = ref->sdr.count;
1608 
1609             ptr += sizeof(SHARED_DATA_REF);
1610         }
1611 
1612         le = le->Flink;
1613     }
1614 
1615     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1616     if (!NT_SUCCESS(Status)) {
1617         ERR("insert_tree_item returned %08x\n", Status);
1618         return Status;
1619     }
1620 
1621     if (!all_inline) {
1622         le = &first_noninline->list_entry;
1623 
1624         while (le != &dr->refs) {
1625             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1626 
1627             if (ref->type == TYPE_EXTENT_DATA_REF) {
1628                 EXTENT_DATA_REF* edr;
1629 
1630                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1631                 if (!edr) {
1632                     ERR("out of memory\n");
1633                     return STATUS_INSUFFICIENT_RESOURCES;
1634                 }
1635 
1636                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1637 
1638                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1639                 if (!NT_SUCCESS(Status)) {
1640                     ERR("insert_tree_item returned %08x\n", Status);
1641                     return Status;
1642                 }
1643             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1644                 UINT32* sdr;
1645 
1646                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(UINT32), ALLOC_TAG);
1647                 if (!sdr) {
1648                     ERR("out of memory\n");
1649                     return STATUS_INSUFFICIENT_RESOURCES;
1650                 }
1651 
1652                 *sdr = ref->sdr.count;
1653 
1654                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(UINT32), NULL, NULL);
1655                 if (!NT_SUCCESS(Status)) {
1656                     ERR("insert_tree_item returned %08x\n", Status);
1657                     return Status;
1658                 }
1659             }
1660 
1661             le = le->Flink;
1662         }
1663     }
1664 
1665     return STATUS_SUCCESS;
1666 }
1667 
1668 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, BOOL* changed) {
1669     KEY searchkey;
1670     traverse_ptr tp;
1671     NTSTATUS Status;
1672     BOOL b;
1673     LIST_ENTRY items, metadata_items, rollback, *le;
1674     UINT64 loaded = 0, num_loaded = 0;
1675     chunk* newchunk = NULL;
1676     UINT8* data = NULL;
1677 
1678     TRACE("chunk %llx\n", c->offset);
1679 
1680     InitializeListHead(&rollback);
1681     InitializeListHead(&items);
1682     InitializeListHead(&metadata_items);
1683 
1684     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
1685 
1686     searchkey.obj_id = c->offset;
1687     searchkey.obj_type = TYPE_EXTENT_ITEM;
1688     searchkey.offset = 0xffffffffffffffff;
1689 
1690     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, FALSE, NULL);
1691     if (!NT_SUCCESS(Status)) {
1692         ERR("find_item returned %08x\n", Status);
1693         goto end;
1694     }
1695 
1696     do {
1697         traverse_ptr next_tp;
1698 
1699         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1700             break;
1701 
1702         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1703             BOOL tree = FALSE;
1704 
1705             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1706                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1707 
1708                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1709                     tree = TRUE;
1710             }
1711 
1712             if (!tree) {
1713                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1714 
1715                 if (!NT_SUCCESS(Status)) {
1716                     ERR("add_data_reloc returned %08x\n", Status);
1717                     goto end;
1718                 }
1719 
1720                 loaded += tp.item->key.offset;
1721                 num_loaded++;
1722 
1723                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1724                     break;
1725             }
1726         }
1727 
1728         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
1729 
1730         if (b)
1731             tp = next_tp;
1732     } while (b);
1733 
1734     if (IsListEmpty(&items)) {
1735         *changed = FALSE;
1736         Status = STATUS_SUCCESS;
1737         goto end;
1738     } else
1739         *changed = TRUE;
1740 
1741     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1742     if (!data) {
1743         ERR("out of memory\n");
1744         Status = STATUS_INSUFFICIENT_RESOURCES;
1745         goto end;
1746     }
1747 
1748     le = items.Flink;
1749     while (le != &items) {
1750         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1751         BOOL done = FALSE;
1752         LIST_ENTRY* le2;
1753         UINT32* csum;
1754         RTL_BITMAP bmp;
1755         ULONG* bmparr;
1756         ULONG runlength, index, lastoff;
1757 
1758         if (newchunk) {
1759             acquire_chunk_lock(newchunk, Vcb);
1760 
1761             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1762                 newchunk->used += dr->size;
1763                 space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1764                 done = TRUE;
1765             }
1766 
1767             release_chunk_lock(newchunk, Vcb);
1768         }
1769 
1770         if (!done) {
1771             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
1772 
1773             le2 = Vcb->chunks.Flink;
1774             while (le2 != &Vcb->chunks) {
1775                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1776 
1777                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1778                     acquire_chunk_lock(c2, Vcb);
1779 
1780                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1781                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1782                             c2->used += dr->size;
1783                             space_list_subtract(c2, FALSE, dr->new_address, dr->size, &rollback);
1784                             release_chunk_lock(c2, Vcb);
1785                             newchunk = c2;
1786                             done = TRUE;
1787                             break;
1788                         }
1789                     }
1790 
1791                     release_chunk_lock(c2, Vcb);
1792                 }
1793 
1794                 le2 = le2->Flink;
1795             }
1796 
1797             // allocate new chunk if necessary
1798             if (!done) {
1799                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, FALSE);
1800 
1801                 if (!NT_SUCCESS(Status)) {
1802                     ERR("alloc_chunk returned %08x\n", Status);
1803                     ExReleaseResourceLite(&Vcb->chunk_lock);
1804                     goto end;
1805                 }
1806 
1807                 acquire_chunk_lock(newchunk, Vcb);
1808 
1809                 newchunk->balance_num = Vcb->balance.balance_num;
1810 
1811                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1812                     release_chunk_lock(newchunk, Vcb);
1813                     ExReleaseResourceLite(&Vcb->chunk_lock);
1814                     ERR("could not find address in new chunk\n");
1815                     Status = STATUS_DISK_FULL;
1816                     goto end;
1817                 } else {
1818                     newchunk->used += dr->size;
1819                     space_list_subtract(newchunk, FALSE, dr->new_address, dr->size, &rollback);
1820                 }
1821 
1822                 release_chunk_lock(newchunk, Vcb);
1823             }
1824 
1825             ExReleaseResourceLite(&Vcb->chunk_lock);
1826         }
1827 
1828         dr->newchunk = newchunk;
1829 
1830         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align((dr->size / Vcb->superblock.sector_size) + 1, sizeof(ULONG)), ALLOC_TAG);
1831         if (!bmparr) {
1832             ERR("out of memory\n");
1833             Status = STATUS_INSUFFICIENT_RESOURCES;
1834             goto end;
1835         }
1836 
1837         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(dr->size * sizeof(UINT32) / Vcb->superblock.sector_size), ALLOC_TAG);
1838         if (!csum) {
1839             ERR("out of memory\n");
1840             ExFreePool(bmparr);
1841             Status = STATUS_INSUFFICIENT_RESOURCES;
1842             goto end;
1843         }
1844 
1845         RtlInitializeBitMap(&bmp, bmparr, (ULONG)(dr->size / Vcb->superblock.sector_size));
1846         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1847 
1848         searchkey.obj_id = EXTENT_CSUM_ID;
1849         searchkey.obj_type = TYPE_EXTENT_CSUM;
1850         searchkey.offset = dr->address;
1851 
1852         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, FALSE, NULL);
1853         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1854             ERR("find_item returned %08x\n", Status);
1855             ExFreePool(csum);
1856             ExFreePool(bmparr);
1857             goto end;
1858         }
1859 
1860         if (Status != STATUS_NOT_FOUND) {
1861             do {
1862                 traverse_ptr next_tp;
1863 
1864                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1865                     if (tp.item->key.offset >= dr->address + dr->size)
1866                         break;
1867                     else if (tp.item->size >= sizeof(UINT32) && tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)) >= dr->address) {
1868                         UINT64 cs = max(dr->address, tp.item->key.offset);
1869                         UINT64 ce = min(dr->address + dr->size, tp.item->key.offset + (tp.item->size * Vcb->superblock.sector_size / sizeof(UINT32)));
1870 
1871                         RtlCopyMemory(csum + ((cs - dr->address) / Vcb->superblock.sector_size),
1872                                       tp.item->data + ((cs - tp.item->key.offset) * sizeof(UINT32) / Vcb->superblock.sector_size),
1873                                       (ULONG)((ce - cs) * sizeof(UINT32) / Vcb->superblock.sector_size));
1874 
1875                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
1876 
1877                         if (ce == dr->address + dr->size)
1878                             break;
1879                     }
1880                 }
1881 
1882                 if (find_next_item(Vcb, &tp, &next_tp, FALSE, NULL))
1883                     tp = next_tp;
1884                 else
1885                     break;
1886             } while (TRUE);
1887         }
1888 
1889         lastoff = 0;
1890         runlength = RtlFindFirstRunClear(&bmp, &index);
1891 
1892         while (runlength != 0) {
1893             if (index > lastoff) {
1894                 ULONG off = lastoff;
1895                 ULONG size = index - lastoff;
1896 
1897                 // handle no csum run
1898                 do {
1899                     ULONG rl;
1900 
1901                     if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1902                         rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1903                     else
1904                         rl = size;
1905 
1906                     Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1907                                        c, NULL, NULL, 0, FALSE, NormalPagePriority);
1908                     if (!NT_SUCCESS(Status)) {
1909                         ERR("read_data returned %08x\n", Status);
1910                         ExFreePool(csum);
1911                         ExFreePool(bmparr);
1912                         goto end;
1913                     }
1914 
1915                     Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1916                                                  NULL, newchunk, FALSE, 0, NormalPagePriority);
1917                     if (!NT_SUCCESS(Status)) {
1918                         ERR("write_data_complete returned %08x\n", Status);
1919                         ExFreePool(csum);
1920                         ExFreePool(bmparr);
1921                         goto end;
1922                     }
1923 
1924                     size -= rl;
1925                     off += rl;
1926                 } while (size > 0);
1927             }
1928 
1929             add_checksum_entry(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), runlength, &csum[index], NULL);
1930             add_checksum_entry(Vcb, dr->address + (index * Vcb->superblock.sector_size), runlength, NULL, NULL);
1931 
1932             // handle csum run
1933             do {
1934                 ULONG rl;
1935 
1936                 if (runlength * Vcb->superblock.sector_size > BALANCE_UNIT)
1937                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1938                 else
1939                     rl = runlength;
1940 
1941                 Status = read_data(Vcb, dr->address + (index * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index], FALSE, data,
1942                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1943                 if (!NT_SUCCESS(Status)) {
1944                     ERR("read_data returned %08x\n", Status);
1945                     ExFreePool(csum);
1946                     ExFreePool(bmparr);
1947                     goto end;
1948                 }
1949 
1950                 Status = write_data_complete(Vcb, dr->new_address + (index * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1951                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1952                 if (!NT_SUCCESS(Status)) {
1953                     ERR("write_data_complete returned %08x\n", Status);
1954                     ExFreePool(csum);
1955                     ExFreePool(bmparr);
1956                     goto end;
1957                 }
1958 
1959                 runlength -= rl;
1960                 index += rl;
1961             } while (runlength > 0);
1962 
1963             lastoff = index;
1964             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1965         }
1966 
1967         ExFreePool(csum);
1968         ExFreePool(bmparr);
1969 
1970         // handle final nocsum run
1971         if (lastoff < dr->size / Vcb->superblock.sector_size) {
1972             ULONG off = lastoff;
1973             ULONG size = (ULONG)((dr->size / Vcb->superblock.sector_size) - lastoff);
1974 
1975             do {
1976                 ULONG rl;
1977 
1978                 if (size * Vcb->superblock.sector_size > BALANCE_UNIT)
1979                     rl = BALANCE_UNIT / Vcb->superblock.sector_size;
1980                 else
1981                     rl = size;
1982 
1983                 Status = read_data(Vcb, dr->address + (off * Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, NULL, FALSE, data,
1984                                    c, NULL, NULL, 0, FALSE, NormalPagePriority);
1985                 if (!NT_SUCCESS(Status)) {
1986                     ERR("read_data returned %08x\n", Status);
1987                     goto end;
1988                 }
1989 
1990                 Status = write_data_complete(Vcb, dr->new_address + (off * Vcb->superblock.sector_size), data, rl * Vcb->superblock.sector_size,
1991                                              NULL, newchunk, FALSE, 0, NormalPagePriority);
1992                 if (!NT_SUCCESS(Status)) {
1993                     ERR("write_data_complete returned %08x\n", Status);
1994                     goto end;
1995                 }
1996 
1997                 size -= rl;
1998                 off += rl;
1999             } while (size > 0);
2000         }
2001 
2002         le = le->Flink;
2003     }
2004 
2005     ExFreePool(data);
2006     data = NULL;
2007 
2008     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2009     if (!NT_SUCCESS(Status)) {
2010         ERR("write_metadata_items returned %08x\n", Status);
2011         goto end;
2012     }
2013 
2014     le = items.Flink;
2015     while (le != &items) {
2016         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2017 
2018         Status = add_data_reloc_extent_item(Vcb, dr);
2019         if (!NT_SUCCESS(Status)) {
2020             ERR("add_data_reloc_extent_item returned %08x\n", Status);
2021             goto end;
2022         }
2023 
2024         le = le->Flink;
2025     }
2026 
2027     le = c->changed_extents.Flink;
2028     while (le != &c->changed_extents) {
2029         LIST_ENTRY *le2, *le3;
2030         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2031 
2032         le3 = le->Flink;
2033 
2034         le2 = items.Flink;
2035         while (le2 != &items) {
2036             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2037 
2038             if (ce->address == dr->address) {
2039                 ce->address = dr->new_address;
2040                 RemoveEntryList(&ce->list_entry);
2041                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2042                 break;
2043             }
2044 
2045             le2 = le2->Flink;
2046         }
2047 
2048         le = le3;
2049     }
2050 
2051     Status = STATUS_SUCCESS;
2052 
2053     Vcb->need_write = TRUE;
2054 
2055 end:
2056     if (NT_SUCCESS(Status)) {
2057         // update extents in cache inodes before we flush
2058         le = Vcb->chunks.Flink;
2059         while (le != &Vcb->chunks) {
2060             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2061 
2062             if (c2->cache) {
2063                 LIST_ENTRY* le2;
2064 
2065                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, TRUE);
2066 
2067                 le2 = c2->cache->extents.Flink;
2068                 while (le2 != &c2->cache->extents) {
2069                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2070 
2071                     if (!ext->ignore) {
2072                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2073                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2074 
2075                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2076                                 LIST_ENTRY* le3 = items.Flink;
2077                                 while (le3 != &items) {
2078                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2079 
2080                                     if (ed2->address == dr->address) {
2081                                         ed2->address = dr->new_address;
2082                                         break;
2083                                     }
2084 
2085                                     le3 = le3->Flink;
2086                                 }
2087                             }
2088                         }
2089                     }
2090 
2091                     le2 = le2->Flink;
2092                 }
2093 
2094                 ExReleaseResourceLite(c2->cache->Header.Resource);
2095             }
2096 
2097             le = le->Flink;
2098         }
2099 
2100         Status = do_write(Vcb, NULL);
2101         if (!NT_SUCCESS(Status))
2102             ERR("do_write returned %08x\n", Status);
2103     }
2104 
2105     if (NT_SUCCESS(Status)) {
2106         clear_rollback(&rollback);
2107 
2108         // update open FCBs
2109         // FIXME - speed this up(?)
2110 
2111         le = Vcb->all_fcbs.Flink;
2112         while (le != &Vcb->all_fcbs) {
2113             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2114             LIST_ENTRY* le2;
2115 
2116             ExAcquireResourceExclusiveLite(fcb->Header.Resource, TRUE);
2117 
2118             le2 = fcb->extents.Flink;
2119             while (le2 != &fcb->extents) {
2120                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2121 
2122                 if (!ext->ignore) {
2123                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2124                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2125 
2126                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2127                             LIST_ENTRY* le3 = items.Flink;
2128                             while (le3 != &items) {
2129                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2130 
2131                                 if (ed2->address == dr->address) {
2132                                     ed2->address = dr->new_address;
2133                                     break;
2134                                 }
2135 
2136                                 le3 = le3->Flink;
2137                             }
2138                         }
2139                     }
2140                 }
2141 
2142                 le2 = le2->Flink;
2143             }
2144 
2145             ExReleaseResourceLite(fcb->Header.Resource);
2146 
2147             le = le->Flink;
2148         }
2149     } else
2150         do_rollback(Vcb, &rollback);
2151 
2152     free_trees(Vcb);
2153 
2154     ExReleaseResourceLite(&Vcb->tree_lock);
2155 
2156     if (data)
2157         ExFreePool(data);
2158 
2159     while (!IsListEmpty(&items)) {
2160         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2161 
2162         while (!IsListEmpty(&dr->refs)) {
2163             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2164 
2165             ExFreePool(ref);
2166         }
2167 
2168         ExFreePool(dr);
2169     }
2170 
2171     while (!IsListEmpty(&metadata_items)) {
2172         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2173 
2174         while (!IsListEmpty(&mr->refs)) {
2175             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2176 
2177             ExFreePool(ref);
2178         }
2179 
2180         ExFreePool(mr);
2181     }
2182 
2183     return Status;
2184 }
2185 
2186 static __inline UINT64 get_chunk_dup_type(chunk* c) {
2187     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2188         return BLOCK_FLAG_RAID0;
2189     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2190         return BLOCK_FLAG_RAID1;
2191     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2192         return BLOCK_FLAG_DUPLICATE;
2193     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2194         return BLOCK_FLAG_RAID10;
2195     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2196         return BLOCK_FLAG_RAID5;
2197     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2198         return BLOCK_FLAG_RAID6;
2199     else
2200         return BLOCK_FLAG_SINGLE;
2201 }
2202 
2203 static BOOL should_balance_chunk(device_extension* Vcb, UINT8 sort, chunk* c) {
2204     btrfs_balance_opts* opts;
2205 
2206     opts = &Vcb->balance.opts[sort];
2207 
2208     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2209         return FALSE;
2210 
2211     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2212         UINT64 type = get_chunk_dup_type(c);
2213 
2214         if (!(type & opts->profiles))
2215             return FALSE;
2216     }
2217 
2218     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2219         UINT16 i;
2220         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2221         BOOL b = FALSE;
2222 
2223         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2224             if (cis[i].dev_id == opts->devid) {
2225                 b = TRUE;
2226                 break;
2227             }
2228         }
2229 
2230         if (!b)
2231             return FALSE;
2232     }
2233 
2234     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2235         UINT16 i, factor;
2236         UINT64 physsize;
2237         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2238         BOOL b = FALSE;
2239 
2240         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2241             factor = c->chunk_item->num_stripes;
2242         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2243             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2244         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2245             factor = c->chunk_item->num_stripes - 1;
2246         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2247             factor = c->chunk_item->num_stripes - 2;
2248         else // SINGLE, DUPLICATE, RAID1
2249             factor = 1;
2250 
2251         physsize = c->chunk_item->size / factor;
2252 
2253         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2254             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2255                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2256                 b = TRUE;
2257                 break;
2258             }
2259         }
2260 
2261         if (!b)
2262             return FALSE;
2263     }
2264 
2265     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2266         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2267             return FALSE;
2268     }
2269 
2270     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2271         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2272             return FALSE;
2273     }
2274 
2275     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2276         UINT64 usage = c->used * 100 / c->chunk_item->size;
2277 
2278         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2279         if (c->used > 0 && usage == 0)
2280             usage = 1;
2281 
2282         if (usage < opts->usage_start || usage > opts->usage_end)
2283             return FALSE;
2284     }
2285 
2286     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2287         UINT64 type = get_chunk_dup_type(c);
2288 
2289         if (type == opts->convert)
2290             return FALSE;
2291     }
2292 
2293     return TRUE;
2294 }
2295 
2296 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2297     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2298         args->profiles = opts->profiles;
2299         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2300     }
2301 
2302     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2303         if (args->usage_start == 0) {
2304             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2305             args->usage_start = opts->usage_start;
2306             args->usage_end = opts->usage_end;
2307         } else {
2308             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2309             args->usage = opts->usage_end;
2310         }
2311     }
2312 
2313     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2314         args->devid = opts->devid;
2315         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2316     }
2317 
2318     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2319         args->drange_start = opts->drange_start;
2320         args->drange_end = opts->drange_end;
2321         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2322     }
2323 
2324     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2325         args->vrange_start = opts->vrange_start;
2326         args->vrange_end = opts->vrange_end;
2327         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2328     }
2329 
2330     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2331         args->convert = opts->convert;
2332         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2333 
2334         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2335             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2336     }
2337 
2338     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2339         if (args->limit_start == 0) {
2340             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2341             args->limit_start = (UINT32)opts->limit_start;
2342             args->limit_end = (UINT32)opts->limit_end;
2343         } else {
2344             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2345             args->limit = opts->limit_end;
2346         }
2347     }
2348 
2349     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2350         args->stripes_start = opts->stripes_start;
2351         args->stripes_end = opts->stripes_end;
2352         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2353     }
2354 }
2355 
2356 static NTSTATUS add_balance_item(device_extension* Vcb) {
2357     KEY searchkey;
2358     traverse_ptr tp;
2359     NTSTATUS Status;
2360     BALANCE_ITEM* bi;
2361 
2362     searchkey.obj_id = BALANCE_ITEM_ID;
2363     searchkey.obj_type = TYPE_TEMP_ITEM;
2364     searchkey.offset = 0;
2365 
2366     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2367 
2368     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2369     if (!NT_SUCCESS(Status)) {
2370         ERR("find_item returned %08x\n", Status);
2371         goto end;
2372     }
2373 
2374     if (!keycmp(tp.item->key, searchkey)) {
2375         Status = delete_tree_item(Vcb, &tp);
2376         if (!NT_SUCCESS(Status)) {
2377             ERR("delete_tree_item returned %08x\n", Status);
2378             goto end;
2379         }
2380     }
2381 
2382     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2383     if (!bi) {
2384         ERR("out of memory\n");
2385         Status = STATUS_INSUFFICIENT_RESOURCES;
2386         goto end;
2387     }
2388 
2389     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2390 
2391     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2392         bi->flags |= BALANCE_FLAGS_DATA;
2393         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2394     }
2395 
2396     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2397         bi->flags |= BALANCE_FLAGS_METADATA;
2398         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2399     }
2400 
2401     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2402         bi->flags |= BALANCE_FLAGS_SYSTEM;
2403         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2404     }
2405 
2406     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2407     if (!NT_SUCCESS(Status)) {
2408         ERR("insert_tree_item returned %08x\n", Status);
2409         ExFreePool(bi);
2410         goto end;
2411     }
2412 
2413     Status = STATUS_SUCCESS;
2414 
2415 end:
2416     if (NT_SUCCESS(Status)) {
2417         Status = do_write(Vcb, NULL);
2418         if (!NT_SUCCESS(Status))
2419             ERR("do_write returned %08x\n", Status);
2420     }
2421 
2422     free_trees(Vcb);
2423 
2424     ExReleaseResourceLite(&Vcb->tree_lock);
2425 
2426     return Status;
2427 }
2428 
2429 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2430     KEY searchkey;
2431     traverse_ptr tp;
2432     NTSTATUS Status;
2433 
2434     searchkey.obj_id = BALANCE_ITEM_ID;
2435     searchkey.obj_type = TYPE_TEMP_ITEM;
2436     searchkey.offset = 0;
2437 
2438     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
2439 
2440     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
2441     if (!NT_SUCCESS(Status)) {
2442         ERR("find_item returned %08x\n", Status);
2443         goto end;
2444     }
2445 
2446     if (!keycmp(tp.item->key, searchkey)) {
2447         Status = delete_tree_item(Vcb, &tp);
2448         if (!NT_SUCCESS(Status)) {
2449             ERR("delete_tree_item returned %08x\n", Status);
2450             goto end;
2451         }
2452 
2453         Status = do_write(Vcb, NULL);
2454         if (!NT_SUCCESS(Status)) {
2455             ERR("do_write returned %08x\n", Status);
2456             goto end;
2457         }
2458 
2459         free_trees(Vcb);
2460     }
2461 
2462     Status = STATUS_SUCCESS;
2463 
2464 end:
2465     ExReleaseResourceLite(&Vcb->tree_lock);
2466 
2467     return Status;
2468 }
2469 
2470 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2471     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2472 
2473     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2474         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2475         opts->profiles = args->profiles;
2476     }
2477 
2478     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2479         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2480 
2481         opts->usage_start = 0;
2482         opts->usage_end = (UINT8)args->usage;
2483     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2484         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2485 
2486         opts->usage_start = (UINT8)args->usage_start;
2487         opts->usage_end = (UINT8)args->usage_end;
2488     }
2489 
2490     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2491         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2492         opts->devid = args->devid;
2493     }
2494 
2495     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2496         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2497         opts->drange_start = args->drange_start;
2498         opts->drange_end = args->drange_end;
2499     }
2500 
2501     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2502         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2503         opts->vrange_start = args->vrange_start;
2504         opts->vrange_end = args->vrange_end;
2505     }
2506 
2507     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2508         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2509 
2510         opts->limit_start = 0;
2511         opts->limit_end = args->limit;
2512     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2513         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2514 
2515         opts->limit_start = args->limit_start;
2516         opts->limit_end = args->limit_end;
2517     }
2518 
2519     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2520         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2521 
2522         opts->stripes_start = (UINT16)args->stripes_start;
2523         opts->stripes_end = (UINT16)args->stripes_end;
2524     }
2525 
2526     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2527         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2528         opts->convert = args->convert;
2529 
2530         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2531             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2532     }
2533 }
2534 
2535 static NTSTATUS remove_superblocks(device* dev) {
2536     NTSTATUS Status;
2537     superblock* sb;
2538     int i = 0;
2539 
2540     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2541     if (!sb) {
2542         ERR("out of memory\n");
2543         return STATUS_INSUFFICIENT_RESOURCES;
2544     }
2545 
2546     RtlZeroMemory(sb, sizeof(superblock));
2547 
2548     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2549         Status = write_data_phys(dev->devobj, superblock_addrs[i], sb, sizeof(superblock));
2550 
2551         if (!NT_SUCCESS(Status)) {
2552             ExFreePool(sb);
2553             return Status;
2554         }
2555 
2556         i++;
2557     }
2558 
2559     ExFreePool(sb);
2560 
2561     return STATUS_SUCCESS;
2562 }
2563 
2564 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2565     KEY searchkey;
2566     traverse_ptr tp;
2567     NTSTATUS Status;
2568     LIST_ENTRY* le;
2569     volume_device_extension* vde;
2570 
2571     if (Vcb->need_write) {
2572         Status = do_write(Vcb, NULL);
2573 
2574         if (!NT_SUCCESS(Status))
2575             ERR("do_write returned %08x\n", Status);
2576     } else
2577         Status = STATUS_SUCCESS;
2578 
2579     free_trees(Vcb);
2580 
2581     if (!NT_SUCCESS(Status))
2582         return Status;
2583 
2584     // remove entry in chunk tree
2585 
2586     searchkey.obj_id = 1;
2587     searchkey.obj_type = TYPE_DEV_ITEM;
2588     searchkey.offset = dev->devitem.dev_id;
2589 
2590     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, FALSE, NULL);
2591     if (!NT_SUCCESS(Status)) {
2592         ERR("find_item returned %08x\n", Status);
2593         return Status;
2594     }
2595 
2596     if (!keycmp(searchkey, tp.item->key)) {
2597         Status = delete_tree_item(Vcb, &tp);
2598 
2599         if (!NT_SUCCESS(Status)) {
2600             ERR("delete_tree_item returned %08x\n", Status);
2601             return Status;
2602         }
2603     }
2604 
2605     // remove stats entry in device tree
2606 
2607     searchkey.obj_id = 0;
2608     searchkey.obj_type = TYPE_DEV_STATS;
2609     searchkey.offset = dev->devitem.dev_id;
2610 
2611     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2612     if (!NT_SUCCESS(Status)) {
2613         ERR("find_item returned %08x\n", Status);
2614         return Status;
2615     }
2616 
2617     if (!keycmp(searchkey, tp.item->key)) {
2618         Status = delete_tree_item(Vcb, &tp);
2619 
2620         if (!NT_SUCCESS(Status)) {
2621             ERR("delete_tree_item returned %08x\n", Status);
2622             return Status;
2623         }
2624     }
2625 
2626     // update superblock
2627 
2628     Vcb->superblock.num_devices--;
2629     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2630     Vcb->devices_loaded--;
2631 
2632     RemoveEntryList(&dev->list_entry);
2633 
2634     // flush
2635 
2636     Status = do_write(Vcb, NULL);
2637     if (!NT_SUCCESS(Status))
2638         ERR("do_write returned %08x\n", Status);
2639 
2640     free_trees(Vcb);
2641 
2642     if (!NT_SUCCESS(Status))
2643         return Status;
2644 
2645     if (!dev->readonly && dev->devobj) {
2646         Status = remove_superblocks(dev);
2647         if (!NT_SUCCESS(Status))
2648             WARN("remove_superblocks returned %08x\n", Status);
2649     }
2650 
2651     // remove entry in volume list
2652 
2653     vde = Vcb->vde;
2654 
2655     if (dev->devobj) {
2656         pdo_device_extension* pdode = vde->pdode;
2657 
2658         ExAcquireResourceExclusiveLite(&pdode->child_lock, TRUE);
2659 
2660         le = pdode->children.Flink;
2661         while (le != &pdode->children) {
2662             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2663 
2664             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2665                 PFILE_OBJECT FileObject;
2666                 PDEVICE_OBJECT mountmgr;
2667                 UNICODE_STRING mmdevpath;
2668 
2669                 pdode->children_loaded--;
2670 
2671                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2672                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2673                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2674                     if (!NT_SUCCESS(Status))
2675                         ERR("IoGetDeviceObjectPointer returned %08x\n", Status);
2676                     else {
2677                         MOUNTDEV_NAME mdn;
2678 
2679                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), TRUE, NULL);
2680                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2681                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2682                         else {
2683                             MOUNTDEV_NAME* mdn2;
2684                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2685 
2686                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2687                             if (!mdn2)
2688                                 ERR("out of memory\n");
2689                             else {
2690                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, TRUE, NULL);
2691                                 if (!NT_SUCCESS(Status))
2692                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08x\n", Status);
2693                                 else {
2694                                     UNICODE_STRING name;
2695 
2696                                     name.Buffer = mdn2->Name;
2697                                     name.Length = name.MaximumLength = mdn2->NameLength;
2698 
2699                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2700                                     if (!NT_SUCCESS(Status))
2701                                         WARN("mountmgr_add_drive_letter returned %08x\n", Status);
2702                                 }
2703 
2704                                 ExFreePool(mdn2);
2705                             }
2706                         }
2707 
2708                         ObDereferenceObject(FileObject);
2709                     }
2710                 }
2711 
2712                 ExFreePool(vc->pnp_name.Buffer);
2713                 RemoveEntryList(&vc->list_entry);
2714                 ExFreePool(vc);
2715 
2716                 ObDereferenceObject(vc->fileobj);
2717 
2718                 break;
2719             }
2720 
2721             le = le->Flink;
2722         }
2723 
2724         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2725             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2726 
2727             le = pdode->children.Flink;
2728             while (le != &pdode->children) {
2729                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2730 
2731                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2732                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2733                     break;
2734                 }
2735 
2736                 le = le->Flink;
2737             }
2738         }
2739 
2740         pdode->num_children = Vcb->superblock.num_devices;
2741 
2742         ExReleaseResourceLite(&pdode->child_lock);
2743 
2744         // free dev
2745 
2746         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2747             trim_whole_device(dev);
2748     }
2749 
2750     while (!IsListEmpty(&dev->space)) {
2751         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2752         space* s = CONTAINING_RECORD(le2, space, list_entry);
2753 
2754         ExFreePool(s);
2755     }
2756 
2757     ExFreePool(dev);
2758 
2759     if (Vcb->trim) {
2760         Vcb->trim = FALSE;
2761 
2762         le = Vcb->devices.Flink;
2763         while (le != &Vcb->devices) {
2764             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2765 
2766             if (dev2->trim) {
2767                 Vcb->trim = TRUE;
2768                 break;
2769             }
2770 
2771             le = le->Flink;
2772         }
2773     }
2774 
2775     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2776 
2777     return STATUS_SUCCESS;
2778 }
2779 
2780 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2781     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2782     DEVICE_DATA_SET_RANGE* ranges;
2783     ULONG datalen, i;
2784     KEY searchkey;
2785     traverse_ptr tp;
2786     NTSTATUS Status;
2787     BOOL b;
2788     UINT64 lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2789     LIST_ENTRY* le;
2790 
2791     dev->num_trim_entries = 0;
2792 
2793     searchkey.obj_id = dev->devitem.dev_id;
2794     searchkey.obj_type = TYPE_DEV_EXTENT;
2795     searchkey.offset = 0;
2796 
2797     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, FALSE, NULL);
2798     if (!NT_SUCCESS(Status)) {
2799         ERR("find_item returned %08x\n", Status);
2800         return;
2801     }
2802 
2803     do {
2804         traverse_ptr next_tp;
2805 
2806         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2807             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2808                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2809 
2810                 if (tp.item->key.offset > lastoff)
2811                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2812 
2813                 lastoff = tp.item->key.offset + de->length;
2814             } else {
2815                 ERR("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2816                 return;
2817             }
2818         }
2819 
2820         b = find_next_item(Vcb, &tp, &next_tp, FALSE, NULL);
2821 
2822         if (b) {
2823             tp = next_tp;
2824             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2825                 break;
2826         }
2827     } while (b);
2828 
2829     if (lastoff < dev->devitem.num_bytes)
2830         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2831 
2832     if (dev->num_trim_entries == 0)
2833         return;
2834 
2835     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2836 
2837     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2838     if (!dmdsa) {
2839         ERR("out of memory\n");
2840         goto end;
2841     }
2842 
2843     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2844     dmdsa->Action = DeviceDsmAction_Trim;
2845     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2846     dmdsa->ParameterBlockOffset = 0;
2847     dmdsa->ParameterBlockLength = 0;
2848     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(UINT64));
2849     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2850 
2851     ranges = (DEVICE_DATA_SET_RANGE*)((UINT8*)dmdsa + dmdsa->DataSetRangesOffset);
2852 
2853     i = 0;
2854     le = dev->trim_list.Flink;
2855     while (le != &dev->trim_list) {
2856         space* s = CONTAINING_RECORD(le, space, list_entry);
2857 
2858         ranges[i].StartingOffset = s->address;
2859         ranges[i].LengthInBytes = s->size;
2860         i++;
2861 
2862         le = le->Flink;
2863     }
2864 
2865     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, TRUE, NULL);
2866     if (!NT_SUCCESS(Status))
2867         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08x\n", Status);
2868 
2869     ExFreePool(dmdsa);
2870 
2871 end:
2872     while (!IsListEmpty(&dev->trim_list)) {
2873         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2874         ExFreePool(s);
2875     }
2876 
2877     dev->num_trim_entries = 0;
2878 }
2879 
2880 static NTSTATUS try_consolidation(device_extension* Vcb, UINT64 flags, chunk** newchunk) {
2881     NTSTATUS Status;
2882     BOOL changed;
2883     LIST_ENTRY* le;
2884     chunk* rc;
2885 
2886     // FIXME - allow with metadata chunks?
2887 
2888     while (TRUE) {
2889         rc = NULL;
2890 
2891         ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
2892 
2893         ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
2894 
2895         // choose the least-used chunk we haven't looked at yet
2896         le = Vcb->chunks.Flink;
2897         while (le != &Vcb->chunks) {
2898             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2899 
2900             // FIXME - skip full-size chunks over e.g. 90% full?
2901             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2902                 rc = c;
2903 
2904             le = le->Flink;
2905         }
2906 
2907         ExReleaseResourceLite(&Vcb->chunk_lock);
2908 
2909         if (!rc) {
2910             ExReleaseResourceLite(&Vcb->tree_lock);
2911             break;
2912         }
2913 
2914         if (rc->list_entry_balance.Flink) {
2915             RemoveEntryList(&rc->list_entry_balance);
2916             Vcb->balance.chunks_left--;
2917         }
2918 
2919         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2920         rc->reloc = TRUE;
2921 
2922         ExReleaseResourceLite(&Vcb->tree_lock);
2923 
2924         do {
2925             changed = FALSE;
2926 
2927             Status = balance_data_chunk(Vcb, rc, &changed);
2928             if (!NT_SUCCESS(Status)) {
2929                 ERR("balance_data_chunk returned %08x\n", Status);
2930                 Vcb->balance.status = Status;
2931                 rc->list_entry_balance.Flink = NULL;
2932                 rc->reloc = FALSE;
2933                 return Status;
2934             }
2935 
2936             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
2937 
2938             if (Vcb->readonly)
2939                 Vcb->balance.stopping = TRUE;
2940 
2941             if (Vcb->balance.stopping)
2942                 return STATUS_SUCCESS;
2943         } while (changed);
2944 
2945         rc->list_entry_balance.Flink = NULL;
2946 
2947         rc->changed = TRUE;
2948         rc->space_changed = TRUE;
2949         rc->balance_num = Vcb->balance.balance_num;
2950 
2951         Status = do_write(Vcb, NULL);
2952         if (!NT_SUCCESS(Status)) {
2953             ERR("do_write returned %08x\n", Status);
2954             return Status;
2955         }
2956 
2957         free_trees(Vcb);
2958     }
2959 
2960     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
2961 
2962     Status = alloc_chunk(Vcb, flags, &rc, TRUE);
2963 
2964     ExReleaseResourceLite(&Vcb->chunk_lock);
2965 
2966     if (NT_SUCCESS(Status)) {
2967         *newchunk = rc;
2968         return Status;
2969     } else {
2970         ERR("alloc_chunk returned %08x\n", Status);
2971         return Status;
2972     }
2973 }
2974 
2975 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2976     LIST_ENTRY* le;
2977 
2978     while (!IsListEmpty(&dev->space)) {
2979         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
2980 
2981         ExFreePool(s);
2982     }
2983 
2984     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
2985 
2986     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
2987 
2988     le = Vcb->chunks.Flink;
2989     while (le != &Vcb->chunks) {
2990         UINT16 n;
2991         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2992         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2993 
2994         for (n = 0; n < c->chunk_item->num_stripes; n++) {
2995             UINT64 stripe_size = 0;
2996 
2997             if (cis[n].dev_id == dev->devitem.dev_id) {
2998                 if (stripe_size == 0) {
2999                     UINT16 factor;
3000 
3001                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3002                         factor = c->chunk_item->num_stripes;
3003                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3004                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3005                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3006                         factor = c->chunk_item->num_stripes - 1;
3007                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3008                         factor = c->chunk_item->num_stripes - 2;
3009                     else // SINGLE, DUP, RAID1
3010                         factor = 1;
3011 
3012                     stripe_size = c->chunk_item->size / factor;
3013                 }
3014 
3015                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3016             }
3017         }
3018 
3019         le = le->Flink;
3020     }
3021 
3022     return STATUS_SUCCESS;
3023 }
3024 
3025 _Function_class_(KSTART_ROUTINE)
3026 #ifndef __REACTOS__
3027 void balance_thread(void* context) {
3028 #else
3029 void NTAPI balance_thread(void* context) {
3030 #endif
3031     device_extension* Vcb = (device_extension*)context;
3032     LIST_ENTRY chunks;
3033     LIST_ENTRY* le;
3034     UINT64 num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3035     UINT64 old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3036     NTSTATUS Status;
3037 
3038     Vcb->balance.balance_num++;
3039 
3040     Vcb->balance.stopping = FALSE;
3041     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, FALSE);
3042 
3043     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3044         old_data_flags = Vcb->data_flags;
3045         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3046 
3047         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3048     }
3049 
3050     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3051         old_metadata_flags = Vcb->metadata_flags;
3052         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3053     }
3054 
3055     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3056         old_system_flags = Vcb->system_flags;
3057         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3058     }
3059 
3060     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3061         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3062             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3063         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3064             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3065     }
3066 
3067     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3068     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3069 
3070     InitializeListHead(&chunks);
3071 
3072     // FIXME - what are we supposed to do with limit_start?
3073 
3074     if (!Vcb->readonly) {
3075         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3076             Status = add_balance_item(Vcb);
3077             if (!NT_SUCCESS(Status)) {
3078                 ERR("add_balance_item returned %08x\n", Status);
3079                 Vcb->balance.status = Status;
3080                 goto end;
3081             }
3082         } else {
3083             if (Vcb->need_write) {
3084                 Status = do_write(Vcb, NULL);
3085 
3086                 free_trees(Vcb);
3087 
3088                 if (!NT_SUCCESS(Status)) {
3089                     ERR("do_write returned %08x\n", Status);
3090                     Vcb->balance.status = Status;
3091                     goto end;
3092                 }
3093             }
3094         }
3095     }
3096 
3097     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3098 
3099     if (Vcb->balance.stopping)
3100         goto end;
3101 
3102     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3103 
3104     le = Vcb->chunks.Flink;
3105     while (le != &Vcb->chunks) {
3106         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3107         UINT8 sort;
3108 
3109         acquire_chunk_lock(c, Vcb);
3110 
3111         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3112             sort = BALANCE_OPTS_DATA;
3113         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3114             sort = BALANCE_OPTS_METADATA;
3115         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3116             sort = BALANCE_OPTS_SYSTEM;
3117         else {
3118             ERR("unexpected chunk type %llx\n", c->chunk_item->type);
3119             release_chunk_lock(c, Vcb);
3120             break;
3121         }
3122 
3123         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3124             should_balance_chunk(Vcb, sort, c)) {
3125             InsertTailList(&chunks, &c->list_entry_balance);
3126 
3127             num_chunks[sort]++;
3128             Vcb->balance.total_chunks++;
3129             Vcb->balance.chunks_left++;
3130         } else if (sort == BALANCE_OPTS_METADATA)
3131             okay_metadata_chunks++;
3132         else if (sort == BALANCE_OPTS_DATA)
3133             okay_data_chunks++;
3134         else if (sort == BALANCE_OPTS_SYSTEM)
3135             okay_system_chunks++;
3136 
3137         if (!c->cache_loaded) {
3138             Status = load_cache_chunk(Vcb, c, NULL);
3139 
3140             if (!NT_SUCCESS(Status)) {
3141                 ERR("load_cache_chunk returned %08x\n", Status);
3142                 Vcb->balance.status = Status;
3143                 release_chunk_lock(c, Vcb);
3144                 ExReleaseResourceLite(&Vcb->chunk_lock);
3145                 goto end;
3146             }
3147         }
3148 
3149         release_chunk_lock(c, Vcb);
3150 
3151         le = le->Flink;
3152     }
3153 
3154     ExReleaseResourceLite(&Vcb->chunk_lock);
3155 
3156     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3157     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3158         BOOL consolidated = FALSE;
3159         chunk* c;
3160 
3161         if (okay_metadata_chunks == 0) {
3162             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3163 
3164             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, TRUE);
3165             if (NT_SUCCESS(Status))
3166                 c->balance_num = Vcb->balance.balance_num;
3167             else if (Status != STATUS_DISK_FULL || consolidated) {
3168                 ERR("alloc_chunk returned %08x\n", Status);
3169                 ExReleaseResourceLite(&Vcb->chunk_lock);
3170                 Vcb->balance.status = Status;
3171                 goto end;
3172             }
3173 
3174             ExReleaseResourceLite(&Vcb->chunk_lock);
3175 
3176             if (Status == STATUS_DISK_FULL) {
3177                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3178                 if (!NT_SUCCESS(Status)) {
3179                     ERR("try_consolidation returned %08x\n", Status);
3180                     Vcb->balance.status = Status;
3181                     goto end;
3182                 } else
3183                     c->balance_num = Vcb->balance.balance_num;
3184 
3185                 consolidated = TRUE;
3186 
3187                 if (Vcb->balance.stopping)
3188                     goto end;
3189             }
3190         }
3191 
3192         if (okay_data_chunks == 0) {
3193             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3194 
3195             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, TRUE);
3196             if (NT_SUCCESS(Status))
3197                 c->balance_num = Vcb->balance.balance_num;
3198             else if (Status != STATUS_DISK_FULL || consolidated) {
3199                 ERR("alloc_chunk returned %08x\n", Status);
3200                 ExReleaseResourceLite(&Vcb->chunk_lock);
3201                 Vcb->balance.status = Status;
3202                 goto end;
3203             }
3204 
3205             ExReleaseResourceLite(&Vcb->chunk_lock);
3206 
3207             if (Status == STATUS_DISK_FULL) {
3208                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3209                 if (!NT_SUCCESS(Status)) {
3210                     ERR("try_consolidation returned %08x\n", Status);
3211                     Vcb->balance.status = Status;
3212                     goto end;
3213                 } else
3214                     c->balance_num = Vcb->balance.balance_num;
3215 
3216                 consolidated = TRUE;
3217 
3218                 if (Vcb->balance.stopping)
3219                     goto end;
3220             }
3221         }
3222 
3223         if (okay_system_chunks == 0) {
3224             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, TRUE);
3225 
3226             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, TRUE);
3227             if (NT_SUCCESS(Status))
3228                 c->balance_num = Vcb->balance.balance_num;
3229             else if (Status != STATUS_DISK_FULL || consolidated) {
3230                 ERR("alloc_chunk returned %08x\n", Status);
3231                 ExReleaseResourceLite(&Vcb->chunk_lock);
3232                 Vcb->balance.status = Status;
3233                 goto end;
3234             }
3235 
3236             ExReleaseResourceLite(&Vcb->chunk_lock);
3237 
3238             if (Status == STATUS_DISK_FULL) {
3239                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3240                 if (!NT_SUCCESS(Status)) {
3241                     ERR("try_consolidation returned %08x\n", Status);
3242                     Vcb->balance.status = Status;
3243                     goto end;
3244                 } else
3245                     c->balance_num = Vcb->balance.balance_num;
3246 
3247                 consolidated = TRUE;
3248 
3249                 if (Vcb->balance.stopping)
3250                     goto end;
3251             }
3252         }
3253     }
3254 
3255     ExAcquireResourceSharedLite(&Vcb->chunk_lock, TRUE);
3256 
3257     le = chunks.Flink;
3258     while (le != &chunks) {
3259         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3260 
3261         c->reloc = TRUE;
3262 
3263         le = le->Flink;
3264     }
3265 
3266     ExReleaseResourceLite(&Vcb->chunk_lock);
3267 
3268     // do data chunks before metadata
3269     le = chunks.Flink;
3270     while (le != &chunks) {
3271         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3272         LIST_ENTRY* le2 = le->Flink;
3273 
3274         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3275             BOOL changed;
3276 
3277             do {
3278                 changed = FALSE;
3279 
3280                 Status = balance_data_chunk(Vcb, c, &changed);
3281                 if (!NT_SUCCESS(Status)) {
3282                     ERR("balance_data_chunk returned %08x\n", Status);
3283                     Vcb->balance.status = Status;
3284                     goto end;
3285                 }
3286 
3287                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3288 
3289                 if (Vcb->readonly)
3290                     Vcb->balance.stopping = TRUE;
3291 
3292                 if (Vcb->balance.stopping)
3293                     break;
3294             } while (changed);
3295 
3296             c->changed = TRUE;
3297             c->space_changed = TRUE;
3298         }
3299 
3300         if (Vcb->balance.stopping)
3301             goto end;
3302 
3303         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3304             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3305             RemoveEntryList(&c->list_entry_balance);
3306             c->list_entry_balance.Flink = NULL;
3307 
3308             Vcb->balance.chunks_left--;
3309         }
3310 
3311         le = le2;
3312     }
3313 
3314     // do metadata chunks
3315     while (!IsListEmpty(&chunks)) {
3316         chunk* c;
3317         BOOL changed;
3318 
3319         le = RemoveHeadList(&chunks);
3320         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3321 
3322         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3323             do {
3324                 Status = balance_metadata_chunk(Vcb, c, &changed);
3325                 if (!NT_SUCCESS(Status)) {
3326                     ERR("balance_metadata_chunk returned %08x\n", Status);
3327                     Vcb->balance.status = Status;
3328                     goto end;
3329                 }
3330 
3331                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, FALSE, NULL);
3332 
3333                 if (Vcb->readonly)
3334                     Vcb->balance.stopping = TRUE;
3335 
3336                 if (Vcb->balance.stopping)
3337                     break;
3338             } while (changed);
3339 
3340             c->changed = TRUE;
3341             c->space_changed = TRUE;
3342         }
3343 
3344         if (Vcb->balance.stopping)
3345             break;
3346 
3347         c->list_entry_balance.Flink = NULL;
3348 
3349         Vcb->balance.chunks_left--;
3350     }
3351 
3352 end:
3353     if (!Vcb->readonly) {
3354         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3355             le = chunks.Flink;
3356             while (le != &chunks) {
3357                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3358                 c->reloc = FALSE;
3359 
3360                 le = le->Flink;
3361                 c->list_entry_balance.Flink = NULL;
3362             }
3363 
3364             if (old_data_flags != 0)
3365                 Vcb->data_flags = old_data_flags;
3366 
3367             if (old_metadata_flags != 0)
3368                 Vcb->metadata_flags = old_metadata_flags;
3369 
3370             if (old_system_flags != 0)
3371                 Vcb->system_flags = old_system_flags;
3372         }
3373 
3374         if (Vcb->balance.removing) {
3375             device* dev = NULL;
3376 
3377             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3378 
3379             le = Vcb->devices.Flink;
3380             while (le != &Vcb->devices) {
3381                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3382 
3383                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3384                     dev = dev2;
3385                     break;
3386                 }
3387 
3388                 le = le->Flink;
3389             }
3390 
3391             if (dev) {
3392                 if (Vcb->balance.chunks_left == 0) {
3393                     Status = finish_removing_device(Vcb, dev);
3394 
3395                     if (!NT_SUCCESS(Status)) {
3396                         ERR("finish_removing_device returned %08x\n", Status);
3397                         dev->reloc = FALSE;
3398                     }
3399                 } else
3400                     dev->reloc = FALSE;
3401             }
3402 
3403             ExReleaseResourceLite(&Vcb->tree_lock);
3404         } else if (Vcb->balance.shrinking) {
3405             device* dev = NULL;
3406 
3407             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3408 
3409             le = Vcb->devices.Flink;
3410             while (le != &Vcb->devices) {
3411                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3412 
3413                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3414                     dev = dev2;
3415                     break;
3416                 }
3417 
3418                 le = le->Flink;
3419             }
3420 
3421             if (!dev) {
3422                 ERR("could not find device %llx\n", Vcb->balance.opts[0].devid);
3423                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3424             }
3425 
3426             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3427                 if (dev) {
3428                     Status = regenerate_space_list(Vcb, dev);
3429                     if (!NT_SUCCESS(Status))
3430                         WARN("regenerate_space_list returned %08x\n", Status);
3431                 }
3432             } else {
3433                 UINT64 old_size;
3434 
3435                 old_size = dev->devitem.num_bytes;
3436                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3437 
3438                 Status = update_dev_item(Vcb, dev, NULL);
3439                 if (!NT_SUCCESS(Status)) {
3440                     ERR("update_dev_item returned %08x\n", Status);
3441                     dev->devitem.num_bytes = old_size;
3442                     Vcb->balance.status = Status;
3443 
3444                     Status = regenerate_space_list(Vcb, dev);
3445                     if (!NT_SUCCESS(Status))
3446                         WARN("regenerate_space_list returned %08x\n", Status);
3447                 } else {
3448                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3449 
3450                     Status = do_write(Vcb, NULL);
3451                     if (!NT_SUCCESS(Status))
3452                         ERR("do_write returned %08x\n", Status);
3453 
3454                     free_trees(Vcb);
3455                 }
3456             }
3457 
3458             ExReleaseResourceLite(&Vcb->tree_lock);
3459 
3460             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3461                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3462         } else {
3463             Status = remove_balance_item(Vcb);
3464             if (!NT_SUCCESS(Status)) {
3465                 ERR("remove_balance_item returned %08x\n", Status);
3466                 goto end;
3467             }
3468         }
3469 
3470         if (Vcb->trim && !Vcb->options.no_trim) {
3471             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, TRUE);
3472 
3473             le = Vcb->devices.Flink;
3474             while (le != &Vcb->devices) {
3475                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3476 
3477                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3478                     trim_unalloc_space(Vcb, dev2);
3479 
3480                 le = le->Flink;
3481             }
3482 
3483             ExReleaseResourceLite(&Vcb->tree_lock);
3484         }
3485     }
3486 
3487     ZwClose(Vcb->balance.thread);
3488     Vcb->balance.thread = NULL;
3489 
3490     KeSetEvent(&Vcb->balance.finished, 0, FALSE);
3491 }
3492 
3493 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3494     NTSTATUS Status;
3495     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3496     UINT8 i;
3497 
3498     if (length < sizeof(btrfs_start_balance) || !data)
3499         return STATUS_INVALID_PARAMETER;
3500 
3501     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3502         return STATUS_PRIVILEGE_NOT_HELD;
3503 
3504     if (Vcb->locked) {
3505         WARN("cannot start balance while locked\n");
3506         return STATUS_DEVICE_NOT_READY;
3507     }
3508 
3509     if (Vcb->scrub.thread) {
3510         WARN("cannot start balance while scrub running\n");
3511         return STATUS_DEVICE_NOT_READY;
3512     }
3513 
3514     if (Vcb->balance.thread) {
3515         WARN("balance already running\n");
3516         return STATUS_DEVICE_NOT_READY;
3517     }
3518 
3519     if (Vcb->readonly)
3520         return STATUS_MEDIA_WRITE_PROTECTED;
3521 
3522     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3523         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3524         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3525         return STATUS_SUCCESS;
3526 
3527     for (i = 0; i < 3; i++) {
3528         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3529             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3530                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3531                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE;
3532 
3533                 if (bsb->opts[i].profiles == 0)
3534                     return STATUS_INVALID_PARAMETER;
3535             }
3536 
3537             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3538                 if (bsb->opts[i].devid == 0)
3539                     return STATUS_INVALID_PARAMETER;
3540             }
3541 
3542             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3543                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3544                     return STATUS_INVALID_PARAMETER;
3545             }
3546 
3547             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3548                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3549                     return STATUS_INVALID_PARAMETER;
3550             }
3551 
3552             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3553                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3554                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3555 
3556                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3557                     return STATUS_INVALID_PARAMETER;
3558             }
3559 
3560             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3561                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3562                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3563 
3564                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3565                     return STATUS_INVALID_PARAMETER;
3566             }
3567 
3568             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3569                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3570                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3571 
3572                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3573                     return STATUS_INVALID_PARAMETER;
3574             }
3575 
3576             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3577                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3578                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3579                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3580                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE)
3581                     return STATUS_INVALID_PARAMETER;
3582             }
3583         }
3584     }
3585 
3586     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3587     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3588     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3589 
3590     Vcb->balance.paused = FALSE;
3591     Vcb->balance.removing = FALSE;
3592     Vcb->balance.shrinking = FALSE;
3593     Vcb->balance.status = STATUS_SUCCESS;
3594     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3595 
3596     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3597     if (!NT_SUCCESS(Status)) {
3598         ERR("PsCreateSystemThread returned %08x\n", Status);
3599         return Status;
3600     }
3601 
3602     return STATUS_SUCCESS;
3603 }
3604 
3605 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3606     KEY searchkey;
3607     traverse_ptr tp;
3608     NTSTATUS Status;
3609     BALANCE_ITEM* bi;
3610     int i;
3611 
3612     searchkey.obj_id = BALANCE_ITEM_ID;
3613     searchkey.obj_type = TYPE_TEMP_ITEM;
3614     searchkey.offset = 0;
3615 
3616     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, FALSE, NULL);
3617     if (!NT_SUCCESS(Status)) {
3618         ERR("find_item returned %08x\n", Status);
3619         return Status;
3620     }
3621 
3622     if (keycmp(tp.item->key, searchkey)) {
3623         TRACE("no balance item found\n");
3624         return STATUS_NOT_FOUND;
3625     }
3626 
3627     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3628         WARN("(%llx,%x,%llx) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3629              tp.item->size, sizeof(BALANCE_ITEM));
3630         return STATUS_INTERNAL_ERROR;
3631     }
3632 
3633     bi = (BALANCE_ITEM*)tp.item->data;
3634 
3635     if (bi->flags & BALANCE_FLAGS_DATA)
3636         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3637 
3638     if (bi->flags & BALANCE_FLAGS_METADATA)
3639         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3640 
3641     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3642         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3643 
3644     // do the heuristics that Linux driver does
3645 
3646     for (i = 0; i < 3; i++) {
3647         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3648             // if converting, don't redo chunks already done
3649 
3650             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3651                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3652 
3653             // don't balance chunks more than 90% filled - presumably these
3654             // have already been done
3655 
3656             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3657                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3658             ) {
3659                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3660                 Vcb->balance.opts[i].usage_start = 0;
3661                 Vcb->balance.opts[i].usage_end = 90;
3662             }
3663         }
3664     }
3665 
3666     if (Vcb->readonly || Vcb->options.skip_balance)
3667         Vcb->balance.paused = TRUE;
3668     else
3669         Vcb->balance.paused = FALSE;
3670 
3671     Vcb->balance.removing = FALSE;
3672     Vcb->balance.shrinking = FALSE;
3673     Vcb->balance.status = STATUS_SUCCESS;
3674     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3675 
3676     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3677     if (!NT_SUCCESS(Status)) {
3678         ERR("PsCreateSystemThread returned %08x\n", Status);
3679         return Status;
3680     }
3681 
3682     return STATUS_SUCCESS;
3683 }
3684 
3685 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3686     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3687 
3688     if (length < sizeof(btrfs_query_balance) || !data)
3689         return STATUS_INVALID_PARAMETER;
3690 
3691     if (!Vcb->balance.thread) {
3692         bqb->status = BTRFS_BALANCE_STOPPED;
3693 
3694         if (!NT_SUCCESS(Vcb->balance.status)) {
3695             bqb->status |= BTRFS_BALANCE_ERROR;
3696             bqb->error = Vcb->balance.status;
3697         }
3698 
3699         return STATUS_SUCCESS;
3700     }
3701 
3702     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3703 
3704     if (Vcb->balance.removing)
3705         bqb->status |= BTRFS_BALANCE_REMOVAL;
3706 
3707     if (Vcb->balance.shrinking)
3708         bqb->status |= BTRFS_BALANCE_SHRINKING;
3709 
3710     if (!NT_SUCCESS(Vcb->balance.status))
3711         bqb->status |= BTRFS_BALANCE_ERROR;
3712 
3713     bqb->chunks_left = Vcb->balance.chunks_left;
3714     bqb->total_chunks = Vcb->balance.total_chunks;
3715     bqb->error = Vcb->balance.status;
3716     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3717     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3718     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3719 
3720     return STATUS_SUCCESS;
3721 }
3722 
3723 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3724     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3725         return STATUS_PRIVILEGE_NOT_HELD;
3726 
3727     if (!Vcb->balance.thread)
3728         return STATUS_DEVICE_NOT_READY;
3729 
3730     if (Vcb->balance.paused)
3731         return STATUS_DEVICE_NOT_READY;
3732 
3733     Vcb->balance.paused = TRUE;
3734     KeClearEvent(&Vcb->balance.event);
3735 
3736     return STATUS_SUCCESS;
3737 }
3738 
3739 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3740     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3741         return STATUS_PRIVILEGE_NOT_HELD;
3742 
3743     if (!Vcb->balance.thread)
3744         return STATUS_DEVICE_NOT_READY;
3745 
3746     if (!Vcb->balance.paused)
3747         return STATUS_DEVICE_NOT_READY;
3748 
3749     if (Vcb->readonly)
3750         return STATUS_MEDIA_WRITE_PROTECTED;
3751 
3752     Vcb->balance.paused = FALSE;
3753     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3754 
3755     return STATUS_SUCCESS;
3756 }
3757 
3758 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3759     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3760         return STATUS_PRIVILEGE_NOT_HELD;
3761 
3762     if (!Vcb->balance.thread)
3763         return STATUS_DEVICE_NOT_READY;
3764 
3765     Vcb->balance.paused = FALSE;
3766     Vcb->balance.stopping = TRUE;
3767     Vcb->balance.status = STATUS_SUCCESS;
3768     KeSetEvent(&Vcb->balance.event, 0, FALSE);
3769 
3770     return STATUS_SUCCESS;
3771 }
3772 
3773 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3774     UINT64 devid;
3775     LIST_ENTRY* le;
3776     device* dev = NULL;
3777     NTSTATUS Status;
3778     int i;
3779     UINT64 num_rw_devices;
3780 
3781     TRACE("(%p, %p, %x)\n", Vcb, data, length);
3782 
3783     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3784         return STATUS_PRIVILEGE_NOT_HELD;
3785 
3786     if (length < sizeof(UINT64))
3787         return STATUS_INVALID_PARAMETER;
3788 
3789     devid = *(UINT64*)data;
3790 
3791     ExAcquireResourceSharedLite(&Vcb->tree_lock, TRUE);
3792 
3793     if (Vcb->readonly) {
3794         ExReleaseResourceLite(&Vcb->tree_lock);
3795         return STATUS_MEDIA_WRITE_PROTECTED;
3796     }
3797 
3798     num_rw_devices = 0;
3799 
3800     le = Vcb->devices.Flink;
3801     while (le != &Vcb->devices) {
3802         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3803 
3804         if (dev2->devitem.dev_id == devid)
3805             dev = dev2;
3806 
3807         if (!dev2->readonly)
3808             num_rw_devices++;
3809 
3810         le = le->Flink;
3811     }
3812 
3813     if (!dev) {
3814         ExReleaseResourceLite(&Vcb->tree_lock);
3815         WARN("device %llx not found\n", devid);
3816         return STATUS_NOT_FOUND;
3817     }
3818 
3819     if (!dev->readonly) {
3820         if (num_rw_devices == 1) {
3821             ExReleaseResourceLite(&Vcb->tree_lock);
3822             WARN("not removing last non-readonly device\n");
3823             return STATUS_INVALID_PARAMETER;
3824         }
3825 
3826         if (num_rw_devices == 4 &&
3827             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3828              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6))
3829         ) {
3830             ExReleaseResourceLite(&Vcb->tree_lock);
3831             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10)\n");
3832             return STATUS_CANNOT_DELETE;
3833         }
3834 
3835         if (num_rw_devices == 3 && (Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5)) {
3836             ExReleaseResourceLite(&Vcb->tree_lock);
3837             ERR("would not be enough devices to satisfy RAID requirement (RAID5)\n");
3838             return STATUS_CANNOT_DELETE;
3839         }
3840 
3841         if (num_rw_devices == 2 &&
3842             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3843              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3844         ) {
3845             ExReleaseResourceLite(&Vcb->tree_lock);
3846             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3847             return STATUS_CANNOT_DELETE;
3848         }
3849     }
3850 
3851     ExReleaseResourceLite(&Vcb->tree_lock);
3852 
3853     if (Vcb->balance.thread) {
3854         WARN("balance already running\n");
3855         return STATUS_DEVICE_NOT_READY;
3856     }
3857 
3858     dev->reloc = TRUE;
3859 
3860     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3861 
3862     for (i = 0; i < 3; i++) {
3863         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3864         Vcb->balance.opts[i].devid = devid;
3865     }
3866 
3867     Vcb->balance.paused = FALSE;
3868     Vcb->balance.removing = TRUE;
3869     Vcb->balance.shrinking = FALSE;
3870     Vcb->balance.status = STATUS_SUCCESS;
3871     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3872 
3873     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, NULL, NULL, NULL, balance_thread, Vcb);
3874     if (!NT_SUCCESS(Status)) {
3875         ERR("PsCreateSystemThread returned %08x\n", Status);
3876         dev->reloc = FALSE;
3877         return Status;
3878     }
3879 
3880     return STATUS_SUCCESS;
3881 }
3882