xref: /reactos/drivers/filesystems/btrfs/balance.c (revision 3cfd8ab7)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "btrfsioctl.h"
20 #include "crc32c.h"
21 #include <ntddstor.h>
22 
23 typedef struct {
24     uint64_t address;
25     uint64_t new_address;
26     tree_header* data;
27     EXTENT_ITEM* ei;
28     tree* t;
29     bool system;
30     LIST_ENTRY refs;
31     LIST_ENTRY list_entry;
32 } metadata_reloc;
33 
34 typedef struct {
35     uint8_t type;
36     uint64_t hash;
37 
38     union {
39         TREE_BLOCK_REF tbr;
40         SHARED_BLOCK_REF sbr;
41     };
42 
43     metadata_reloc* parent;
44     bool top;
45     LIST_ENTRY list_entry;
46 } metadata_reloc_ref;
47 
48 typedef struct {
49     uint64_t address;
50     uint64_t size;
51     uint64_t new_address;
52     chunk* newchunk;
53     EXTENT_ITEM* ei;
54     LIST_ENTRY refs;
55     LIST_ENTRY list_entry;
56 } data_reloc;
57 
58 typedef struct {
59     uint8_t type;
60     uint64_t hash;
61 
62     union {
63         EXTENT_DATA_REF edr;
64         SHARED_DATA_REF sdr;
65     };
66 
67     metadata_reloc* parent;
68     LIST_ENTRY list_entry;
69 } data_reloc_ref;
70 
71 #define BALANCE_UNIT 0x100000 // only read 1 MB at a time
72 
73 static NTSTATUS add_metadata_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, traverse_ptr* tp,
74                                    bool skinny, metadata_reloc** mr2, chunk* c, LIST_ENTRY* rollback) {
75     NTSTATUS Status;
76     metadata_reloc* mr;
77     EXTENT_ITEM* ei;
78     uint16_t len;
79     uint64_t inline_rc;
80     uint8_t* ptr;
81 
82     mr = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc), ALLOC_TAG);
83     if (!mr) {
84         ERR("out of memory\n");
85         return STATUS_INSUFFICIENT_RESOURCES;
86     }
87 
88     mr->address = tp->item->key.obj_id;
89     mr->data = NULL;
90     mr->ei = (EXTENT_ITEM*)tp->item->data;
91     mr->system = false;
92     InitializeListHead(&mr->refs);
93 
94     Status = delete_tree_item(Vcb, tp);
95     if (!NT_SUCCESS(Status)) {
96         ERR("delete_tree_item returned %08lx\n", Status);
97         ExFreePool(mr);
98         return Status;
99     }
100 
101     if (!c)
102         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
103 
104     if (c) {
105         acquire_chunk_lock(c, Vcb);
106 
107         c->used -= Vcb->superblock.node_size;
108 
109         space_list_add(c, tp->item->key.obj_id, Vcb->superblock.node_size, rollback);
110 
111         release_chunk_lock(c, Vcb);
112     }
113 
114     ei = (EXTENT_ITEM*)tp->item->data;
115     inline_rc = 0;
116 
117     len = tp->item->size - sizeof(EXTENT_ITEM);
118     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
119     if (!skinny) {
120         len -= sizeof(EXTENT_ITEM2);
121         ptr += sizeof(EXTENT_ITEM2);
122     }
123 
124     while (len > 0) {
125         uint8_t secttype = *ptr;
126         uint16_t sectlen = secttype == TYPE_TREE_BLOCK_REF ? sizeof(TREE_BLOCK_REF) : (secttype == TYPE_SHARED_BLOCK_REF ? sizeof(SHARED_BLOCK_REF) : 0);
127         metadata_reloc_ref* ref;
128 
129         len--;
130 
131         if (sectlen > len) {
132             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
133             return STATUS_INTERNAL_ERROR;
134         }
135 
136         if (sectlen == 0) {
137             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
138             return STATUS_INTERNAL_ERROR;
139         }
140 
141         ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
142         if (!ref) {
143             ERR("out of memory\n");
144             return STATUS_INSUFFICIENT_RESOURCES;
145         }
146 
147         if (secttype == TYPE_TREE_BLOCK_REF) {
148             ref->type = TYPE_TREE_BLOCK_REF;
149             RtlCopyMemory(&ref->tbr, ptr + sizeof(uint8_t), sizeof(TREE_BLOCK_REF));
150             inline_rc++;
151         } else if (secttype == TYPE_SHARED_BLOCK_REF) {
152             ref->type = TYPE_SHARED_BLOCK_REF;
153             RtlCopyMemory(&ref->sbr, ptr + sizeof(uint8_t), sizeof(SHARED_BLOCK_REF));
154             inline_rc++;
155         } else {
156             ERR("unexpected tree type %x\n", secttype);
157             ExFreePool(ref);
158             return STATUS_INTERNAL_ERROR;
159         }
160 
161         ref->parent = NULL;
162         ref->top = false;
163         InsertTailList(&mr->refs, &ref->list_entry);
164 
165         len -= sectlen;
166         ptr += sizeof(uint8_t) + sectlen;
167     }
168 
169     if (inline_rc < ei->refcount) { // look for non-inline entries
170         traverse_ptr tp2 = *tp, next_tp;
171 
172         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
173             tp2 = next_tp;
174 
175             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
176                 if (tp2.item->key.obj_type == TYPE_TREE_BLOCK_REF) {
177                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
178                     if (!ref) {
179                         ERR("out of memory\n");
180                         return STATUS_INSUFFICIENT_RESOURCES;
181                     }
182 
183                     ref->type = TYPE_TREE_BLOCK_REF;
184                     ref->tbr.offset = tp2.item->key.offset;
185                     ref->parent = NULL;
186                     ref->top = false;
187                     InsertTailList(&mr->refs, &ref->list_entry);
188 
189                     Status = delete_tree_item(Vcb, &tp2);
190                     if (!NT_SUCCESS(Status)) {
191                         ERR("delete_tree_item returned %08lx\n", Status);
192                         return Status;
193                     }
194                 } else if (tp2.item->key.obj_type == TYPE_SHARED_BLOCK_REF) {
195                     metadata_reloc_ref* ref = ExAllocatePoolWithTag(PagedPool, sizeof(metadata_reloc_ref), ALLOC_TAG);
196                     if (!ref) {
197                         ERR("out of memory\n");
198                         return STATUS_INSUFFICIENT_RESOURCES;
199                     }
200 
201                     ref->type = TYPE_SHARED_BLOCK_REF;
202                     ref->sbr.offset = tp2.item->key.offset;
203                     ref->parent = NULL;
204                     ref->top = false;
205                     InsertTailList(&mr->refs, &ref->list_entry);
206 
207                     Status = delete_tree_item(Vcb, &tp2);
208                     if (!NT_SUCCESS(Status)) {
209                         ERR("delete_tree_item returned %08lx\n", Status);
210                         return Status;
211                     }
212                 }
213             } else
214                 break;
215         }
216     }
217 
218     InsertTailList(items, &mr->list_entry);
219 
220     if (mr2)
221         *mr2 = mr;
222 
223     return STATUS_SUCCESS;
224 }
225 
226 static NTSTATUS add_metadata_reloc_parent(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
227                                           uint64_t address, metadata_reloc** mr2, LIST_ENTRY* rollback) {
228     LIST_ENTRY* le;
229     KEY searchkey;
230     traverse_ptr tp;
231     bool skinny = false;
232     NTSTATUS Status;
233 
234     le = items->Flink;
235     while (le != items) {
236         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
237 
238         if (mr->address == address) {
239             *mr2 = mr;
240             return STATUS_SUCCESS;
241         }
242 
243         le = le->Flink;
244     }
245 
246     searchkey.obj_id = address;
247     searchkey.obj_type = TYPE_METADATA_ITEM;
248     searchkey.offset = 0xffffffffffffffff;
249 
250     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
251     if (!NT_SUCCESS(Status)) {
252         ERR("find_item returned %08lx\n", Status);
253         return Status;
254     }
255 
256     if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM))
257         skinny = true;
258     else if (tp.item->key.obj_id == address && tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
259              tp.item->size >= sizeof(EXTENT_ITEM)) {
260         EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
261 
262         if (!(ei->flags & EXTENT_ITEM_TREE_BLOCK)) {
263             ERR("EXTENT_ITEM for %I64x found, but tree flag not set\n", address);
264             return STATUS_INTERNAL_ERROR;
265         }
266     } else {
267         ERR("could not find valid EXTENT_ITEM for address %I64x\n", address);
268         return STATUS_INTERNAL_ERROR;
269     }
270 
271     Status = add_metadata_reloc(Vcb, items, &tp, skinny, mr2, NULL, rollback);
272     if (!NT_SUCCESS(Status)) {
273         ERR("add_metadata_reloc returned %08lx\n", Status);
274         return Status;
275     }
276 
277     return STATUS_SUCCESS;
278 }
279 
sort_metadata_reloc_refs(metadata_reloc * mr)280 static void sort_metadata_reloc_refs(metadata_reloc* mr) {
281     LIST_ENTRY newlist, *le;
282 
283     if (mr->refs.Flink == mr->refs.Blink) // 0 or 1 items
284         return;
285 
286     // insertion sort
287 
288     InitializeListHead(&newlist);
289 
290     while (!IsListEmpty(&mr->refs)) {
291         metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
292         bool inserted = false;
293 
294         if (ref->type == TYPE_TREE_BLOCK_REF)
295             ref->hash = ref->tbr.offset;
296         else if (ref->type == TYPE_SHARED_BLOCK_REF)
297             ref->hash = ref->parent->new_address;
298 
299         le = newlist.Flink;
300         while (le != &newlist) {
301             metadata_reloc_ref* ref2 = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
302 
303             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
304                 InsertHeadList(le->Blink, &ref->list_entry);
305                 inserted = true;
306                 break;
307             }
308 
309             le = le->Flink;
310         }
311 
312         if (!inserted)
313             InsertTailList(&newlist, &ref->list_entry);
314     }
315 
316     newlist.Flink->Blink = &mr->refs;
317     newlist.Blink->Flink = &mr->refs;
318     mr->refs.Flink = newlist.Flink;
319     mr->refs.Blink = newlist.Blink;
320 }
321 
322 static NTSTATUS add_metadata_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, metadata_reloc* mr) {
323     NTSTATUS Status;
324     LIST_ENTRY* le;
325     uint64_t rc = 0;
326     uint16_t inline_len;
327     bool all_inline = true;
328     metadata_reloc_ref* first_noninline = NULL;
329     EXTENT_ITEM* ei;
330     uint8_t* ptr;
331 
332     inline_len = sizeof(EXTENT_ITEM);
333     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA))
334         inline_len += sizeof(EXTENT_ITEM2);
335 
336     sort_metadata_reloc_refs(mr);
337 
338     le = mr->refs.Flink;
339     while (le != &mr->refs) {
340         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
341         uint16_t extlen = 0;
342 
343         rc++;
344 
345         if (ref->type == TYPE_TREE_BLOCK_REF)
346             extlen += sizeof(TREE_BLOCK_REF);
347         else if (ref->type == TYPE_SHARED_BLOCK_REF)
348             extlen += sizeof(SHARED_BLOCK_REF);
349 
350         if (all_inline) {
351             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
352                 all_inline = false;
353                 first_noninline = ref;
354             } else
355                 inline_len += extlen + 1;
356         }
357 
358         le = le->Flink;
359     }
360 
361     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
362     if (!ei) {
363         ERR("out of memory\n");
364         return STATUS_INSUFFICIENT_RESOURCES;
365     }
366 
367     ei->refcount = rc;
368     ei->generation = mr->ei->generation;
369     ei->flags = mr->ei->flags;
370     ptr = (uint8_t*)&ei[1];
371 
372     if (!(Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)) {
373         EXTENT_ITEM2* ei2 = (EXTENT_ITEM2*)ptr;
374 
375         ei2->firstitem = *(KEY*)&mr->data[1];
376         ei2->level = mr->data->level;
377 
378         ptr += sizeof(EXTENT_ITEM2);
379     }
380 
381     le = mr->refs.Flink;
382     while (le != &mr->refs) {
383         metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
384 
385         if (ref == first_noninline)
386             break;
387 
388         *ptr = ref->type;
389         ptr++;
390 
391         if (ref->type == TYPE_TREE_BLOCK_REF) {
392             TREE_BLOCK_REF* tbr = (TREE_BLOCK_REF*)ptr;
393 
394             tbr->offset = ref->tbr.offset;
395 
396             ptr += sizeof(TREE_BLOCK_REF);
397         } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
398             SHARED_BLOCK_REF* sbr = (SHARED_BLOCK_REF*)ptr;
399 
400             sbr->offset = ref->parent->new_address;
401 
402             ptr += sizeof(SHARED_BLOCK_REF);
403         }
404 
405         le = le->Flink;
406     }
407 
408     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_SKINNY_METADATA)
409         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_METADATA_ITEM, mr->data->level, ei, inline_len, NULL, NULL);
410     else
411         Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_EXTENT_ITEM, Vcb->superblock.node_size, ei, inline_len, NULL, NULL);
412 
413     if (!NT_SUCCESS(Status)) {
414         ERR("insert_tree_item returned %08lx\n", Status);
415         ExFreePool(ei);
416         return Status;
417     }
418 
419     if (!all_inline) {
420         le = &first_noninline->list_entry;
421 
422         while (le != &mr->refs) {
423             metadata_reloc_ref* ref = CONTAINING_RECORD(le, metadata_reloc_ref, list_entry);
424 
425             if (ref->type == TYPE_TREE_BLOCK_REF) {
426                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_TREE_BLOCK_REF, ref->tbr.offset, NULL, 0, NULL, NULL);
427                 if (!NT_SUCCESS(Status)) {
428                     ERR("insert_tree_item returned %08lx\n", Status);
429                     return Status;
430                 }
431             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
432                 Status = insert_tree_item(Vcb, Vcb->extent_root, mr->new_address, TYPE_SHARED_BLOCK_REF, ref->parent->new_address, NULL, 0, NULL, NULL);
433                 if (!NT_SUCCESS(Status)) {
434                     ERR("insert_tree_item returned %08lx\n", Status);
435                     return Status;
436                 }
437             }
438 
439             le = le->Flink;
440         }
441     }
442 
443     if (ei->flags & EXTENT_ITEM_SHARED_BACKREFS || mr->data->flags & HEADER_FLAG_SHARED_BACKREF || !(mr->data->flags & HEADER_FLAG_MIXED_BACKREF)) {
444         if (mr->data->level > 0) {
445             uint16_t i;
446             internal_node* in = (internal_node*)&mr->data[1];
447 
448             for (i = 0; i < mr->data->num_items; i++) {
449                 uint64_t sbrrc = find_extent_shared_tree_refcount(Vcb, in[i].address, mr->address, NULL);
450 
451                 if (sbrrc > 0) {
452                     SHARED_BLOCK_REF sbr;
453 
454                     sbr.offset = mr->new_address;
455 
456                     Status = increase_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0, NULL);
457                     if (!NT_SUCCESS(Status)) {
458                         ERR("increase_extent_refcount returned %08lx\n", Status);
459                         return Status;
460                     }
461 
462                     sbr.offset = mr->address;
463 
464                     Status = decrease_extent_refcount(Vcb, in[i].address, Vcb->superblock.node_size, TYPE_SHARED_BLOCK_REF, &sbr, NULL, 0,
465                                                       sbr.offset, false, NULL);
466                     if (!NT_SUCCESS(Status)) {
467                         ERR("decrease_extent_refcount returned %08lx\n", Status);
468                         return Status;
469                     }
470                 }
471             }
472         } else {
473             uint16_t i;
474             leaf_node* ln = (leaf_node*)&mr->data[1];
475 
476             for (i = 0; i < mr->data->num_items; i++) {
477                 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
478                     EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
479 
480                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
481                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
482 
483                         if (ed2->size > 0) { // not sparse
484                             uint32_t sdrrc = find_extent_shared_data_refcount(Vcb, ed2->address, mr->address, NULL);
485 
486                             if (sdrrc > 0) {
487                                 SHARED_DATA_REF sdr;
488                                 chunk* c;
489 
490                                 sdr.offset = mr->new_address;
491                                 sdr.count = sdrrc;
492 
493                                 Status = increase_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0, NULL);
494                                 if (!NT_SUCCESS(Status)) {
495                                     ERR("increase_extent_refcount returned %08lx\n", Status);
496                                     return Status;
497                                 }
498 
499                                 sdr.offset = mr->address;
500 
501                                 Status = decrease_extent_refcount(Vcb, ed2->address, ed2->size, TYPE_SHARED_DATA_REF, &sdr, NULL, 0,
502                                                                   sdr.offset, false, NULL);
503                                 if (!NT_SUCCESS(Status)) {
504                                     ERR("decrease_extent_refcount returned %08lx\n", Status);
505                                     return Status;
506                                 }
507 
508                                 c = get_chunk_from_address(Vcb, ed2->address);
509 
510                                 if (c) {
511                                     // check changed_extents
512 
513                                     ExAcquireResourceExclusiveLite(&c->changed_extents_lock, true);
514 
515                                     le = c->changed_extents.Flink;
516 
517                                     while (le != &c->changed_extents) {
518                                         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
519 
520                                         if (ce->address == ed2->address) {
521                                             LIST_ENTRY* le2;
522 
523                                             le2 = ce->refs.Flink;
524                                             while (le2 != &ce->refs) {
525                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
526 
527                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
528                                                     cer->sdr.offset = mr->new_address;
529                                                     break;
530                                                 }
531 
532                                                 le2 = le2->Flink;
533                                             }
534 
535                                             le2 = ce->old_refs.Flink;
536                                             while (le2 != &ce->old_refs) {
537                                                 changed_extent_ref* cer = CONTAINING_RECORD(le2, changed_extent_ref, list_entry);
538 
539                                                 if (cer->type == TYPE_SHARED_DATA_REF && cer->sdr.offset == mr->address) {
540                                                     cer->sdr.offset = mr->new_address;
541                                                     break;
542                                                 }
543 
544                                                 le2 = le2->Flink;
545                                             }
546 
547                                             break;
548                                         }
549 
550                                         le = le->Flink;
551                                     }
552 
553                                     ExReleaseResourceLite(&c->changed_extents_lock);
554                                 }
555                             }
556                         }
557                     }
558                 }
559             }
560         }
561     }
562 
563     return STATUS_SUCCESS;
564 }
565 
566 static NTSTATUS write_metadata_items(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items,
567                                      LIST_ENTRY* data_items, chunk* c, LIST_ENTRY* rollback) {
568     LIST_ENTRY tree_writes, *le;
569     NTSTATUS Status;
570     traverse_ptr tp;
571     uint8_t level, max_level = 0;
572     chunk* newchunk = NULL;
573 
574     InitializeListHead(&tree_writes);
575 
576     le = items->Flink;
577     while (le != items) {
578         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
579         LIST_ENTRY* le2;
580         chunk* pc;
581 
582         mr->data = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
583         if (!mr->data) {
584             ERR("out of memory\n");
585             return STATUS_INSUFFICIENT_RESOURCES;
586         }
587 
588         Status = read_data(Vcb, mr->address, Vcb->superblock.node_size, NULL, true, (uint8_t*)mr->data,
589                            c && mr->address >= c->offset && mr->address < c->offset + c->chunk_item->size ? c : NULL, &pc, NULL, 0, false, NormalPagePriority);
590         if (!NT_SUCCESS(Status)) {
591             ERR("read_data returned %08lx\n", Status);
592             return Status;
593         }
594 
595         if (pc->chunk_item->type & BLOCK_FLAG_SYSTEM)
596             mr->system = true;
597 
598         if (data_items && mr->data->level == 0) {
599             le2 = data_items->Flink;
600             while (le2 != data_items) {
601                 data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
602                 leaf_node* ln = (leaf_node*)&mr->data[1];
603                 uint16_t i;
604 
605                 for (i = 0; i < mr->data->num_items; i++) {
606                     if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
607                         EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)mr->data + sizeof(tree_header) + ln[i].offset);
608 
609                         if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
610                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
611 
612                             if (ed2->address == dr->address)
613                                 ed2->address = dr->new_address;
614                         }
615                     }
616                 }
617 
618                 le2 = le2->Flink;
619             }
620         }
621 
622         if (mr->data->level > max_level)
623             max_level = mr->data->level;
624 
625         le2 = mr->refs.Flink;
626         while (le2 != &mr->refs) {
627             metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
628 
629             if (ref->type == TYPE_TREE_BLOCK_REF) {
630                 KEY* firstitem;
631                 root* r = NULL;
632                 LIST_ENTRY* le3;
633                 tree* t;
634 
635                 firstitem = (KEY*)&mr->data[1];
636 
637                 le3 = Vcb->roots.Flink;
638                 while (le3 != &Vcb->roots) {
639                     root* r2 = CONTAINING_RECORD(le3, root, list_entry);
640 
641                     if (r2->id == ref->tbr.offset) {
642                         r = r2;
643                         break;
644                     }
645 
646                     le3 = le3->Flink;
647                 }
648 
649                 if (!r) {
650                     ERR("could not find subvol with id %I64x\n", ref->tbr.offset);
651                     return STATUS_INTERNAL_ERROR;
652                 }
653 
654                 Status = find_item_to_level(Vcb, r, &tp, firstitem, false, mr->data->level + 1, NULL);
655                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
656                     ERR("find_item_to_level returned %08lx\n", Status);
657                     return Status;
658                 }
659 
660                 t = tp.tree;
661                 while (t && t->header.level < mr->data->level + 1) {
662                     t = t->parent;
663                 }
664 
665                 if (!t)
666                     ref->top = true;
667                 else {
668                     metadata_reloc* mr2;
669 
670                     Status = add_metadata_reloc_parent(Vcb, items, t->header.address, &mr2, rollback);
671                     if (!NT_SUCCESS(Status)) {
672                         ERR("add_metadata_reloc_parent returned %08lx\n", Status);
673                         return Status;
674                     }
675 
676                     ref->parent = mr2;
677                 }
678             } else if (ref->type == TYPE_SHARED_BLOCK_REF) {
679                 metadata_reloc* mr2;
680 
681                 Status = add_metadata_reloc_parent(Vcb, items, ref->sbr.offset, &mr2, rollback);
682                 if (!NT_SUCCESS(Status)) {
683                     ERR("add_metadata_reloc_parent returned %08lx\n", Status);
684                     return Status;
685                 }
686 
687                 ref->parent = mr2;
688             }
689 
690             le2 = le2->Flink;
691         }
692 
693         le = le->Flink;
694     }
695 
696     le = items->Flink;
697     while (le != items) {
698         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
699         LIST_ENTRY* le2;
700         uint32_t hash;
701 
702         mr->t = NULL;
703 
704         hash = calc_crc32c(0xffffffff, (uint8_t*)&mr->address, sizeof(uint64_t));
705 
706         le2 = Vcb->trees_ptrs[hash >> 24];
707 
708         if (le2) {
709             while (le2 != &Vcb->trees_hash) {
710                 tree* t = CONTAINING_RECORD(le2, tree, list_entry_hash);
711 
712                 if (t->header.address == mr->address) {
713                     mr->t = t;
714                     break;
715                 } else if (t->hash > hash)
716                     break;
717 
718                 le2 = le2->Flink;
719             }
720         }
721 
722         le = le->Flink;
723     }
724 
725     for (level = 0; level <= max_level; level++) {
726         le = items->Flink;
727         while (le != items) {
728             metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
729 
730             if (mr->data->level == level) {
731                 bool done = false;
732                 LIST_ENTRY* le2;
733                 tree_write* tw;
734                 uint64_t flags;
735                 tree* t3;
736 
737                 if (mr->system)
738                     flags = Vcb->system_flags;
739                 else if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS)
740                     flags = Vcb->data_flags;
741                 else
742                     flags = Vcb->metadata_flags;
743 
744                 if (newchunk) {
745                     acquire_chunk_lock(newchunk, Vcb);
746 
747                     if (newchunk->chunk_item->type == flags && find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
748                         newchunk->used += Vcb->superblock.node_size;
749                         space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
750                         done = true;
751                     }
752 
753                     release_chunk_lock(newchunk, Vcb);
754                 }
755 
756                 if (!done) {
757                     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
758 
759                     le2 = Vcb->chunks.Flink;
760                     while (le2 != &Vcb->chunks) {
761                         chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
762 
763                         if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == flags) {
764                             acquire_chunk_lock(c2, Vcb);
765 
766                             if ((c2->chunk_item->size - c2->used) >= Vcb->superblock.node_size) {
767                                 if (find_metadata_address_in_chunk(Vcb, c2, &mr->new_address)) {
768                                     c2->used += Vcb->superblock.node_size;
769                                     space_list_subtract(c2, mr->new_address, Vcb->superblock.node_size, rollback);
770                                     release_chunk_lock(c2, Vcb);
771                                     newchunk = c2;
772                                     done = true;
773                                     break;
774                                 }
775                             }
776 
777                             release_chunk_lock(c2, Vcb);
778                         }
779 
780                         le2 = le2->Flink;
781                     }
782 
783                     // allocate new chunk if necessary
784                     if (!done) {
785                         Status = alloc_chunk(Vcb, flags, &newchunk, false);
786 
787                         if (!NT_SUCCESS(Status)) {
788                             ERR("alloc_chunk returned %08lx\n", Status);
789                             ExReleaseResourceLite(&Vcb->chunk_lock);
790                             goto end;
791                         }
792 
793                         acquire_chunk_lock(newchunk, Vcb);
794 
795                         newchunk->balance_num = Vcb->balance.balance_num;
796 
797                         if (!find_metadata_address_in_chunk(Vcb, newchunk, &mr->new_address)) {
798                             release_chunk_lock(newchunk, Vcb);
799                             ExReleaseResourceLite(&Vcb->chunk_lock);
800                             ERR("could not find address in new chunk\n");
801                             Status = STATUS_DISK_FULL;
802                             goto end;
803                         } else {
804                             newchunk->used += Vcb->superblock.node_size;
805                             space_list_subtract(newchunk, mr->new_address, Vcb->superblock.node_size, rollback);
806                         }
807 
808                         release_chunk_lock(newchunk, Vcb);
809                     }
810 
811                     ExReleaseResourceLite(&Vcb->chunk_lock);
812                 }
813 
814                 // update parents
815                 le2 = mr->refs.Flink;
816                 while (le2 != &mr->refs) {
817                     metadata_reloc_ref* ref = CONTAINING_RECORD(le2, metadata_reloc_ref, list_entry);
818 
819                     if (ref->parent) {
820                         uint16_t i;
821                         internal_node* in = (internal_node*)&ref->parent->data[1];
822 
823                         for (i = 0; i < ref->parent->data->num_items; i++) {
824                             if (in[i].address == mr->address) {
825                                 in[i].address = mr->new_address;
826                                 break;
827                             }
828                         }
829 
830                         if (ref->parent->t) {
831                             LIST_ENTRY* le3;
832 
833                             le3 = ref->parent->t->itemlist.Flink;
834                             while (le3 != &ref->parent->t->itemlist) {
835                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
836 
837                                 if (!td->inserted && td->treeholder.address == mr->address)
838                                     td->treeholder.address = mr->new_address;
839 
840                                 le3 = le3->Flink;
841                             }
842                         }
843                     } else if (ref->top && ref->type == TYPE_TREE_BLOCK_REF) {
844                         LIST_ENTRY* le3;
845                         root* r = NULL;
846 
847                         // alter ROOT_ITEM
848 
849                         le3 = Vcb->roots.Flink;
850                         while (le3 != &Vcb->roots) {
851                             root* r2 = CONTAINING_RECORD(le3, root, list_entry);
852 
853                             if (r2->id == ref->tbr.offset) {
854                                 r = r2;
855                                 break;
856                             }
857 
858                             le3 = le3->Flink;
859                         }
860 
861                         if (r) {
862                             r->treeholder.address = mr->new_address;
863 
864                             if (r == Vcb->root_root)
865                                 Vcb->superblock.root_tree_addr = mr->new_address;
866                             else if (r == Vcb->chunk_root)
867                                 Vcb->superblock.chunk_tree_addr = mr->new_address;
868                             else if (r->root_item.block_number == mr->address) {
869                                 KEY searchkey;
870                                 ROOT_ITEM* ri;
871 
872                                 r->root_item.block_number = mr->new_address;
873 
874                                 searchkey.obj_id = r->id;
875                                 searchkey.obj_type = TYPE_ROOT_ITEM;
876                                 searchkey.offset = 0xffffffffffffffff;
877 
878                                 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
879                                 if (!NT_SUCCESS(Status)) {
880                                     ERR("find_item returned %08lx\n", Status);
881                                     goto end;
882                                 }
883 
884                                 if (tp.item->key.obj_id != searchkey.obj_id || tp.item->key.obj_type != searchkey.obj_type) {
885                                     ERR("could not find ROOT_ITEM for tree %I64x\n", searchkey.obj_id);
886                                     Status = STATUS_INTERNAL_ERROR;
887                                     goto end;
888                                 }
889 
890                                 ri = ExAllocatePoolWithTag(PagedPool, sizeof(ROOT_ITEM), ALLOC_TAG);
891                                 if (!ri) {
892                                     ERR("out of memory\n");
893                                     Status = STATUS_INSUFFICIENT_RESOURCES;
894                                     goto end;
895                                 }
896 
897                                 RtlCopyMemory(ri, &r->root_item, sizeof(ROOT_ITEM));
898 
899                                 Status = delete_tree_item(Vcb, &tp);
900                                 if (!NT_SUCCESS(Status)) {
901                                     ERR("delete_tree_item returned %08lx\n", Status);
902                                     goto end;
903                                 }
904 
905                                 Status = insert_tree_item(Vcb, Vcb->root_root, tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, ri, sizeof(ROOT_ITEM), NULL, NULL);
906                                 if (!NT_SUCCESS(Status)) {
907                                     ERR("insert_tree_item returned %08lx\n", Status);
908                                     goto end;
909                                 }
910                             }
911                         }
912                     }
913 
914                     le2 = le2->Flink;
915                 }
916 
917                 mr->data->address = mr->new_address;
918 
919                 t3 = mr->t;
920 
921                 while (t3) {
922                     uint8_t h;
923                     bool inserted;
924                     tree* t4 = NULL;
925 
926                     // check if tree loaded more than once
927                     if (t3->list_entry.Flink != &Vcb->trees_hash) {
928                         tree* nt = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
929 
930                         if (nt->header.address == t3->header.address)
931                             t4 = nt;
932                     }
933 
934                     t3->header.address = mr->new_address;
935 
936                     h = t3->hash >> 24;
937 
938                     if (Vcb->trees_ptrs[h] == &t3->list_entry_hash) {
939                         if (t3->list_entry_hash.Flink == &Vcb->trees_hash)
940                             Vcb->trees_ptrs[h] = NULL;
941                         else {
942                             tree* t2 = CONTAINING_RECORD(t3->list_entry_hash.Flink, tree, list_entry_hash);
943 
944                             if (t2->hash >> 24 == h)
945                                 Vcb->trees_ptrs[h] = &t2->list_entry_hash;
946                             else
947                                 Vcb->trees_ptrs[h] = NULL;
948                         }
949                     }
950 
951                     RemoveEntryList(&t3->list_entry_hash);
952 
953                     t3->hash = calc_crc32c(0xffffffff, (uint8_t*)&t3->header.address, sizeof(uint64_t));
954                     h = t3->hash >> 24;
955 
956                     if (!Vcb->trees_ptrs[h]) {
957                         uint8_t h2 = h;
958 
959                         le2 = Vcb->trees_hash.Flink;
960 
961                         if (h2 > 0) {
962                             h2--;
963                             do {
964                                 if (Vcb->trees_ptrs[h2]) {
965                                     le2 = Vcb->trees_ptrs[h2];
966                                     break;
967                                 }
968 
969                                 h2--;
970                             } while (h2 > 0);
971                         }
972                     } else
973                         le2 = Vcb->trees_ptrs[h];
974 
975                     inserted = false;
976                     while (le2 != &Vcb->trees_hash) {
977                         tree* t2 = CONTAINING_RECORD(le2, tree, list_entry_hash);
978 
979                         if (t2->hash >= t3->hash) {
980                             InsertHeadList(le2->Blink, &t3->list_entry_hash);
981                             inserted = true;
982                             break;
983                         }
984 
985                         le2 = le2->Flink;
986                     }
987 
988                     if (!inserted)
989                         InsertTailList(&Vcb->trees_hash, &t3->list_entry_hash);
990 
991                     if (!Vcb->trees_ptrs[h] || t3->list_entry_hash.Flink == Vcb->trees_ptrs[h])
992                         Vcb->trees_ptrs[h] = &t3->list_entry_hash;
993 
994                     if (data_items && level == 0) {
995                         le2 = data_items->Flink;
996 
997                         while (le2 != data_items) {
998                             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
999                             LIST_ENTRY* le3 = t3->itemlist.Flink;
1000 
1001                             while (le3 != &t3->itemlist) {
1002                                 tree_data* td = CONTAINING_RECORD(le3, tree_data, list_entry);
1003 
1004                                 if (!td->inserted && td->key.obj_type == TYPE_EXTENT_DATA && td->size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
1005                                     EXTENT_DATA* ed = (EXTENT_DATA*)td->data;
1006 
1007                                     if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) {
1008                                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1009 
1010                                         if (ed2->address == dr->address)
1011                                             ed2->address = dr->new_address;
1012                                     }
1013                                 }
1014 
1015                                 le3 = le3->Flink;
1016                             }
1017 
1018                             le2 = le2->Flink;
1019                         }
1020                     }
1021 
1022                     t3 = t4;
1023                 }
1024 
1025                 calc_tree_checksum(Vcb, mr->data);
1026 
1027                 tw = ExAllocatePoolWithTag(PagedPool, sizeof(tree_write), ALLOC_TAG);
1028                 if (!tw) {
1029                     ERR("out of memory\n");
1030                     Status = STATUS_INSUFFICIENT_RESOURCES;
1031                     goto end;
1032                 }
1033 
1034                 tw->address = mr->new_address;
1035                 tw->length = Vcb->superblock.node_size;
1036                 tw->data = (uint8_t*)mr->data;
1037                 tw->allocated = false;
1038 
1039                 if (IsListEmpty(&tree_writes))
1040                     InsertTailList(&tree_writes, &tw->list_entry);
1041                 else {
1042                     bool inserted = false;
1043 
1044                     le2 = tree_writes.Flink;
1045                     while (le2 != &tree_writes) {
1046                         tree_write* tw2 = CONTAINING_RECORD(le2, tree_write, list_entry);
1047 
1048                         if (tw2->address > tw->address) {
1049                             InsertHeadList(le2->Blink, &tw->list_entry);
1050                             inserted = true;
1051                             break;
1052                         }
1053 
1054                         le2 = le2->Flink;
1055                     }
1056 
1057                     if (!inserted)
1058                         InsertTailList(&tree_writes, &tw->list_entry);
1059                 }
1060             }
1061 
1062             le = le->Flink;
1063         }
1064     }
1065 
1066     Status = do_tree_writes(Vcb, &tree_writes, true);
1067     if (!NT_SUCCESS(Status)) {
1068         ERR("do_tree_writes returned %08lx\n", Status);
1069         goto end;
1070     }
1071 
1072     le = items->Flink;
1073     while (le != items) {
1074         metadata_reloc* mr = CONTAINING_RECORD(le, metadata_reloc, list_entry);
1075 
1076         Status = add_metadata_reloc_extent_item(Vcb, mr);
1077         if (!NT_SUCCESS(Status)) {
1078             ERR("add_metadata_reloc_extent_item returned %08lx\n", Status);
1079             goto end;
1080         }
1081 
1082         le = le->Flink;
1083     }
1084 
1085     Status = STATUS_SUCCESS;
1086 
1087 end:
1088     while (!IsListEmpty(&tree_writes)) {
1089         tree_write* tw = CONTAINING_RECORD(RemoveHeadList(&tree_writes), tree_write, list_entry);
1090 
1091         if (tw->allocated)
1092             ExFreePool(tw->data);
1093 
1094         ExFreePool(tw);
1095     }
1096 
1097     return Status;
1098 }
1099 
balance_metadata_chunk(device_extension * Vcb,chunk * c,bool * changed)1100 static NTSTATUS balance_metadata_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1101     KEY searchkey;
1102     traverse_ptr tp;
1103     NTSTATUS Status;
1104     bool b;
1105     LIST_ENTRY items, rollback;
1106     uint32_t loaded = 0;
1107 
1108     TRACE("chunk %I64x\n", c->offset);
1109 
1110     InitializeListHead(&rollback);
1111     InitializeListHead(&items);
1112 
1113     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1114 
1115     searchkey.obj_id = c->offset;
1116     searchkey.obj_type = TYPE_METADATA_ITEM;
1117     searchkey.offset = 0xffffffffffffffff;
1118 
1119     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1120     if (!NT_SUCCESS(Status)) {
1121         ERR("find_item returned %08lx\n", Status);
1122         goto end;
1123     }
1124 
1125     do {
1126         traverse_ptr next_tp;
1127 
1128         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1129             break;
1130 
1131         if (tp.item->key.obj_id >= c->offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
1132             bool tree = false, skinny = false;
1133 
1134             if (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1135                 tree = true;
1136                 skinny = true;
1137             } else if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.offset == Vcb->superblock.node_size &&
1138                        tp.item->size >= sizeof(EXTENT_ITEM)) {
1139                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1140 
1141                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1142                     tree = true;
1143             }
1144 
1145             if (tree) {
1146                 Status = add_metadata_reloc(Vcb, &items, &tp, skinny, NULL, c, &rollback);
1147 
1148                 if (!NT_SUCCESS(Status)) {
1149                     ERR("add_metadata_reloc returned %08lx\n", Status);
1150                     goto end;
1151                 }
1152 
1153                 loaded++;
1154 
1155                 if (loaded >= 64) // only do 64 at a time
1156                     break;
1157             }
1158         }
1159 
1160         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1161 
1162         if (b)
1163             tp = next_tp;
1164     } while (b);
1165 
1166     if (IsListEmpty(&items)) {
1167         *changed = false;
1168         Status = STATUS_SUCCESS;
1169         goto end;
1170     } else
1171         *changed = true;
1172 
1173     Status = write_metadata_items(Vcb, &items, NULL, c, &rollback);
1174     if (!NT_SUCCESS(Status)) {
1175         ERR("write_metadata_items returned %08lx\n", Status);
1176         goto end;
1177     }
1178 
1179     Status = STATUS_SUCCESS;
1180 
1181     Vcb->need_write = true;
1182 
1183 end:
1184     if (NT_SUCCESS(Status)) {
1185         Status = do_write(Vcb, NULL);
1186         if (!NT_SUCCESS(Status))
1187             ERR("do_write returned %08lx\n", Status);
1188     }
1189 
1190     if (NT_SUCCESS(Status))
1191         clear_rollback(&rollback);
1192     else
1193         do_rollback(Vcb, &rollback);
1194 
1195     free_trees(Vcb);
1196 
1197     ExReleaseResourceLite(&Vcb->tree_lock);
1198 
1199     while (!IsListEmpty(&items)) {
1200         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&items), metadata_reloc, list_entry);
1201 
1202         while (!IsListEmpty(&mr->refs)) {
1203             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
1204 
1205             ExFreePool(ref);
1206         }
1207 
1208         if (mr->data)
1209             ExFreePool(mr->data);
1210 
1211         ExFreePool(mr);
1212     }
1213 
1214     return Status;
1215 }
1216 
1217 static NTSTATUS data_reloc_add_tree_edr(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* metadata_items,
1218                                         data_reloc* dr, EXTENT_DATA_REF* edr, LIST_ENTRY* rollback) {
1219     NTSTATUS Status;
1220     LIST_ENTRY* le;
1221     KEY searchkey;
1222     traverse_ptr tp;
1223     root* r = NULL;
1224     metadata_reloc* mr;
1225     uint64_t last_tree = 0;
1226     data_reloc_ref* ref;
1227 
1228     le = Vcb->roots.Flink;
1229     while (le != &Vcb->roots) {
1230         root* r2 = CONTAINING_RECORD(le, root, list_entry);
1231 
1232         if (r2->id == edr->root) {
1233             r = r2;
1234             break;
1235         }
1236 
1237         le = le->Flink;
1238     }
1239 
1240     if (!r) {
1241         ERR("could not find subvol %I64x\n", edr->root);
1242         return STATUS_INTERNAL_ERROR;
1243     }
1244 
1245     searchkey.obj_id = edr->objid;
1246     searchkey.obj_type = TYPE_EXTENT_DATA;
1247     searchkey.offset = 0;
1248 
1249     Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
1250     if (!NT_SUCCESS(Status)) {
1251         ERR("find_item returned %08lx\n", Status);
1252         return Status;
1253     }
1254 
1255     if (tp.item->key.obj_id < searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type < searchkey.obj_type)) {
1256         traverse_ptr tp2;
1257 
1258         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1259             tp = tp2;
1260         else {
1261             ERR("could not find EXTENT_DATA for inode %I64x in root %I64x\n", searchkey.obj_id, r->id);
1262             return STATUS_INTERNAL_ERROR;
1263         }
1264     }
1265 
1266     ref = NULL;
1267 
1268     while (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
1269         traverse_ptr tp2;
1270 
1271         if (tp.item->size >= sizeof(EXTENT_DATA)) {
1272             EXTENT_DATA* ed = (EXTENT_DATA*)tp.item->data;
1273 
1274             if ((ed->type == EXTENT_TYPE_PREALLOC || ed->type == EXTENT_TYPE_REGULAR) && tp.item->size >= offsetof(EXTENT_DATA, data[0]) + sizeof(EXTENT_DATA2)) {
1275                 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
1276 
1277                 if (ed2->address == dr->address && ed2->size == dr->size && tp.item->key.offset - ed2->offset == edr->offset) {
1278                     if (ref && last_tree == tp.tree->header.address)
1279                         ref->edr.count++;
1280                     else {
1281                         ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1282                         if (!ref) {
1283                             ERR("out of memory\n");
1284                             return STATUS_INSUFFICIENT_RESOURCES;
1285                         }
1286 
1287                         ref->type = TYPE_EXTENT_DATA_REF;
1288                         RtlCopyMemory(&ref->edr, edr, sizeof(EXTENT_DATA_REF));
1289                         ref->edr.count = 1;
1290 
1291                         Status = add_metadata_reloc_parent(Vcb, metadata_items, tp.tree->header.address, &mr, rollback);
1292                         if (!NT_SUCCESS(Status)) {
1293                             ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1294                             ExFreePool(ref);
1295                             return Status;
1296                         }
1297 
1298                         last_tree = tp.tree->header.address;
1299                         ref->parent = mr;
1300 
1301                         InsertTailList(&dr->refs, &ref->list_entry);
1302                     }
1303                 }
1304             }
1305         }
1306 
1307         if (find_next_item(Vcb, &tp, &tp2, false, NULL))
1308             tp = tp2;
1309         else
1310             break;
1311     }
1312 
1313     return STATUS_SUCCESS;
1314 }
1315 
1316 static NTSTATUS add_data_reloc(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, LIST_ENTRY* items, LIST_ENTRY* metadata_items,
1317                                traverse_ptr* tp, chunk* c, LIST_ENTRY* rollback) {
1318     NTSTATUS Status;
1319     data_reloc* dr;
1320     EXTENT_ITEM* ei;
1321     uint16_t len;
1322     uint64_t inline_rc;
1323     uint8_t* ptr;
1324 
1325     dr = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc), ALLOC_TAG);
1326     if (!dr) {
1327         ERR("out of memory\n");
1328         return STATUS_INSUFFICIENT_RESOURCES;
1329     }
1330 
1331     dr->address = tp->item->key.obj_id;
1332     dr->size = tp->item->key.offset;
1333     dr->ei = (EXTENT_ITEM*)tp->item->data;
1334     InitializeListHead(&dr->refs);
1335 
1336     Status = delete_tree_item(Vcb, tp);
1337     if (!NT_SUCCESS(Status)) {
1338         ERR("delete_tree_item returned %08lx\n", Status);
1339         return Status;
1340     }
1341 
1342     if (!c)
1343         c = get_chunk_from_address(Vcb, tp->item->key.obj_id);
1344 
1345     if (c) {
1346         acquire_chunk_lock(c, Vcb);
1347 
1348         c->used -= tp->item->key.offset;
1349 
1350         space_list_add(c, tp->item->key.obj_id, tp->item->key.offset, rollback);
1351 
1352         release_chunk_lock(c, Vcb);
1353     }
1354 
1355     ei = (EXTENT_ITEM*)tp->item->data;
1356     inline_rc = 0;
1357 
1358     len = tp->item->size - sizeof(EXTENT_ITEM);
1359     ptr = (uint8_t*)tp->item->data + sizeof(EXTENT_ITEM);
1360 
1361     while (len > 0) {
1362         uint8_t secttype = *ptr;
1363         uint16_t sectlen = secttype == TYPE_EXTENT_DATA_REF ? sizeof(EXTENT_DATA_REF) : (secttype == TYPE_SHARED_DATA_REF ? sizeof(SHARED_DATA_REF) : 0);
1364 
1365         len--;
1366 
1367         if (sectlen > len) {
1368             ERR("(%I64x,%x,%I64x): %x bytes left, expecting at least %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, len, sectlen);
1369             return STATUS_INTERNAL_ERROR;
1370         }
1371 
1372         if (sectlen == 0) {
1373             ERR("(%I64x,%x,%I64x): unrecognized extent type %x\n", tp->item->key.obj_id, tp->item->key.obj_type, tp->item->key.offset, secttype);
1374             return STATUS_INTERNAL_ERROR;
1375         }
1376 
1377         if (secttype == TYPE_EXTENT_DATA_REF) {
1378             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)(ptr + sizeof(uint8_t));
1379 
1380             inline_rc += edr->count;
1381 
1382             Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, edr, rollback);
1383             if (!NT_SUCCESS(Status)) {
1384                 ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1385                 return Status;
1386             }
1387         } else if (secttype == TYPE_SHARED_DATA_REF) {
1388             metadata_reloc* mr;
1389             data_reloc_ref* ref;
1390 
1391             ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1392             if (!ref) {
1393                 ERR("out of memory\n");
1394                 return STATUS_INSUFFICIENT_RESOURCES;
1395             }
1396 
1397             ref->type = TYPE_SHARED_DATA_REF;
1398             RtlCopyMemory(&ref->sdr, ptr + sizeof(uint8_t), sizeof(SHARED_DATA_REF));
1399             inline_rc += ref->sdr.count;
1400 
1401             Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1402             if (!NT_SUCCESS(Status)) {
1403                 ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1404                 ExFreePool(ref);
1405                 return Status;
1406             }
1407 
1408             ref->parent = mr;
1409 
1410             InsertTailList(&dr->refs, &ref->list_entry);
1411         } else {
1412             ERR("unexpected tree type %x\n", secttype);
1413             return STATUS_INTERNAL_ERROR;
1414         }
1415 
1416 
1417         len -= sectlen;
1418         ptr += sizeof(uint8_t) + sectlen;
1419     }
1420 
1421     if (inline_rc < ei->refcount) { // look for non-inline entries
1422         traverse_ptr tp2 = *tp, next_tp;
1423 
1424         while (find_next_item(Vcb, &tp2, &next_tp, false, NULL)) {
1425             tp2 = next_tp;
1426 
1427             if (tp2.item->key.obj_id == tp->item->key.obj_id) {
1428                 if (tp2.item->key.obj_type == TYPE_EXTENT_DATA_REF && tp2.item->size >= sizeof(EXTENT_DATA_REF)) {
1429                     Status = data_reloc_add_tree_edr(Vcb, metadata_items, dr, (EXTENT_DATA_REF*)tp2.item->data, rollback);
1430                     if (!NT_SUCCESS(Status)) {
1431                         ERR("data_reloc_add_tree_edr returned %08lx\n", Status);
1432                         return Status;
1433                     }
1434 
1435                     Status = delete_tree_item(Vcb, &tp2);
1436                     if (!NT_SUCCESS(Status)) {
1437                         ERR("delete_tree_item returned %08lx\n", Status);
1438                         return Status;
1439                     }
1440                 } else if (tp2.item->key.obj_type == TYPE_SHARED_DATA_REF && tp2.item->size >= sizeof(uint32_t)) {
1441                     metadata_reloc* mr;
1442                     data_reloc_ref* ref;
1443 
1444                     ref = ExAllocatePoolWithTag(PagedPool, sizeof(data_reloc_ref), ALLOC_TAG);
1445                     if (!ref) {
1446                         ERR("out of memory\n");
1447                         return STATUS_INSUFFICIENT_RESOURCES;
1448                     }
1449 
1450                     ref->type = TYPE_SHARED_DATA_REF;
1451                     ref->sdr.offset = tp2.item->key.offset;
1452                     ref->sdr.count = *((uint32_t*)tp2.item->data);
1453 
1454                     Status = add_metadata_reloc_parent(Vcb, metadata_items, ref->sdr.offset, &mr, rollback);
1455                     if (!NT_SUCCESS(Status)) {
1456                         ERR("add_metadata_reloc_parent returned %08lx\n", Status);
1457                         ExFreePool(ref);
1458                         return Status;
1459                     }
1460 
1461                     ref->parent = mr;
1462                     InsertTailList(&dr->refs, &ref->list_entry);
1463 
1464                     Status = delete_tree_item(Vcb, &tp2);
1465                     if (!NT_SUCCESS(Status)) {
1466                         ERR("delete_tree_item returned %08lx\n", Status);
1467                         return Status;
1468                     }
1469                 }
1470             } else
1471                 break;
1472         }
1473     }
1474 
1475     InsertTailList(items, &dr->list_entry);
1476 
1477     return STATUS_SUCCESS;
1478 }
1479 
sort_data_reloc_refs(data_reloc * dr)1480 static void sort_data_reloc_refs(data_reloc* dr) {
1481     LIST_ENTRY newlist, *le;
1482 
1483     if (IsListEmpty(&dr->refs))
1484         return;
1485 
1486     // insertion sort
1487 
1488     InitializeListHead(&newlist);
1489 
1490     while (!IsListEmpty(&dr->refs)) {
1491         data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
1492         bool inserted = false;
1493 
1494         if (ref->type == TYPE_EXTENT_DATA_REF)
1495             ref->hash = get_extent_data_ref_hash2(ref->edr.root, ref->edr.objid, ref->edr.offset);
1496         else if (ref->type == TYPE_SHARED_DATA_REF)
1497             ref->hash = ref->parent->new_address;
1498 
1499         le = newlist.Flink;
1500         while (le != &newlist) {
1501             data_reloc_ref* ref2 = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1502 
1503             if (ref->type < ref2->type || (ref->type == ref2->type && ref->hash > ref2->hash)) {
1504                 InsertHeadList(le->Blink, &ref->list_entry);
1505                 inserted = true;
1506                 break;
1507             }
1508 
1509             le = le->Flink;
1510         }
1511 
1512         if (!inserted)
1513             InsertTailList(&newlist, &ref->list_entry);
1514     }
1515 
1516     le = newlist.Flink;
1517     while (le != &newlist) {
1518         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1519 
1520         if (le->Flink != &newlist) {
1521             data_reloc_ref* ref2 = CONTAINING_RECORD(le->Flink, data_reloc_ref, list_entry);
1522 
1523             if (ref->type == TYPE_EXTENT_DATA_REF && ref2->type == TYPE_EXTENT_DATA_REF && ref->edr.root == ref2->edr.root &&
1524                 ref->edr.objid == ref2->edr.objid && ref->edr.offset == ref2->edr.offset) {
1525                 RemoveEntryList(&ref2->list_entry);
1526                 ref->edr.count += ref2->edr.count;
1527                 ExFreePool(ref2);
1528                 continue;
1529             }
1530         }
1531 
1532         le = le->Flink;
1533     }
1534 
1535     newlist.Flink->Blink = &dr->refs;
1536     newlist.Blink->Flink = &dr->refs;
1537     dr->refs.Flink = newlist.Flink;
1538     dr->refs.Blink = newlist.Blink;
1539 }
1540 
1541 static NTSTATUS add_data_reloc_extent_item(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, data_reloc* dr) {
1542     NTSTATUS Status;
1543     LIST_ENTRY* le;
1544     uint64_t rc = 0;
1545     uint16_t inline_len;
1546     bool all_inline = true;
1547     data_reloc_ref* first_noninline = NULL;
1548     EXTENT_ITEM* ei;
1549     uint8_t* ptr;
1550 
1551     inline_len = sizeof(EXTENT_ITEM);
1552 
1553     sort_data_reloc_refs(dr);
1554 
1555     le = dr->refs.Flink;
1556     while (le != &dr->refs) {
1557         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1558         uint16_t extlen = 0;
1559 
1560         if (ref->type == TYPE_EXTENT_DATA_REF) {
1561             extlen += sizeof(EXTENT_DATA_REF);
1562             rc += ref->edr.count;
1563         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1564             extlen += sizeof(SHARED_DATA_REF);
1565             rc++;
1566         }
1567 
1568         if (all_inline) {
1569             if ((ULONG)(inline_len + 1 + extlen) > (Vcb->superblock.node_size >> 2)) {
1570                 all_inline = false;
1571                 first_noninline = ref;
1572             } else
1573                 inline_len += extlen + 1;
1574         }
1575 
1576         le = le->Flink;
1577     }
1578 
1579     ei = ExAllocatePoolWithTag(PagedPool, inline_len, ALLOC_TAG);
1580     if (!ei) {
1581         ERR("out of memory\n");
1582         return STATUS_INSUFFICIENT_RESOURCES;
1583     }
1584 
1585     ei->refcount = rc;
1586     ei->generation = dr->ei->generation;
1587     ei->flags = dr->ei->flags;
1588     ptr = (uint8_t*)&ei[1];
1589 
1590     le = dr->refs.Flink;
1591     while (le != &dr->refs) {
1592         data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1593 
1594         if (ref == first_noninline)
1595             break;
1596 
1597         *ptr = ref->type;
1598         ptr++;
1599 
1600         if (ref->type == TYPE_EXTENT_DATA_REF) {
1601             EXTENT_DATA_REF* edr = (EXTENT_DATA_REF*)ptr;
1602 
1603             RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1604 
1605             ptr += sizeof(EXTENT_DATA_REF);
1606         } else if (ref->type == TYPE_SHARED_DATA_REF) {
1607             SHARED_DATA_REF* sdr = (SHARED_DATA_REF*)ptr;
1608 
1609             sdr->offset = ref->parent->new_address;
1610             sdr->count = ref->sdr.count;
1611 
1612             ptr += sizeof(SHARED_DATA_REF);
1613         }
1614 
1615         le = le->Flink;
1616     }
1617 
1618     Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_ITEM, dr->size, ei, inline_len, NULL, NULL);
1619     if (!NT_SUCCESS(Status)) {
1620         ERR("insert_tree_item returned %08lx\n", Status);
1621         return Status;
1622     }
1623 
1624     if (!all_inline) {
1625         le = &first_noninline->list_entry;
1626 
1627         while (le != &dr->refs) {
1628             data_reloc_ref* ref = CONTAINING_RECORD(le, data_reloc_ref, list_entry);
1629 
1630             if (ref->type == TYPE_EXTENT_DATA_REF) {
1631                 EXTENT_DATA_REF* edr;
1632 
1633                 edr = ExAllocatePoolWithTag(PagedPool, sizeof(EXTENT_DATA_REF), ALLOC_TAG);
1634                 if (!edr) {
1635                     ERR("out of memory\n");
1636                     return STATUS_INSUFFICIENT_RESOURCES;
1637                 }
1638 
1639                 RtlCopyMemory(edr, &ref->edr, sizeof(EXTENT_DATA_REF));
1640 
1641                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_EXTENT_DATA_REF, ref->hash, edr, sizeof(EXTENT_DATA_REF), NULL, NULL);
1642                 if (!NT_SUCCESS(Status)) {
1643                     ERR("insert_tree_item returned %08lx\n", Status);
1644                     return Status;
1645                 }
1646             } else if (ref->type == TYPE_SHARED_DATA_REF) {
1647                 uint32_t* sdr;
1648 
1649                 sdr = ExAllocatePoolWithTag(PagedPool, sizeof(uint32_t), ALLOC_TAG);
1650                 if (!sdr) {
1651                     ERR("out of memory\n");
1652                     return STATUS_INSUFFICIENT_RESOURCES;
1653                 }
1654 
1655                 *sdr = ref->sdr.count;
1656 
1657                 Status = insert_tree_item(Vcb, Vcb->extent_root, dr->new_address, TYPE_SHARED_DATA_REF, ref->parent->new_address, sdr, sizeof(uint32_t), NULL, NULL);
1658                 if (!NT_SUCCESS(Status)) {
1659                     ERR("insert_tree_item returned %08lx\n", Status);
1660                     return Status;
1661                 }
1662             }
1663 
1664             le = le->Flink;
1665         }
1666     }
1667 
1668     return STATUS_SUCCESS;
1669 }
1670 
balance_data_chunk(device_extension * Vcb,chunk * c,bool * changed)1671 static NTSTATUS balance_data_chunk(device_extension* Vcb, chunk* c, bool* changed) {
1672     KEY searchkey;
1673     traverse_ptr tp;
1674     NTSTATUS Status;
1675     bool b;
1676     LIST_ENTRY items, metadata_items, rollback, *le;
1677     uint64_t loaded = 0, num_loaded = 0;
1678     chunk* newchunk = NULL;
1679     uint8_t* data = NULL;
1680 
1681     TRACE("chunk %I64x\n", c->offset);
1682 
1683     InitializeListHead(&rollback);
1684     InitializeListHead(&items);
1685     InitializeListHead(&metadata_items);
1686 
1687     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
1688 
1689     searchkey.obj_id = c->offset;
1690     searchkey.obj_type = TYPE_EXTENT_ITEM;
1691     searchkey.offset = 0xffffffffffffffff;
1692 
1693     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
1694     if (!NT_SUCCESS(Status)) {
1695         ERR("find_item returned %08lx\n", Status);
1696         goto end;
1697     }
1698 
1699     do {
1700         traverse_ptr next_tp;
1701 
1702         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
1703             break;
1704 
1705         if (tp.item->key.obj_id >= c->offset && tp.item->key.obj_type == TYPE_EXTENT_ITEM) {
1706             bool tree = false;
1707 
1708             if (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->size >= sizeof(EXTENT_ITEM)) {
1709                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
1710 
1711                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
1712                     tree = true;
1713             }
1714 
1715             if (!tree) {
1716                 Status = add_data_reloc(Vcb, &items, &metadata_items, &tp, c, &rollback);
1717 
1718                 if (!NT_SUCCESS(Status)) {
1719                     ERR("add_data_reloc returned %08lx\n", Status);
1720                     goto end;
1721                 }
1722 
1723                 loaded += tp.item->key.offset;
1724                 num_loaded++;
1725 
1726                 if (loaded >= 0x1000000 || num_loaded >= 100) // only do so much at a time, so we don't block too obnoxiously
1727                     break;
1728             }
1729         }
1730 
1731         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
1732 
1733         if (b)
1734             tp = next_tp;
1735     } while (b);
1736 
1737     if (IsListEmpty(&items)) {
1738         *changed = false;
1739         Status = STATUS_SUCCESS;
1740         goto end;
1741     } else
1742         *changed = true;
1743 
1744     data = ExAllocatePoolWithTag(PagedPool, BALANCE_UNIT, ALLOC_TAG);
1745     if (!data) {
1746         ERR("out of memory\n");
1747         Status = STATUS_INSUFFICIENT_RESOURCES;
1748         goto end;
1749     }
1750 
1751     le = items.Flink;
1752     while (le != &items) {
1753         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
1754         bool done = false;
1755         LIST_ENTRY* le2;
1756         void* csum;
1757         RTL_BITMAP bmp;
1758         ULONG* bmparr;
1759         ULONG bmplen, runlength, index, lastoff;
1760 
1761         if (newchunk) {
1762             acquire_chunk_lock(newchunk, Vcb);
1763 
1764             if (find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1765                 newchunk->used += dr->size;
1766                 space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1767                 done = true;
1768             }
1769 
1770             release_chunk_lock(newchunk, Vcb);
1771         }
1772 
1773         if (!done) {
1774             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
1775 
1776             le2 = Vcb->chunks.Flink;
1777             while (le2 != &Vcb->chunks) {
1778                 chunk* c2 = CONTAINING_RECORD(le2, chunk, list_entry);
1779 
1780                 if (!c2->readonly && !c2->reloc && c2 != newchunk && c2->chunk_item->type == Vcb->data_flags) {
1781                     acquire_chunk_lock(c2, Vcb);
1782 
1783                     if ((c2->chunk_item->size - c2->used) >= dr->size) {
1784                         if (find_data_address_in_chunk(Vcb, c2, dr->size, &dr->new_address)) {
1785                             c2->used += dr->size;
1786                             space_list_subtract(c2, dr->new_address, dr->size, &rollback);
1787                             release_chunk_lock(c2, Vcb);
1788                             newchunk = c2;
1789                             done = true;
1790                             break;
1791                         }
1792                     }
1793 
1794                     release_chunk_lock(c2, Vcb);
1795                 }
1796 
1797                 le2 = le2->Flink;
1798             }
1799 
1800             // allocate new chunk if necessary
1801             if (!done) {
1802                 Status = alloc_chunk(Vcb, Vcb->data_flags, &newchunk, false);
1803 
1804                 if (!NT_SUCCESS(Status)) {
1805                     ERR("alloc_chunk returned %08lx\n", Status);
1806                     ExReleaseResourceLite(&Vcb->chunk_lock);
1807                     goto end;
1808                 }
1809 
1810                 acquire_chunk_lock(newchunk, Vcb);
1811 
1812                 newchunk->balance_num = Vcb->balance.balance_num;
1813 
1814                 if (!find_data_address_in_chunk(Vcb, newchunk, dr->size, &dr->new_address)) {
1815                     release_chunk_lock(newchunk, Vcb);
1816                     ExReleaseResourceLite(&Vcb->chunk_lock);
1817                     ERR("could not find address in new chunk\n");
1818                     Status = STATUS_DISK_FULL;
1819                     goto end;
1820                 } else {
1821                     newchunk->used += dr->size;
1822                     space_list_subtract(newchunk, dr->new_address, dr->size, &rollback);
1823                 }
1824 
1825                 release_chunk_lock(newchunk, Vcb);
1826             }
1827 
1828             ExReleaseResourceLite(&Vcb->chunk_lock);
1829         }
1830 
1831         dr->newchunk = newchunk;
1832 
1833         bmplen = (ULONG)(dr->size >> Vcb->sector_shift);
1834 
1835         bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(bmplen + 1, sizeof(ULONG)), ALLOC_TAG);
1836         if (!bmparr) {
1837             ERR("out of memory\n");
1838             Status = STATUS_INSUFFICIENT_RESOURCES;
1839             goto end;
1840         }
1841 
1842         csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((dr->size * Vcb->csum_size) >> Vcb->sector_shift), ALLOC_TAG);
1843         if (!csum) {
1844             ERR("out of memory\n");
1845             ExFreePool(bmparr);
1846             Status = STATUS_INSUFFICIENT_RESOURCES;
1847             goto end;
1848         }
1849 
1850         RtlInitializeBitMap(&bmp, bmparr, bmplen);
1851         RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
1852 
1853         searchkey.obj_id = EXTENT_CSUM_ID;
1854         searchkey.obj_type = TYPE_EXTENT_CSUM;
1855         searchkey.offset = dr->address;
1856 
1857         Status = find_item(Vcb, Vcb->checksum_root, &tp, &searchkey, false, NULL);
1858         if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
1859             ERR("find_item returned %08lx\n", Status);
1860             ExFreePool(csum);
1861             ExFreePool(bmparr);
1862             goto end;
1863         }
1864 
1865         if (Status != STATUS_NOT_FOUND) {
1866             do {
1867                 traverse_ptr next_tp;
1868 
1869                 if (tp.item->key.obj_type == TYPE_EXTENT_CSUM) {
1870                     if (tp.item->key.offset >= dr->address + dr->size)
1871                         break;
1872                     else if (tp.item->size >= Vcb->csum_size && tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size) >= dr->address) {
1873                         uint64_t cs = max(dr->address, tp.item->key.offset);
1874                         uint64_t ce = min(dr->address + dr->size, tp.item->key.offset + (((unsigned int)tp.item->size << Vcb->sector_shift) / Vcb->csum_size));
1875 
1876                         RtlCopyMemory((uint8_t*)csum + (((cs - dr->address) * Vcb->csum_size) >> Vcb->sector_shift),
1877                                       tp.item->data + (((cs - tp.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
1878                                       (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift));
1879 
1880                         RtlClearBits(&bmp, (ULONG)((cs - dr->address) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift));
1881 
1882                         if (ce == dr->address + dr->size)
1883                             break;
1884                     }
1885                 }
1886 
1887                 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
1888                     tp = next_tp;
1889                 else
1890                     break;
1891             } while (true);
1892         }
1893 
1894         lastoff = 0;
1895         runlength = RtlFindFirstRunClear(&bmp, &index);
1896 
1897         while (runlength != 0) {
1898             if (index >= bmplen)
1899                 break;
1900 
1901             if (index + runlength >= bmplen) {
1902                 runlength = bmplen - index;
1903 
1904                 if (runlength == 0)
1905                     break;
1906             }
1907 
1908             if (index > lastoff) {
1909                 ULONG off = lastoff;
1910                 ULONG size = index - lastoff;
1911 
1912                 // handle no csum run
1913                 do {
1914                     ULONG rl;
1915 
1916                     if (size << Vcb->sector_shift > BALANCE_UNIT)
1917                         rl = BALANCE_UNIT >> Vcb->sector_shift;
1918                     else
1919                         rl = size;
1920 
1921                     Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1922                                        c, NULL, NULL, 0, false, NormalPagePriority);
1923                     if (!NT_SUCCESS(Status)) {
1924                         ERR("read_data returned %08lx\n", Status);
1925                         ExFreePool(csum);
1926                         ExFreePool(bmparr);
1927                         goto end;
1928                     }
1929 
1930                     Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1931                                                  NULL, newchunk, false, 0, NormalPagePriority);
1932                     if (!NT_SUCCESS(Status)) {
1933                         ERR("write_data_complete returned %08lx\n", Status);
1934                         ExFreePool(csum);
1935                         ExFreePool(bmparr);
1936                         goto end;
1937                     }
1938 
1939                     size -= rl;
1940                     off += rl;
1941                 } while (size > 0);
1942             }
1943 
1944             add_checksum_entry(Vcb, dr->new_address + (index << Vcb->sector_shift), runlength, (uint8_t*)csum + (index * Vcb->csum_size), NULL);
1945             add_checksum_entry(Vcb, dr->address + (index << Vcb->sector_shift), runlength, NULL, NULL);
1946 
1947             // handle csum run
1948             do {
1949                 ULONG rl;
1950 
1951                 if (runlength << Vcb->sector_shift > BALANCE_UNIT)
1952                     rl = BALANCE_UNIT >> Vcb->sector_shift;
1953                 else
1954                     rl = runlength;
1955 
1956                 Status = read_data(Vcb, dr->address + (index << Vcb->sector_shift), rl << Vcb->sector_shift,
1957                                    (uint8_t*)csum + (index * Vcb->csum_size), false, data, c, NULL, NULL, 0, false, NormalPagePriority);
1958                 if (!NT_SUCCESS(Status)) {
1959                     ERR("read_data returned %08lx\n", Status);
1960                     ExFreePool(csum);
1961                     ExFreePool(bmparr);
1962                     goto end;
1963                 }
1964 
1965                 Status = write_data_complete(Vcb, dr->new_address + (index << Vcb->sector_shift), data, rl << Vcb->sector_shift,
1966                                              NULL, newchunk, false, 0, NormalPagePriority);
1967                 if (!NT_SUCCESS(Status)) {
1968                     ERR("write_data_complete returned %08lx\n", Status);
1969                     ExFreePool(csum);
1970                     ExFreePool(bmparr);
1971                     goto end;
1972                 }
1973 
1974                 runlength -= rl;
1975                 index += rl;
1976             } while (runlength > 0);
1977 
1978             lastoff = index;
1979             runlength = RtlFindNextForwardRunClear(&bmp, index, &index);
1980         }
1981 
1982         ExFreePool(csum);
1983         ExFreePool(bmparr);
1984 
1985         // handle final nocsum run
1986         if (lastoff < dr->size >> Vcb->sector_shift) {
1987             ULONG off = lastoff;
1988             ULONG size = (ULONG)((dr->size >> Vcb->sector_shift) - lastoff);
1989 
1990             do {
1991                 ULONG rl;
1992 
1993                 if (size << Vcb->sector_shift > BALANCE_UNIT)
1994                     rl = BALANCE_UNIT >> Vcb->sector_shift;
1995                 else
1996                     rl = size;
1997 
1998                 Status = read_data(Vcb, dr->address + (off << Vcb->sector_shift), rl << Vcb->sector_shift, NULL, false, data,
1999                                    c, NULL, NULL, 0, false, NormalPagePriority);
2000                 if (!NT_SUCCESS(Status)) {
2001                     ERR("read_data returned %08lx\n", Status);
2002                     goto end;
2003                 }
2004 
2005                 Status = write_data_complete(Vcb, dr->new_address + (off << Vcb->sector_shift), data, rl << Vcb->sector_shift,
2006                                              NULL, newchunk, false, 0, NormalPagePriority);
2007                 if (!NT_SUCCESS(Status)) {
2008                     ERR("write_data_complete returned %08lx\n", Status);
2009                     goto end;
2010                 }
2011 
2012                 size -= rl;
2013                 off += rl;
2014             } while (size > 0);
2015         }
2016 
2017         le = le->Flink;
2018     }
2019 
2020     ExFreePool(data);
2021     data = NULL;
2022 
2023     Status = write_metadata_items(Vcb, &metadata_items, &items, NULL, &rollback);
2024     if (!NT_SUCCESS(Status)) {
2025         ERR("write_metadata_items returned %08lx\n", Status);
2026         goto end;
2027     }
2028 
2029     le = items.Flink;
2030     while (le != &items) {
2031         data_reloc* dr = CONTAINING_RECORD(le, data_reloc, list_entry);
2032 
2033         Status = add_data_reloc_extent_item(Vcb, dr);
2034         if (!NT_SUCCESS(Status)) {
2035             ERR("add_data_reloc_extent_item returned %08lx\n", Status);
2036             goto end;
2037         }
2038 
2039         le = le->Flink;
2040     }
2041 
2042     le = c->changed_extents.Flink;
2043     while (le != &c->changed_extents) {
2044         LIST_ENTRY *le2, *le3;
2045         changed_extent* ce = CONTAINING_RECORD(le, changed_extent, list_entry);
2046 
2047         le3 = le->Flink;
2048 
2049         le2 = items.Flink;
2050         while (le2 != &items) {
2051             data_reloc* dr = CONTAINING_RECORD(le2, data_reloc, list_entry);
2052 
2053             if (ce->address == dr->address) {
2054                 ce->address = dr->new_address;
2055                 RemoveEntryList(&ce->list_entry);
2056                 InsertTailList(&dr->newchunk->changed_extents, &ce->list_entry);
2057                 break;
2058             }
2059 
2060             le2 = le2->Flink;
2061         }
2062 
2063         le = le3;
2064     }
2065 
2066     Status = STATUS_SUCCESS;
2067 
2068     Vcb->need_write = true;
2069 
2070 end:
2071     if (NT_SUCCESS(Status)) {
2072         // update extents in cache inodes before we flush
2073         le = Vcb->chunks.Flink;
2074         while (le != &Vcb->chunks) {
2075             chunk* c2 = CONTAINING_RECORD(le, chunk, list_entry);
2076 
2077             if (c2->cache) {
2078                 LIST_ENTRY* le2;
2079 
2080                 ExAcquireResourceExclusiveLite(c2->cache->Header.Resource, true);
2081 
2082                 le2 = c2->cache->extents.Flink;
2083                 while (le2 != &c2->cache->extents) {
2084                     extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2085 
2086                     if (!ext->ignore) {
2087                         if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2088                             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2089 
2090                             if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2091                                 LIST_ENTRY* le3 = items.Flink;
2092                                 while (le3 != &items) {
2093                                     data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2094 
2095                                     if (ed2->address == dr->address) {
2096                                         ed2->address = dr->new_address;
2097                                         break;
2098                                     }
2099 
2100                                     le3 = le3->Flink;
2101                                 }
2102                             }
2103                         }
2104                     }
2105 
2106                     le2 = le2->Flink;
2107                 }
2108 
2109                 ExReleaseResourceLite(c2->cache->Header.Resource);
2110             }
2111 
2112             le = le->Flink;
2113         }
2114 
2115         Status = do_write(Vcb, NULL);
2116         if (!NT_SUCCESS(Status))
2117             ERR("do_write returned %08lx\n", Status);
2118     }
2119 
2120     if (NT_SUCCESS(Status)) {
2121         clear_rollback(&rollback);
2122 
2123         // update open FCBs
2124         // FIXME - speed this up(?)
2125 
2126         le = Vcb->all_fcbs.Flink;
2127         while (le != &Vcb->all_fcbs) {
2128             struct _fcb* fcb = CONTAINING_RECORD(le, struct _fcb, list_entry_all);
2129             LIST_ENTRY* le2;
2130 
2131             ExAcquireResourceExclusiveLite(fcb->Header.Resource, true);
2132 
2133             le2 = fcb->extents.Flink;
2134             while (le2 != &fcb->extents) {
2135                 extent* ext = CONTAINING_RECORD(le2, extent, list_entry);
2136 
2137                 if (!ext->ignore) {
2138                     if (ext->extent_data.type == EXTENT_TYPE_REGULAR || ext->extent_data.type == EXTENT_TYPE_PREALLOC) {
2139                         EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ext->extent_data.data;
2140 
2141                         if (ed2->size > 0 && ed2->address >= c->offset && ed2->address < c->offset + c->chunk_item->size) {
2142                             LIST_ENTRY* le3 = items.Flink;
2143                             while (le3 != &items) {
2144                                 data_reloc* dr = CONTAINING_RECORD(le3, data_reloc, list_entry);
2145 
2146                                 if (ed2->address == dr->address) {
2147                                     ed2->address = dr->new_address;
2148                                     break;
2149                                 }
2150 
2151                                 le3 = le3->Flink;
2152                             }
2153                         }
2154                     }
2155                 }
2156 
2157                 le2 = le2->Flink;
2158             }
2159 
2160             ExReleaseResourceLite(fcb->Header.Resource);
2161 
2162             le = le->Flink;
2163         }
2164     } else
2165         do_rollback(Vcb, &rollback);
2166 
2167     free_trees(Vcb);
2168 
2169     ExReleaseResourceLite(&Vcb->tree_lock);
2170 
2171     if (data)
2172         ExFreePool(data);
2173 
2174     while (!IsListEmpty(&items)) {
2175         data_reloc* dr = CONTAINING_RECORD(RemoveHeadList(&items), data_reloc, list_entry);
2176 
2177         while (!IsListEmpty(&dr->refs)) {
2178             data_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&dr->refs), data_reloc_ref, list_entry);
2179 
2180             ExFreePool(ref);
2181         }
2182 
2183         ExFreePool(dr);
2184     }
2185 
2186     while (!IsListEmpty(&metadata_items)) {
2187         metadata_reloc* mr = CONTAINING_RECORD(RemoveHeadList(&metadata_items), metadata_reloc, list_entry);
2188 
2189         while (!IsListEmpty(&mr->refs)) {
2190             metadata_reloc_ref* ref = CONTAINING_RECORD(RemoveHeadList(&mr->refs), metadata_reloc_ref, list_entry);
2191 
2192             ExFreePool(ref);
2193         }
2194 
2195         if (mr->data)
2196             ExFreePool(mr->data);
2197 
2198         ExFreePool(mr);
2199     }
2200 
2201     return Status;
2202 }
2203 
get_chunk_dup_type(chunk * c)2204 static __inline uint64_t get_chunk_dup_type(chunk* c) {
2205     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2206         return BLOCK_FLAG_RAID0;
2207     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2208         return BLOCK_FLAG_RAID1;
2209     else if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2210         return BLOCK_FLAG_DUPLICATE;
2211     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2212         return BLOCK_FLAG_RAID10;
2213     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2214         return BLOCK_FLAG_RAID5;
2215     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2216         return BLOCK_FLAG_RAID6;
2217     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2218         return BLOCK_FLAG_RAID1C3;
2219     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2220         return BLOCK_FLAG_RAID1C4;
2221     else
2222         return BLOCK_FLAG_SINGLE;
2223 }
2224 
should_balance_chunk(device_extension * Vcb,uint8_t sort,chunk * c)2225 static bool should_balance_chunk(device_extension* Vcb, uint8_t sort, chunk* c) {
2226     btrfs_balance_opts* opts;
2227 
2228     opts = &Vcb->balance.opts[sort];
2229 
2230     if (!(opts->flags & BTRFS_BALANCE_OPTS_ENABLED))
2231         return false;
2232 
2233     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2234         uint64_t type = get_chunk_dup_type(c);
2235 
2236         if (!(type & opts->profiles))
2237             return false;
2238     }
2239 
2240     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2241         uint16_t i;
2242         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2243         bool b = false;
2244 
2245         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2246             if (cis[i].dev_id == opts->devid) {
2247                 b = true;
2248                 break;
2249             }
2250         }
2251 
2252         if (!b)
2253             return false;
2254     }
2255 
2256     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2257         uint16_t i, factor;
2258         uint64_t physsize;
2259         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2260         bool b = false;
2261 
2262         if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2263             factor = c->chunk_item->num_stripes;
2264         else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2265             factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
2266         else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
2267             factor = c->chunk_item->num_stripes - 1;
2268         else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2269             factor = c->chunk_item->num_stripes - 2;
2270         else // SINGLE, DUPLICATE, RAID1, RAID1C3, RAID1C4
2271             factor = 1;
2272 
2273         physsize = c->chunk_item->size / factor;
2274 
2275         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2276             if (cis[i].offset < opts->drange_end && cis[i].offset + physsize >= opts->drange_start &&
2277                 (!(opts->flags & BTRFS_BALANCE_OPTS_DEVID) || cis[i].dev_id == opts->devid)) {
2278                 b = true;
2279                 break;
2280             }
2281         }
2282 
2283         if (!b)
2284             return false;
2285     }
2286 
2287     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2288         if (c->offset + c->chunk_item->size <= opts->vrange_start || c->offset > opts->vrange_end)
2289             return false;
2290     }
2291 
2292     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2293         if (c->chunk_item->num_stripes < opts->stripes_start || c->chunk_item->num_stripes < opts->stripes_end)
2294             return false;
2295     }
2296 
2297     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2298         uint64_t usage = c->used * 100 / c->chunk_item->size;
2299 
2300         // usage == 0 should mean completely empty, not just that usage rounds to 0%
2301         if (c->used > 0 && usage == 0)
2302             usage = 1;
2303 
2304         if (usage < opts->usage_start || usage > opts->usage_end)
2305             return false;
2306     }
2307 
2308     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT && opts->flags & BTRFS_BALANCE_OPTS_SOFT) {
2309         uint64_t type = get_chunk_dup_type(c);
2310 
2311         if (type == opts->convert)
2312             return false;
2313     }
2314 
2315     return true;
2316 }
2317 
copy_balance_args(btrfs_balance_opts * opts,BALANCE_ARGS * args)2318 static void copy_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2319     if (opts->flags & BTRFS_BALANCE_OPTS_PROFILES) {
2320         args->profiles = opts->profiles;
2321         args->flags |= BALANCE_ARGS_FLAGS_PROFILES;
2322     }
2323 
2324     if (opts->flags & BTRFS_BALANCE_OPTS_USAGE) {
2325         if (args->usage_start == 0) {
2326             args->flags |= BALANCE_ARGS_FLAGS_USAGE_RANGE;
2327             args->usage_start = opts->usage_start;
2328             args->usage_end = opts->usage_end;
2329         } else {
2330             args->flags |= BALANCE_ARGS_FLAGS_USAGE;
2331             args->usage = opts->usage_end;
2332         }
2333     }
2334 
2335     if (opts->flags & BTRFS_BALANCE_OPTS_DEVID) {
2336         args->devid = opts->devid;
2337         args->flags |= BALANCE_ARGS_FLAGS_DEVID;
2338     }
2339 
2340     if (opts->flags & BTRFS_BALANCE_OPTS_DRANGE) {
2341         args->drange_start = opts->drange_start;
2342         args->drange_end = opts->drange_end;
2343         args->flags |= BALANCE_ARGS_FLAGS_DRANGE;
2344     }
2345 
2346     if (opts->flags & BTRFS_BALANCE_OPTS_VRANGE) {
2347         args->vrange_start = opts->vrange_start;
2348         args->vrange_end = opts->vrange_end;
2349         args->flags |= BALANCE_ARGS_FLAGS_VRANGE;
2350     }
2351 
2352     if (opts->flags & BTRFS_BALANCE_OPTS_CONVERT) {
2353         args->convert = opts->convert;
2354         args->flags |= BALANCE_ARGS_FLAGS_CONVERT;
2355 
2356         if (opts->flags & BTRFS_BALANCE_OPTS_SOFT)
2357             args->flags |= BALANCE_ARGS_FLAGS_SOFT;
2358     }
2359 
2360     if (opts->flags & BTRFS_BALANCE_OPTS_LIMIT) {
2361         if (args->limit_start == 0) {
2362             args->flags |= BALANCE_ARGS_FLAGS_LIMIT_RANGE;
2363             args->limit_start = (uint32_t)opts->limit_start;
2364             args->limit_end = (uint32_t)opts->limit_end;
2365         } else {
2366             args->flags |= BALANCE_ARGS_FLAGS_LIMIT;
2367             args->limit = opts->limit_end;
2368         }
2369     }
2370 
2371     if (opts->flags & BTRFS_BALANCE_OPTS_STRIPES) {
2372         args->stripes_start = opts->stripes_start;
2373         args->stripes_end = opts->stripes_end;
2374         args->flags |= BALANCE_ARGS_FLAGS_STRIPES_RANGE;
2375     }
2376 }
2377 
add_balance_item(device_extension * Vcb)2378 static NTSTATUS add_balance_item(device_extension* Vcb) {
2379     KEY searchkey;
2380     traverse_ptr tp;
2381     NTSTATUS Status;
2382     BALANCE_ITEM* bi;
2383 
2384     searchkey.obj_id = BALANCE_ITEM_ID;
2385     searchkey.obj_type = TYPE_TEMP_ITEM;
2386     searchkey.offset = 0;
2387 
2388     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2389 
2390     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2391     if (!NT_SUCCESS(Status)) {
2392         ERR("find_item returned %08lx\n", Status);
2393         goto end;
2394     }
2395 
2396     if (!keycmp(tp.item->key, searchkey)) {
2397         Status = delete_tree_item(Vcb, &tp);
2398         if (!NT_SUCCESS(Status)) {
2399             ERR("delete_tree_item returned %08lx\n", Status);
2400             goto end;
2401         }
2402     }
2403 
2404     bi = ExAllocatePoolWithTag(PagedPool, sizeof(BALANCE_ITEM), ALLOC_TAG);
2405     if (!bi) {
2406         ERR("out of memory\n");
2407         Status = STATUS_INSUFFICIENT_RESOURCES;
2408         goto end;
2409     }
2410 
2411     RtlZeroMemory(bi, sizeof(BALANCE_ITEM));
2412 
2413     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2414         bi->flags |= BALANCE_FLAGS_DATA;
2415         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
2416     }
2417 
2418     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2419         bi->flags |= BALANCE_FLAGS_METADATA;
2420         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
2421     }
2422 
2423     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED) {
2424         bi->flags |= BALANCE_FLAGS_SYSTEM;
2425         copy_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
2426     }
2427 
2428     Status = insert_tree_item(Vcb, Vcb->root_root, BALANCE_ITEM_ID, TYPE_TEMP_ITEM, 0, bi, sizeof(BALANCE_ITEM), NULL, NULL);
2429     if (!NT_SUCCESS(Status)) {
2430         ERR("insert_tree_item returned %08lx\n", Status);
2431         ExFreePool(bi);
2432         goto end;
2433     }
2434 
2435     Status = STATUS_SUCCESS;
2436 
2437 end:
2438     if (NT_SUCCESS(Status)) {
2439         Status = do_write(Vcb, NULL);
2440         if (!NT_SUCCESS(Status))
2441             ERR("do_write returned %08lx\n", Status);
2442     }
2443 
2444     free_trees(Vcb);
2445 
2446     ExReleaseResourceLite(&Vcb->tree_lock);
2447 
2448     return Status;
2449 }
2450 
remove_balance_item(device_extension * Vcb)2451 static NTSTATUS remove_balance_item(device_extension* Vcb) {
2452     KEY searchkey;
2453     traverse_ptr tp;
2454     NTSTATUS Status;
2455 
2456     searchkey.obj_id = BALANCE_ITEM_ID;
2457     searchkey.obj_type = TYPE_TEMP_ITEM;
2458     searchkey.offset = 0;
2459 
2460     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
2461 
2462     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
2463     if (!NT_SUCCESS(Status)) {
2464         ERR("find_item returned %08lx\n", Status);
2465         goto end;
2466     }
2467 
2468     if (!keycmp(tp.item->key, searchkey)) {
2469         Status = delete_tree_item(Vcb, &tp);
2470         if (!NT_SUCCESS(Status)) {
2471             ERR("delete_tree_item returned %08lx\n", Status);
2472             goto end;
2473         }
2474 
2475         Status = do_write(Vcb, NULL);
2476         if (!NT_SUCCESS(Status)) {
2477             ERR("do_write returned %08lx\n", Status);
2478             goto end;
2479         }
2480 
2481         free_trees(Vcb);
2482     }
2483 
2484     Status = STATUS_SUCCESS;
2485 
2486 end:
2487     ExReleaseResourceLite(&Vcb->tree_lock);
2488 
2489     return Status;
2490 }
2491 
load_balance_args(btrfs_balance_opts * opts,BALANCE_ARGS * args)2492 static void load_balance_args(btrfs_balance_opts* opts, BALANCE_ARGS* args) {
2493     opts->flags = BTRFS_BALANCE_OPTS_ENABLED;
2494 
2495     if (args->flags & BALANCE_ARGS_FLAGS_PROFILES) {
2496         opts->flags |= BTRFS_BALANCE_OPTS_PROFILES;
2497         opts->profiles = args->profiles;
2498     }
2499 
2500     if (args->flags & BALANCE_ARGS_FLAGS_USAGE) {
2501         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2502 
2503         opts->usage_start = 0;
2504         opts->usage_end = (uint8_t)args->usage;
2505     } else if (args->flags & BALANCE_ARGS_FLAGS_USAGE_RANGE) {
2506         opts->flags |= BTRFS_BALANCE_OPTS_USAGE;
2507 
2508         opts->usage_start = (uint8_t)args->usage_start;
2509         opts->usage_end = (uint8_t)args->usage_end;
2510     }
2511 
2512     if (args->flags & BALANCE_ARGS_FLAGS_DEVID) {
2513         opts->flags |= BTRFS_BALANCE_OPTS_DEVID;
2514         opts->devid = args->devid;
2515     }
2516 
2517     if (args->flags & BALANCE_ARGS_FLAGS_DRANGE) {
2518         opts->flags |= BTRFS_BALANCE_OPTS_DRANGE;
2519         opts->drange_start = args->drange_start;
2520         opts->drange_end = args->drange_end;
2521     }
2522 
2523     if (args->flags & BALANCE_ARGS_FLAGS_VRANGE) {
2524         opts->flags |= BTRFS_BALANCE_OPTS_VRANGE;
2525         opts->vrange_start = args->vrange_start;
2526         opts->vrange_end = args->vrange_end;
2527     }
2528 
2529     if (args->flags & BALANCE_ARGS_FLAGS_LIMIT) {
2530         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2531 
2532         opts->limit_start = 0;
2533         opts->limit_end = args->limit;
2534     } else if (args->flags & BALANCE_ARGS_FLAGS_LIMIT_RANGE) {
2535         opts->flags |= BTRFS_BALANCE_OPTS_LIMIT;
2536 
2537         opts->limit_start = args->limit_start;
2538         opts->limit_end = args->limit_end;
2539     }
2540 
2541     if (args->flags & BALANCE_ARGS_FLAGS_STRIPES_RANGE) {
2542         opts->flags |= BTRFS_BALANCE_OPTS_STRIPES;
2543 
2544         opts->stripes_start = (uint16_t)args->stripes_start;
2545         opts->stripes_end = (uint16_t)args->stripes_end;
2546     }
2547 
2548     if (args->flags & BALANCE_ARGS_FLAGS_CONVERT) {
2549         opts->flags |= BTRFS_BALANCE_OPTS_CONVERT;
2550         opts->convert = args->convert;
2551 
2552         if (args->flags & BALANCE_ARGS_FLAGS_SOFT)
2553             opts->flags |= BTRFS_BALANCE_OPTS_SOFT;
2554     }
2555 }
2556 
remove_superblocks(device * dev)2557 static NTSTATUS remove_superblocks(device* dev) {
2558     NTSTATUS Status;
2559     superblock* sb;
2560     int i = 0;
2561 
2562     sb = ExAllocatePoolWithTag(PagedPool, sizeof(superblock), ALLOC_TAG);
2563     if (!sb) {
2564         ERR("out of memory\n");
2565         return STATUS_INSUFFICIENT_RESOURCES;
2566     }
2567 
2568     RtlZeroMemory(sb, sizeof(superblock));
2569 
2570     while (superblock_addrs[i] > 0 && dev->devitem.num_bytes >= superblock_addrs[i] + sizeof(superblock)) {
2571         Status = write_data_phys(dev->devobj, dev->fileobj, superblock_addrs[i], sb, sizeof(superblock));
2572 
2573         if (!NT_SUCCESS(Status)) {
2574             ExFreePool(sb);
2575             return Status;
2576         }
2577 
2578         i++;
2579     }
2580 
2581     ExFreePool(sb);
2582 
2583     return STATUS_SUCCESS;
2584 }
2585 
2586 static NTSTATUS finish_removing_device(_Requires_exclusive_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2587     KEY searchkey;
2588     traverse_ptr tp;
2589     NTSTATUS Status;
2590     LIST_ENTRY* le;
2591     volume_device_extension* vde;
2592 
2593     if (Vcb->need_write) {
2594         Status = do_write(Vcb, NULL);
2595 
2596         if (!NT_SUCCESS(Status))
2597             ERR("do_write returned %08lx\n", Status);
2598     } else
2599         Status = STATUS_SUCCESS;
2600 
2601     free_trees(Vcb);
2602 
2603     if (!NT_SUCCESS(Status))
2604         return Status;
2605 
2606     // remove entry in chunk tree
2607 
2608     searchkey.obj_id = 1;
2609     searchkey.obj_type = TYPE_DEV_ITEM;
2610     searchkey.offset = dev->devitem.dev_id;
2611 
2612     Status = find_item(Vcb, Vcb->chunk_root, &tp, &searchkey, false, NULL);
2613     if (!NT_SUCCESS(Status)) {
2614         ERR("find_item returned %08lx\n", Status);
2615         return Status;
2616     }
2617 
2618     if (!keycmp(searchkey, tp.item->key)) {
2619         Status = delete_tree_item(Vcb, &tp);
2620 
2621         if (!NT_SUCCESS(Status)) {
2622             ERR("delete_tree_item returned %08lx\n", Status);
2623             return Status;
2624         }
2625     }
2626 
2627     // remove stats entry in device tree
2628 
2629     searchkey.obj_id = 0;
2630     searchkey.obj_type = TYPE_DEV_STATS;
2631     searchkey.offset = dev->devitem.dev_id;
2632 
2633     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2634     if (!NT_SUCCESS(Status)) {
2635         ERR("find_item returned %08lx\n", Status);
2636         return Status;
2637     }
2638 
2639     if (!keycmp(searchkey, tp.item->key)) {
2640         Status = delete_tree_item(Vcb, &tp);
2641 
2642         if (!NT_SUCCESS(Status)) {
2643             ERR("delete_tree_item returned %08lx\n", Status);
2644             return Status;
2645         }
2646     }
2647 
2648     // update superblock
2649 
2650     Vcb->superblock.num_devices--;
2651     Vcb->superblock.total_bytes -= dev->devitem.num_bytes;
2652     Vcb->devices_loaded--;
2653 
2654     RemoveEntryList(&dev->list_entry);
2655 
2656     // flush
2657 
2658     Status = do_write(Vcb, NULL);
2659     if (!NT_SUCCESS(Status))
2660         ERR("do_write returned %08lx\n", Status);
2661 
2662     free_trees(Vcb);
2663 
2664     if (!NT_SUCCESS(Status))
2665         return Status;
2666 
2667     if (!dev->readonly && dev->devobj) {
2668         Status = remove_superblocks(dev);
2669         if (!NT_SUCCESS(Status))
2670             WARN("remove_superblocks returned %08lx\n", Status);
2671     }
2672 
2673     // remove entry in volume list
2674 
2675     vde = Vcb->vde;
2676 
2677     if (dev->devobj) {
2678         pdo_device_extension* pdode = vde->pdode;
2679 
2680         ExAcquireResourceExclusiveLite(&pdode->child_lock, true);
2681 
2682         le = pdode->children.Flink;
2683         while (le != &pdode->children) {
2684             volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2685 
2686             if (RtlCompareMemory(&dev->devitem.device_uuid, &vc->uuid, sizeof(BTRFS_UUID)) == sizeof(BTRFS_UUID)) {
2687                 PFILE_OBJECT FileObject;
2688                 PDEVICE_OBJECT mountmgr;
2689                 UNICODE_STRING mmdevpath;
2690 
2691                 pdode->children_loaded--;
2692 
2693                 if (vc->had_drive_letter) { // re-add entry to mountmgr
2694                     RtlInitUnicodeString(&mmdevpath, MOUNTMGR_DEVICE_NAME);
2695                     Status = IoGetDeviceObjectPointer(&mmdevpath, FILE_READ_ATTRIBUTES, &FileObject, &mountmgr);
2696                     if (!NT_SUCCESS(Status))
2697                         ERR("IoGetDeviceObjectPointer returned %08lx\n", Status);
2698                     else {
2699                         MOUNTDEV_NAME mdn;
2700 
2701                         Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, &mdn, sizeof(MOUNTDEV_NAME), true, NULL);
2702                         if (!NT_SUCCESS(Status) && Status != STATUS_BUFFER_OVERFLOW)
2703                             ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2704                         else {
2705                             MOUNTDEV_NAME* mdn2;
2706                             ULONG mdnsize = (ULONG)offsetof(MOUNTDEV_NAME, Name[0]) + mdn.NameLength;
2707 
2708                             mdn2 = ExAllocatePoolWithTag(PagedPool, mdnsize, ALLOC_TAG);
2709                             if (!mdn2)
2710                                 ERR("out of memory\n");
2711                             else {
2712                                 Status = dev_ioctl(dev->devobj, IOCTL_MOUNTDEV_QUERY_DEVICE_NAME, NULL, 0, mdn2, mdnsize, true, NULL);
2713                                 if (!NT_SUCCESS(Status))
2714                                     ERR("IOCTL_MOUNTDEV_QUERY_DEVICE_NAME returned %08lx\n", Status);
2715                                 else {
2716                                     UNICODE_STRING name;
2717 
2718                                     name.Buffer = mdn2->Name;
2719                                     name.Length = name.MaximumLength = mdn2->NameLength;
2720 
2721                                     Status = mountmgr_add_drive_letter(mountmgr, &name);
2722                                     if (!NT_SUCCESS(Status))
2723                                         WARN("mountmgr_add_drive_letter returned %08lx\n", Status);
2724                                 }
2725 
2726                                 ExFreePool(mdn2);
2727                             }
2728                         }
2729 
2730                         ObDereferenceObject(FileObject);
2731                     }
2732                 }
2733 
2734                 ExFreePool(vc->pnp_name.Buffer);
2735                 RemoveEntryList(&vc->list_entry);
2736                 ExFreePool(vc);
2737 
2738                 ObDereferenceObject(vc->fileobj);
2739 
2740                 break;
2741             }
2742 
2743             le = le->Flink;
2744         }
2745 
2746         if (pdode->children_loaded > 0 && vde->device->Characteristics & FILE_REMOVABLE_MEDIA) {
2747             vde->device->Characteristics &= ~FILE_REMOVABLE_MEDIA;
2748 
2749             le = pdode->children.Flink;
2750             while (le != &pdode->children) {
2751                 volume_child* vc = CONTAINING_RECORD(le, volume_child, list_entry);
2752 
2753                 if (vc->devobj->Characteristics & FILE_REMOVABLE_MEDIA) {
2754                     vde->device->Characteristics |= FILE_REMOVABLE_MEDIA;
2755                     break;
2756                 }
2757 
2758                 le = le->Flink;
2759             }
2760         }
2761 
2762         pdode->num_children = Vcb->superblock.num_devices;
2763 
2764         ExReleaseResourceLite(&pdode->child_lock);
2765 
2766         // free dev
2767 
2768         if (dev->trim && !dev->readonly && !Vcb->options.no_trim)
2769             trim_whole_device(dev);
2770     }
2771 
2772     while (!IsListEmpty(&dev->space)) {
2773         LIST_ENTRY* le2 = RemoveHeadList(&dev->space);
2774         space* s = CONTAINING_RECORD(le2, space, list_entry);
2775 
2776         ExFreePool(s);
2777     }
2778 
2779     ExFreePool(dev);
2780 
2781     if (Vcb->trim) {
2782         Vcb->trim = false;
2783 
2784         le = Vcb->devices.Flink;
2785         while (le != &Vcb->devices) {
2786             device* dev2 = CONTAINING_RECORD(le, device, list_entry);
2787 
2788             if (dev2->trim) {
2789                 Vcb->trim = true;
2790                 break;
2791             }
2792 
2793             le = le->Flink;
2794         }
2795     }
2796 
2797     FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
2798 
2799     return STATUS_SUCCESS;
2800 }
2801 
2802 static void trim_unalloc_space(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb, device* dev) {
2803     DEVICE_MANAGE_DATA_SET_ATTRIBUTES* dmdsa;
2804     DEVICE_DATA_SET_RANGE* ranges;
2805     ULONG datalen, i;
2806     KEY searchkey;
2807     traverse_ptr tp;
2808     NTSTATUS Status;
2809     bool b;
2810     uint64_t lastoff = 0x100000; // don't TRIM the first megabyte, in case someone has been daft enough to install GRUB there
2811     LIST_ENTRY* le;
2812 
2813     dev->num_trim_entries = 0;
2814 
2815     searchkey.obj_id = dev->devitem.dev_id;
2816     searchkey.obj_type = TYPE_DEV_EXTENT;
2817     searchkey.offset = 0;
2818 
2819     Status = find_item(Vcb, Vcb->dev_root, &tp, &searchkey, false, NULL);
2820     if (!NT_SUCCESS(Status)) {
2821         ERR("find_item returned %08lx\n", Status);
2822         return;
2823     }
2824 
2825     do {
2826         traverse_ptr next_tp;
2827 
2828         if (tp.item->key.obj_id == dev->devitem.dev_id && tp.item->key.obj_type == TYPE_DEV_EXTENT) {
2829             if (tp.item->size >= sizeof(DEV_EXTENT)) {
2830                 DEV_EXTENT* de = (DEV_EXTENT*)tp.item->data;
2831 
2832                 if (tp.item->key.offset > lastoff)
2833                     add_trim_entry_avoid_sb(Vcb, dev, lastoff, tp.item->key.offset - lastoff);
2834 
2835                 lastoff = tp.item->key.offset + de->length;
2836             } else {
2837                 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(DEV_EXTENT));
2838                 return;
2839             }
2840         }
2841 
2842         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2843 
2844         if (b) {
2845             tp = next_tp;
2846             if (tp.item->key.obj_id > searchkey.obj_id || (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type > searchkey.obj_type))
2847                 break;
2848         }
2849     } while (b);
2850 
2851     if (lastoff < dev->devitem.num_bytes)
2852         add_trim_entry_avoid_sb(Vcb, dev, lastoff, dev->devitem.num_bytes - lastoff);
2853 
2854     if (dev->num_trim_entries == 0)
2855         return;
2856 
2857     datalen = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t)) + (dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE));
2858 
2859     dmdsa = ExAllocatePoolWithTag(PagedPool, datalen, ALLOC_TAG);
2860     if (!dmdsa) {
2861         ERR("out of memory\n");
2862         goto end;
2863     }
2864 
2865     dmdsa->Size = sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES);
2866     dmdsa->Action = DeviceDsmAction_Trim;
2867     dmdsa->Flags = DEVICE_DSM_FLAG_TRIM_NOT_FS_ALLOCATED;
2868     dmdsa->ParameterBlockOffset = 0;
2869     dmdsa->ParameterBlockLength = 0;
2870     dmdsa->DataSetRangesOffset = (ULONG)sector_align(sizeof(DEVICE_MANAGE_DATA_SET_ATTRIBUTES), sizeof(uint64_t));
2871     dmdsa->DataSetRangesLength = dev->num_trim_entries * sizeof(DEVICE_DATA_SET_RANGE);
2872 
2873     ranges = (DEVICE_DATA_SET_RANGE*)((uint8_t*)dmdsa + dmdsa->DataSetRangesOffset);
2874 
2875     i = 0;
2876     le = dev->trim_list.Flink;
2877     while (le != &dev->trim_list) {
2878         space* s = CONTAINING_RECORD(le, space, list_entry);
2879 
2880         ranges[i].StartingOffset = s->address;
2881         ranges[i].LengthInBytes = s->size;
2882         i++;
2883 
2884         le = le->Flink;
2885     }
2886 
2887     Status = dev_ioctl(dev->devobj, IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES, dmdsa, datalen, NULL, 0, true, NULL);
2888     if (!NT_SUCCESS(Status))
2889         WARN("IOCTL_STORAGE_MANAGE_DATA_SET_ATTRIBUTES returned %08lx\n", Status);
2890 
2891     ExFreePool(dmdsa);
2892 
2893 end:
2894     while (!IsListEmpty(&dev->trim_list)) {
2895         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->trim_list), space, list_entry);
2896         ExFreePool(s);
2897     }
2898 
2899     dev->num_trim_entries = 0;
2900 }
2901 
try_consolidation(device_extension * Vcb,uint64_t flags,chunk ** newchunk)2902 static NTSTATUS try_consolidation(device_extension* Vcb, uint64_t flags, chunk** newchunk) {
2903     NTSTATUS Status;
2904     bool changed;
2905     LIST_ENTRY* le;
2906     chunk* rc;
2907 
2908     // FIXME - allow with metadata chunks?
2909 
2910     while (true) {
2911         rc = NULL;
2912 
2913         ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2914 
2915         ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
2916 
2917         // choose the least-used chunk we haven't looked at yet
2918         le = Vcb->chunks.Flink;
2919         while (le != &Vcb->chunks) {
2920             chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
2921 
2922             // FIXME - skip full-size chunks over e.g. 90% full?
2923             if (c->chunk_item->type & BLOCK_FLAG_DATA && !c->readonly && c->balance_num != Vcb->balance.balance_num && (!rc || c->used < rc->used))
2924                 rc = c;
2925 
2926             le = le->Flink;
2927         }
2928 
2929         ExReleaseResourceLite(&Vcb->chunk_lock);
2930 
2931         if (!rc) {
2932             ExReleaseResourceLite(&Vcb->tree_lock);
2933             break;
2934         }
2935 
2936         if (rc->list_entry_balance.Flink) {
2937             RemoveEntryList(&rc->list_entry_balance);
2938             Vcb->balance.chunks_left--;
2939         }
2940 
2941         rc->list_entry_balance.Flink = (LIST_ENTRY*)1; // so it doesn't get dropped
2942         rc->reloc = true;
2943 
2944         ExReleaseResourceLite(&Vcb->tree_lock);
2945 
2946         do {
2947             changed = false;
2948 
2949             Status = balance_data_chunk(Vcb, rc, &changed);
2950             if (!NT_SUCCESS(Status)) {
2951                 ERR("balance_data_chunk returned %08lx\n", Status);
2952                 Vcb->balance.status = Status;
2953                 rc->list_entry_balance.Flink = NULL;
2954                 rc->reloc = false;
2955                 return Status;
2956             }
2957 
2958             KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
2959 
2960             if (Vcb->readonly)
2961                 Vcb->balance.stopping = true;
2962 
2963             if (Vcb->balance.stopping)
2964                 return STATUS_SUCCESS;
2965         } while (changed);
2966 
2967         rc->list_entry_balance.Flink = NULL;
2968 
2969         rc->changed = true;
2970         rc->space_changed = true;
2971         rc->balance_num = Vcb->balance.balance_num;
2972 
2973         Status = do_write(Vcb, NULL);
2974         if (!NT_SUCCESS(Status)) {
2975             ERR("do_write returned %08lx\n", Status);
2976             return Status;
2977         }
2978 
2979         free_trees(Vcb);
2980     }
2981 
2982     ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
2983 
2984     Status = alloc_chunk(Vcb, flags, &rc, true);
2985 
2986     ExReleaseResourceLite(&Vcb->chunk_lock);
2987 
2988     if (NT_SUCCESS(Status)) {
2989         *newchunk = rc;
2990         return Status;
2991     } else {
2992         ERR("alloc_chunk returned %08lx\n", Status);
2993         return Status;
2994     }
2995 }
2996 
regenerate_space_list(device_extension * Vcb,device * dev)2997 static NTSTATUS regenerate_space_list(device_extension* Vcb, device* dev) {
2998     LIST_ENTRY* le;
2999 
3000     while (!IsListEmpty(&dev->space)) {
3001         space* s = CONTAINING_RECORD(RemoveHeadList(&dev->space), space, list_entry);
3002 
3003         ExFreePool(s);
3004     }
3005 
3006     // The Linux driver doesn't like to allocate chunks within the first megabyte of a device.
3007 
3008     space_list_add2(&dev->space, NULL, 0x100000, dev->devitem.num_bytes - 0x100000, NULL, NULL);
3009 
3010     le = Vcb->chunks.Flink;
3011     while (le != &Vcb->chunks) {
3012         uint16_t n;
3013         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3014         CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
3015 
3016         for (n = 0; n < c->chunk_item->num_stripes; n++) {
3017             uint64_t stripe_size = 0;
3018 
3019             if (cis[n].dev_id == dev->devitem.dev_id) {
3020                 if (stripe_size == 0) {
3021                     uint16_t factor;
3022 
3023                     if (c->chunk_item->type & BLOCK_FLAG_RAID0)
3024                         factor = c->chunk_item->num_stripes;
3025                     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
3026                         factor = c->chunk_item->num_stripes / c->chunk_item->sub_stripes;
3027                     else if (c->chunk_item->type & BLOCK_FLAG_RAID5)
3028                         factor = c->chunk_item->num_stripes - 1;
3029                     else if (c->chunk_item->type & BLOCK_FLAG_RAID6)
3030                         factor = c->chunk_item->num_stripes - 2;
3031                     else // SINGLE, DUP, RAID1, RAID1C3, RAID1C4
3032                         factor = 1;
3033 
3034                     stripe_size = c->chunk_item->size / factor;
3035                 }
3036 
3037                 space_list_subtract2(&dev->space, NULL, cis[n].offset, stripe_size, NULL, NULL);
3038             }
3039         }
3040 
3041         le = le->Flink;
3042     }
3043 
3044     return STATUS_SUCCESS;
3045 }
3046 
_Function_class_(KSTART_ROUTINE)3047 _Function_class_(KSTART_ROUTINE)
3048 void __stdcall balance_thread(void* context) {
3049     device_extension* Vcb = (device_extension*)context;
3050     LIST_ENTRY chunks;
3051     LIST_ENTRY* le;
3052     uint64_t num_chunks[3], okay_metadata_chunks = 0, okay_data_chunks = 0, okay_system_chunks = 0;
3053     uint64_t old_data_flags = 0, old_metadata_flags = 0, old_system_flags = 0;
3054     NTSTATUS Status;
3055 
3056     Vcb->balance.balance_num++;
3057 
3058     Vcb->balance.stopping = false;
3059     KeInitializeEvent(&Vcb->balance.finished, NotificationEvent, false);
3060 
3061     if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3062         old_data_flags = Vcb->data_flags;
3063         Vcb->data_flags = BLOCK_FLAG_DATA | (Vcb->balance.opts[BALANCE_OPTS_DATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_DATA].convert);
3064 
3065         FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3066     }
3067 
3068     if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3069         old_metadata_flags = Vcb->metadata_flags;
3070         Vcb->metadata_flags = BLOCK_FLAG_METADATA | (Vcb->balance.opts[BALANCE_OPTS_METADATA].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_METADATA].convert);
3071     }
3072 
3073     if (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED && Vcb->balance.opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3074         old_system_flags = Vcb->system_flags;
3075         Vcb->system_flags = BLOCK_FLAG_SYSTEM | (Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert == BLOCK_FLAG_SINGLE ? 0 : Vcb->balance.opts[BALANCE_OPTS_SYSTEM].convert);
3076     }
3077 
3078     if (Vcb->superblock.incompat_flags & BTRFS_INCOMPAT_FLAGS_MIXED_GROUPS) {
3079         if (Vcb->balance.opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3080             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3081         else if (Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED)
3082             RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3083     }
3084 
3085     num_chunks[0] = num_chunks[1] = num_chunks[2] = 0;
3086     Vcb->balance.total_chunks = Vcb->balance.chunks_left = 0;
3087 
3088     InitializeListHead(&chunks);
3089 
3090     // FIXME - what are we supposed to do with limit_start?
3091 
3092     if (!Vcb->readonly) {
3093         if (!Vcb->balance.removing && !Vcb->balance.shrinking) {
3094             Status = add_balance_item(Vcb);
3095             if (!NT_SUCCESS(Status)) {
3096                 ERR("add_balance_item returned %08lx\n", Status);
3097                 Vcb->balance.status = Status;
3098                 goto end;
3099             }
3100         } else {
3101             if (Vcb->need_write) {
3102                 Status = do_write(Vcb, NULL);
3103 
3104                 free_trees(Vcb);
3105 
3106                 if (!NT_SUCCESS(Status)) {
3107                     ERR("do_write returned %08lx\n", Status);
3108                     Vcb->balance.status = Status;
3109                     goto end;
3110                 }
3111             }
3112         }
3113     }
3114 
3115     KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3116 
3117     if (Vcb->balance.stopping)
3118         goto end;
3119 
3120     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3121 
3122     le = Vcb->chunks.Flink;
3123     while (le != &Vcb->chunks) {
3124         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3125         uint8_t sort;
3126 
3127         acquire_chunk_lock(c, Vcb);
3128 
3129         if (c->chunk_item->type & BLOCK_FLAG_DATA)
3130             sort = BALANCE_OPTS_DATA;
3131         else if (c->chunk_item->type & BLOCK_FLAG_METADATA)
3132             sort = BALANCE_OPTS_METADATA;
3133         else if (c->chunk_item->type & BLOCK_FLAG_SYSTEM)
3134             sort = BALANCE_OPTS_SYSTEM;
3135         else {
3136             ERR("unexpected chunk type %I64x\n", c->chunk_item->type);
3137             release_chunk_lock(c, Vcb);
3138             break;
3139         }
3140 
3141         if ((!(Vcb->balance.opts[sort].flags & BTRFS_BALANCE_OPTS_LIMIT) || num_chunks[sort] < Vcb->balance.opts[sort].limit_end) &&
3142             should_balance_chunk(Vcb, sort, c)) {
3143             InsertTailList(&chunks, &c->list_entry_balance);
3144 
3145             num_chunks[sort]++;
3146             Vcb->balance.total_chunks++;
3147             Vcb->balance.chunks_left++;
3148         } else if (sort == BALANCE_OPTS_METADATA)
3149             okay_metadata_chunks++;
3150         else if (sort == BALANCE_OPTS_DATA)
3151             okay_data_chunks++;
3152         else if (sort == BALANCE_OPTS_SYSTEM)
3153             okay_system_chunks++;
3154 
3155         if (!c->cache_loaded) {
3156             Status = load_cache_chunk(Vcb, c, NULL);
3157 
3158             if (!NT_SUCCESS(Status)) {
3159                 ERR("load_cache_chunk returned %08lx\n", Status);
3160                 Vcb->balance.status = Status;
3161                 release_chunk_lock(c, Vcb);
3162                 ExReleaseResourceLite(&Vcb->chunk_lock);
3163                 goto end;
3164             }
3165         }
3166 
3167         release_chunk_lock(c, Vcb);
3168 
3169         le = le->Flink;
3170     }
3171 
3172     ExReleaseResourceLite(&Vcb->chunk_lock);
3173 
3174     // If we're doing a full balance, try and allocate a new chunk now, before we mess things up
3175     if (okay_metadata_chunks == 0 || okay_data_chunks == 0 || okay_system_chunks == 0) {
3176         bool consolidated = false;
3177         chunk* c;
3178 
3179         if (okay_metadata_chunks == 0) {
3180             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3181 
3182             Status = alloc_chunk(Vcb, Vcb->metadata_flags, &c, true);
3183             if (NT_SUCCESS(Status))
3184                 c->balance_num = Vcb->balance.balance_num;
3185             else if (Status != STATUS_DISK_FULL || consolidated) {
3186                 ERR("alloc_chunk returned %08lx\n", Status);
3187                 ExReleaseResourceLite(&Vcb->chunk_lock);
3188                 Vcb->balance.status = Status;
3189                 goto end;
3190             }
3191 
3192             ExReleaseResourceLite(&Vcb->chunk_lock);
3193 
3194             if (Status == STATUS_DISK_FULL) {
3195                 Status = try_consolidation(Vcb, Vcb->metadata_flags, &c);
3196                 if (!NT_SUCCESS(Status)) {
3197                     ERR("try_consolidation returned %08lx\n", Status);
3198                     Vcb->balance.status = Status;
3199                     goto end;
3200                 } else
3201                     c->balance_num = Vcb->balance.balance_num;
3202 
3203                 consolidated = true;
3204 
3205                 if (Vcb->balance.stopping)
3206                     goto end;
3207             }
3208         }
3209 
3210         if (okay_data_chunks == 0) {
3211             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3212 
3213             Status = alloc_chunk(Vcb, Vcb->data_flags, &c, true);
3214             if (NT_SUCCESS(Status))
3215                 c->balance_num = Vcb->balance.balance_num;
3216             else if (Status != STATUS_DISK_FULL || consolidated) {
3217                 ERR("alloc_chunk returned %08lx\n", Status);
3218                 ExReleaseResourceLite(&Vcb->chunk_lock);
3219                 Vcb->balance.status = Status;
3220                 goto end;
3221             }
3222 
3223             ExReleaseResourceLite(&Vcb->chunk_lock);
3224 
3225             if (Status == STATUS_DISK_FULL) {
3226                 Status = try_consolidation(Vcb, Vcb->data_flags, &c);
3227                 if (!NT_SUCCESS(Status)) {
3228                     ERR("try_consolidation returned %08lx\n", Status);
3229                     Vcb->balance.status = Status;
3230                     goto end;
3231                 } else
3232                     c->balance_num = Vcb->balance.balance_num;
3233 
3234                 consolidated = true;
3235 
3236                 if (Vcb->balance.stopping)
3237                     goto end;
3238             }
3239         }
3240 
3241         if (okay_system_chunks == 0) {
3242             ExAcquireResourceExclusiveLite(&Vcb->chunk_lock, true);
3243 
3244             Status = alloc_chunk(Vcb, Vcb->system_flags, &c, true);
3245             if (NT_SUCCESS(Status))
3246                 c->balance_num = Vcb->balance.balance_num;
3247             else if (Status != STATUS_DISK_FULL || consolidated) {
3248                 ERR("alloc_chunk returned %08lx\n", Status);
3249                 ExReleaseResourceLite(&Vcb->chunk_lock);
3250                 Vcb->balance.status = Status;
3251                 goto end;
3252             }
3253 
3254             ExReleaseResourceLite(&Vcb->chunk_lock);
3255 
3256             if (Status == STATUS_DISK_FULL) {
3257                 Status = try_consolidation(Vcb, Vcb->system_flags, &c);
3258                 if (!NT_SUCCESS(Status)) {
3259                     ERR("try_consolidation returned %08lx\n", Status);
3260                     Vcb->balance.status = Status;
3261                     goto end;
3262                 } else
3263                     c->balance_num = Vcb->balance.balance_num;
3264 
3265                 consolidated = true;
3266 
3267                 if (Vcb->balance.stopping)
3268                     goto end;
3269             }
3270         }
3271     }
3272 
3273     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3274 
3275     le = chunks.Flink;
3276     while (le != &chunks) {
3277         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3278 
3279         c->reloc = true;
3280 
3281         le = le->Flink;
3282     }
3283 
3284     ExReleaseResourceLite(&Vcb->chunk_lock);
3285 
3286     // do data chunks before metadata
3287     le = chunks.Flink;
3288     while (le != &chunks) {
3289         chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3290         LIST_ENTRY* le2 = le->Flink;
3291 
3292         if (c->chunk_item->type & BLOCK_FLAG_DATA) {
3293             bool changed;
3294 
3295             do {
3296                 changed = false;
3297 
3298                 Status = balance_data_chunk(Vcb, c, &changed);
3299                 if (!NT_SUCCESS(Status)) {
3300                     ERR("balance_data_chunk returned %08lx\n", Status);
3301                     Vcb->balance.status = Status;
3302                     goto end;
3303                 }
3304 
3305                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3306 
3307                 if (Vcb->readonly)
3308                     Vcb->balance.stopping = true;
3309 
3310                 if (Vcb->balance.stopping)
3311                     break;
3312             } while (changed);
3313 
3314             c->changed = true;
3315             c->space_changed = true;
3316         }
3317 
3318         if (Vcb->balance.stopping)
3319             goto end;
3320 
3321         if (c->chunk_item->type & BLOCK_FLAG_DATA &&
3322             (!(Vcb->balance.opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) || !(c->chunk_item->type & BLOCK_FLAG_METADATA))) {
3323             RemoveEntryList(&c->list_entry_balance);
3324             c->list_entry_balance.Flink = NULL;
3325 
3326             Vcb->balance.chunks_left--;
3327         }
3328 
3329         le = le2;
3330     }
3331 
3332     // do metadata chunks
3333     while (!IsListEmpty(&chunks)) {
3334         chunk* c;
3335         bool changed;
3336 
3337         le = RemoveHeadList(&chunks);
3338         c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3339 
3340         if (c->chunk_item->type & BLOCK_FLAG_METADATA || c->chunk_item->type & BLOCK_FLAG_SYSTEM) {
3341             do {
3342                 Status = balance_metadata_chunk(Vcb, c, &changed);
3343                 if (!NT_SUCCESS(Status)) {
3344                     ERR("balance_metadata_chunk returned %08lx\n", Status);
3345                     Vcb->balance.status = Status;
3346                     goto end;
3347                 }
3348 
3349                 KeWaitForSingleObject(&Vcb->balance.event, Executive, KernelMode, false, NULL);
3350 
3351                 if (Vcb->readonly)
3352                     Vcb->balance.stopping = true;
3353 
3354                 if (Vcb->balance.stopping)
3355                     break;
3356             } while (changed);
3357 
3358             c->changed = true;
3359             c->space_changed = true;
3360         }
3361 
3362         if (Vcb->balance.stopping)
3363             break;
3364 
3365         c->list_entry_balance.Flink = NULL;
3366 
3367         Vcb->balance.chunks_left--;
3368     }
3369 
3370 end:
3371     if (!Vcb->readonly) {
3372         if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3373             le = chunks.Flink;
3374             while (le != &chunks) {
3375                 chunk* c = CONTAINING_RECORD(le, chunk, list_entry_balance);
3376                 c->reloc = false;
3377 
3378                 le = le->Flink;
3379                 c->list_entry_balance.Flink = NULL;
3380             }
3381 
3382             if (old_data_flags != 0)
3383                 Vcb->data_flags = old_data_flags;
3384 
3385             if (old_metadata_flags != 0)
3386                 Vcb->metadata_flags = old_metadata_flags;
3387 
3388             if (old_system_flags != 0)
3389                 Vcb->system_flags = old_system_flags;
3390         }
3391 
3392         if (Vcb->balance.removing) {
3393             device* dev = NULL;
3394 
3395             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3396 
3397             le = Vcb->devices.Flink;
3398             while (le != &Vcb->devices) {
3399                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3400 
3401                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3402                     dev = dev2;
3403                     break;
3404                 }
3405 
3406                 le = le->Flink;
3407             }
3408 
3409             if (dev) {
3410                 if (Vcb->balance.chunks_left == 0) {
3411                     Status = finish_removing_device(Vcb, dev);
3412 
3413                     if (!NT_SUCCESS(Status)) {
3414                         ERR("finish_removing_device returned %08lx\n", Status);
3415                         dev->reloc = false;
3416                     }
3417                 } else
3418                     dev->reloc = false;
3419             }
3420 
3421             ExReleaseResourceLite(&Vcb->tree_lock);
3422         } else if (Vcb->balance.shrinking) {
3423             device* dev = NULL;
3424 
3425             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3426 
3427             le = Vcb->devices.Flink;
3428             while (le != &Vcb->devices) {
3429                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3430 
3431                 if (dev2->devitem.dev_id == Vcb->balance.opts[0].devid) {
3432                     dev = dev2;
3433                     break;
3434                 }
3435 
3436                 le = le->Flink;
3437             }
3438 
3439             if (!dev) {
3440                 ERR("could not find device %I64x\n", Vcb->balance.opts[0].devid);
3441                 Vcb->balance.status = STATUS_INTERNAL_ERROR;
3442             }
3443 
3444             if (Vcb->balance.stopping || !NT_SUCCESS(Vcb->balance.status)) {
3445                 if (dev) {
3446                     Status = regenerate_space_list(Vcb, dev);
3447                     if (!NT_SUCCESS(Status))
3448                         WARN("regenerate_space_list returned %08lx\n", Status);
3449                 }
3450             } else {
3451                 uint64_t old_size;
3452 
3453                 old_size = dev->devitem.num_bytes;
3454                 dev->devitem.num_bytes = Vcb->balance.opts[0].drange_start;
3455 
3456                 Status = update_dev_item(Vcb, dev, NULL);
3457                 if (!NT_SUCCESS(Status)) {
3458                     ERR("update_dev_item returned %08lx\n", Status);
3459                     dev->devitem.num_bytes = old_size;
3460                     Vcb->balance.status = Status;
3461 
3462                     Status = regenerate_space_list(Vcb, dev);
3463                     if (!NT_SUCCESS(Status))
3464                         WARN("regenerate_space_list returned %08lx\n", Status);
3465                 } else {
3466                     Vcb->superblock.total_bytes -= old_size - dev->devitem.num_bytes;
3467 
3468                     Status = do_write(Vcb, NULL);
3469                     if (!NT_SUCCESS(Status))
3470                         ERR("do_write returned %08lx\n", Status);
3471 
3472                     free_trees(Vcb);
3473                 }
3474             }
3475 
3476             ExReleaseResourceLite(&Vcb->tree_lock);
3477 
3478             if (!Vcb->balance.stopping && NT_SUCCESS(Vcb->balance.status))
3479                 FsRtlNotifyVolumeEvent(Vcb->root_file, FSRTL_VOLUME_CHANGE_SIZE);
3480         } else {
3481             Status = remove_balance_item(Vcb);
3482             if (!NT_SUCCESS(Status)) {
3483                 ERR("remove_balance_item returned %08lx\n", Status);
3484                 goto end;
3485             }
3486         }
3487 
3488         if (Vcb->trim && !Vcb->options.no_trim) {
3489             ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3490 
3491             le = Vcb->devices.Flink;
3492             while (le != &Vcb->devices) {
3493                 device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3494 
3495                 if (dev2->devobj && !dev2->readonly && dev2->trim)
3496                     trim_unalloc_space(Vcb, dev2);
3497 
3498                 le = le->Flink;
3499             }
3500 
3501             ExReleaseResourceLite(&Vcb->tree_lock);
3502         }
3503     }
3504 
3505     ZwClose(Vcb->balance.thread);
3506     Vcb->balance.thread = NULL;
3507 
3508     KeSetEvent(&Vcb->balance.finished, 0, false);
3509 }
3510 
start_balance(device_extension * Vcb,void * data,ULONG length,KPROCESSOR_MODE processor_mode)3511 NTSTATUS start_balance(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3512     NTSTATUS Status;
3513     btrfs_start_balance* bsb = (btrfs_start_balance*)data;
3514     OBJECT_ATTRIBUTES oa;
3515     uint8_t i;
3516 
3517     if (length < sizeof(btrfs_start_balance) || !data)
3518         return STATUS_INVALID_PARAMETER;
3519 
3520     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3521         return STATUS_PRIVILEGE_NOT_HELD;
3522 
3523     if (Vcb->locked) {
3524         WARN("cannot start balance while locked\n");
3525         return STATUS_DEVICE_NOT_READY;
3526     }
3527 
3528     if (Vcb->scrub.thread) {
3529         WARN("cannot start balance while scrub running\n");
3530         return STATUS_DEVICE_NOT_READY;
3531     }
3532 
3533     if (Vcb->balance.thread) {
3534         WARN("balance already running\n");
3535         return STATUS_DEVICE_NOT_READY;
3536     }
3537 
3538     if (Vcb->readonly)
3539         return STATUS_MEDIA_WRITE_PROTECTED;
3540 
3541     if (!(bsb->opts[BALANCE_OPTS_DATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3542         !(bsb->opts[BALANCE_OPTS_METADATA].flags & BTRFS_BALANCE_OPTS_ENABLED) &&
3543         !(bsb->opts[BALANCE_OPTS_SYSTEM].flags & BTRFS_BALANCE_OPTS_ENABLED))
3544         return STATUS_SUCCESS;
3545 
3546     for (i = 0; i < 3; i++) {
3547         if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3548             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_PROFILES) {
3549                 bsb->opts[i].profiles &= BLOCK_FLAG_RAID0 | BLOCK_FLAG_RAID1 | BLOCK_FLAG_DUPLICATE | BLOCK_FLAG_RAID10 |
3550                                          BLOCK_FLAG_RAID5 | BLOCK_FLAG_RAID6 | BLOCK_FLAG_SINGLE | BLOCK_FLAG_RAID1C3 |
3551                                          BLOCK_FLAG_RAID1C4;
3552 
3553                 if (bsb->opts[i].profiles == 0)
3554                     return STATUS_INVALID_PARAMETER;
3555             }
3556 
3557             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DEVID) {
3558                 if (bsb->opts[i].devid == 0)
3559                     return STATUS_INVALID_PARAMETER;
3560             }
3561 
3562             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_DRANGE) {
3563                 if (bsb->opts[i].drange_start > bsb->opts[i].drange_end)
3564                     return STATUS_INVALID_PARAMETER;
3565             }
3566 
3567             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_VRANGE) {
3568                 if (bsb->opts[i].vrange_start > bsb->opts[i].vrange_end)
3569                     return STATUS_INVALID_PARAMETER;
3570             }
3571 
3572             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_LIMIT) {
3573                 bsb->opts[i].limit_start = max(1, bsb->opts[i].limit_start);
3574                 bsb->opts[i].limit_end = max(1, bsb->opts[i].limit_end);
3575 
3576                 if (bsb->opts[i].limit_start > bsb->opts[i].limit_end)
3577                     return STATUS_INVALID_PARAMETER;
3578             }
3579 
3580             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_STRIPES) {
3581                 bsb->opts[i].stripes_start = max(1, bsb->opts[i].stripes_start);
3582                 bsb->opts[i].stripes_end = max(1, bsb->opts[i].stripes_end);
3583 
3584                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3585                     return STATUS_INVALID_PARAMETER;
3586             }
3587 
3588             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) {
3589                 bsb->opts[i].usage_start = min(100, bsb->opts[i].stripes_start);
3590                 bsb->opts[i].usage_end = min(100, bsb->opts[i].stripes_end);
3591 
3592                 if (bsb->opts[i].stripes_start > bsb->opts[i].stripes_end)
3593                     return STATUS_INVALID_PARAMETER;
3594             }
3595 
3596             if (bsb->opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT) {
3597                 if (bsb->opts[i].convert != BLOCK_FLAG_RAID0 && bsb->opts[i].convert != BLOCK_FLAG_RAID1 &&
3598                     bsb->opts[i].convert != BLOCK_FLAG_DUPLICATE && bsb->opts[i].convert != BLOCK_FLAG_RAID10 &&
3599                     bsb->opts[i].convert != BLOCK_FLAG_RAID5 && bsb->opts[i].convert != BLOCK_FLAG_RAID6 &&
3600                     bsb->opts[i].convert != BLOCK_FLAG_SINGLE && bsb->opts[i].convert != BLOCK_FLAG_RAID1C3 &&
3601                     bsb->opts[i].convert != BLOCK_FLAG_RAID1C4)
3602                     return STATUS_INVALID_PARAMETER;
3603             }
3604         }
3605     }
3606 
3607     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bsb->opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3608     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bsb->opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3609     RtlCopyMemory(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bsb->opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3610 
3611     Vcb->balance.paused = false;
3612     Vcb->balance.removing = false;
3613     Vcb->balance.shrinking = false;
3614     Vcb->balance.status = STATUS_SUCCESS;
3615     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3616 
3617     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3618 
3619     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3620     if (!NT_SUCCESS(Status)) {
3621         ERR("PsCreateSystemThread returned %08lx\n", Status);
3622         return Status;
3623     }
3624 
3625     return STATUS_SUCCESS;
3626 }
3627 
3628 NTSTATUS look_for_balance_item(_Requires_lock_held_(_Curr_->tree_lock) device_extension* Vcb) {
3629     KEY searchkey;
3630     traverse_ptr tp;
3631     NTSTATUS Status;
3632     BALANCE_ITEM* bi;
3633     OBJECT_ATTRIBUTES oa;
3634     int i;
3635 
3636     searchkey.obj_id = BALANCE_ITEM_ID;
3637     searchkey.obj_type = TYPE_TEMP_ITEM;
3638     searchkey.offset = 0;
3639 
3640     Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
3641     if (!NT_SUCCESS(Status)) {
3642         ERR("find_item returned %08lx\n", Status);
3643         return Status;
3644     }
3645 
3646     if (keycmp(tp.item->key, searchkey)) {
3647         TRACE("no balance item found\n");
3648         return STATUS_NOT_FOUND;
3649     }
3650 
3651     if (tp.item->size < sizeof(BALANCE_ITEM)) {
3652         WARN("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
3653              tp.item->size, sizeof(BALANCE_ITEM));
3654         return STATUS_INTERNAL_ERROR;
3655     }
3656 
3657     bi = (BALANCE_ITEM*)tp.item->data;
3658 
3659     if (bi->flags & BALANCE_FLAGS_DATA)
3660         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_DATA], &bi->data);
3661 
3662     if (bi->flags & BALANCE_FLAGS_METADATA)
3663         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_METADATA], &bi->metadata);
3664 
3665     if (bi->flags & BALANCE_FLAGS_SYSTEM)
3666         load_balance_args(&Vcb->balance.opts[BALANCE_OPTS_SYSTEM], &bi->system);
3667 
3668     // do the heuristics that Linux driver does
3669 
3670     for (i = 0; i < 3; i++) {
3671         if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_ENABLED) {
3672             // if converting, don't redo chunks already done
3673 
3674             if (Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3675                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_SOFT;
3676 
3677             // don't balance chunks more than 90% filled - presumably these
3678             // have already been done
3679 
3680             if (!(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_USAGE) &&
3681                 !(Vcb->balance.opts[i].flags & BTRFS_BALANCE_OPTS_CONVERT)
3682             ) {
3683                 Vcb->balance.opts[i].flags |= BTRFS_BALANCE_OPTS_USAGE;
3684                 Vcb->balance.opts[i].usage_start = 0;
3685                 Vcb->balance.opts[i].usage_end = 90;
3686             }
3687         }
3688     }
3689 
3690     if (Vcb->readonly || Vcb->options.skip_balance)
3691         Vcb->balance.paused = true;
3692     else
3693         Vcb->balance.paused = false;
3694 
3695     Vcb->balance.removing = false;
3696     Vcb->balance.shrinking = false;
3697     Vcb->balance.status = STATUS_SUCCESS;
3698     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3699 
3700     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3701 
3702     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3703     if (!NT_SUCCESS(Status)) {
3704         ERR("PsCreateSystemThread returned %08lx\n", Status);
3705         return Status;
3706     }
3707 
3708     return STATUS_SUCCESS;
3709 }
3710 
query_balance(device_extension * Vcb,void * data,ULONG length)3711 NTSTATUS query_balance(device_extension* Vcb, void* data, ULONG length) {
3712     btrfs_query_balance* bqb = (btrfs_query_balance*)data;
3713 
3714     if (length < sizeof(btrfs_query_balance) || !data)
3715         return STATUS_INVALID_PARAMETER;
3716 
3717     if (!Vcb->balance.thread) {
3718         bqb->status = BTRFS_BALANCE_STOPPED;
3719 
3720         if (!NT_SUCCESS(Vcb->balance.status)) {
3721             bqb->status |= BTRFS_BALANCE_ERROR;
3722             bqb->error = Vcb->balance.status;
3723         }
3724 
3725         return STATUS_SUCCESS;
3726     }
3727 
3728     bqb->status = Vcb->balance.paused ? BTRFS_BALANCE_PAUSED : BTRFS_BALANCE_RUNNING;
3729 
3730     if (Vcb->balance.removing)
3731         bqb->status |= BTRFS_BALANCE_REMOVAL;
3732 
3733     if (Vcb->balance.shrinking)
3734         bqb->status |= BTRFS_BALANCE_SHRINKING;
3735 
3736     if (!NT_SUCCESS(Vcb->balance.status))
3737         bqb->status |= BTRFS_BALANCE_ERROR;
3738 
3739     bqb->chunks_left = Vcb->balance.chunks_left;
3740     bqb->total_chunks = Vcb->balance.total_chunks;
3741     bqb->error = Vcb->balance.status;
3742     RtlCopyMemory(&bqb->data_opts, &Vcb->balance.opts[BALANCE_OPTS_DATA], sizeof(btrfs_balance_opts));
3743     RtlCopyMemory(&bqb->metadata_opts, &Vcb->balance.opts[BALANCE_OPTS_METADATA], sizeof(btrfs_balance_opts));
3744     RtlCopyMemory(&bqb->system_opts, &Vcb->balance.opts[BALANCE_OPTS_SYSTEM], sizeof(btrfs_balance_opts));
3745 
3746     return STATUS_SUCCESS;
3747 }
3748 
pause_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3749 NTSTATUS pause_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3750     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3751         return STATUS_PRIVILEGE_NOT_HELD;
3752 
3753     if (!Vcb->balance.thread)
3754         return STATUS_DEVICE_NOT_READY;
3755 
3756     if (Vcb->balance.paused)
3757         return STATUS_DEVICE_NOT_READY;
3758 
3759     Vcb->balance.paused = true;
3760     KeClearEvent(&Vcb->balance.event);
3761 
3762     return STATUS_SUCCESS;
3763 }
3764 
resume_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3765 NTSTATUS resume_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3766     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3767         return STATUS_PRIVILEGE_NOT_HELD;
3768 
3769     if (!Vcb->balance.thread)
3770         return STATUS_DEVICE_NOT_READY;
3771 
3772     if (!Vcb->balance.paused)
3773         return STATUS_DEVICE_NOT_READY;
3774 
3775     if (Vcb->readonly)
3776         return STATUS_MEDIA_WRITE_PROTECTED;
3777 
3778     Vcb->balance.paused = false;
3779     KeSetEvent(&Vcb->balance.event, 0, false);
3780 
3781     return STATUS_SUCCESS;
3782 }
3783 
stop_balance(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3784 NTSTATUS stop_balance(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3785     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3786         return STATUS_PRIVILEGE_NOT_HELD;
3787 
3788     if (!Vcb->balance.thread)
3789         return STATUS_DEVICE_NOT_READY;
3790 
3791     Vcb->balance.paused = false;
3792     Vcb->balance.stopping = true;
3793     Vcb->balance.status = STATUS_SUCCESS;
3794     KeSetEvent(&Vcb->balance.event, 0, false);
3795 
3796     return STATUS_SUCCESS;
3797 }
3798 
remove_device(device_extension * Vcb,void * data,ULONG length,KPROCESSOR_MODE processor_mode)3799 NTSTATUS remove_device(device_extension* Vcb, void* data, ULONG length, KPROCESSOR_MODE processor_mode) {
3800     uint64_t devid;
3801     LIST_ENTRY* le;
3802     device* dev = NULL;
3803     NTSTATUS Status;
3804     int i;
3805     uint64_t num_rw_devices;
3806     OBJECT_ATTRIBUTES oa;
3807 
3808     TRACE("(%p, %p, %lx)\n", Vcb, data, length);
3809 
3810     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3811         return STATUS_PRIVILEGE_NOT_HELD;
3812 
3813     if (length < sizeof(uint64_t))
3814         return STATUS_INVALID_PARAMETER;
3815 
3816     devid = *(uint64_t*)data;
3817 
3818     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
3819 
3820     if (Vcb->readonly) {
3821         ExReleaseResourceLite(&Vcb->tree_lock);
3822         return STATUS_MEDIA_WRITE_PROTECTED;
3823     }
3824 
3825     num_rw_devices = 0;
3826 
3827     le = Vcb->devices.Flink;
3828     while (le != &Vcb->devices) {
3829         device* dev2 = CONTAINING_RECORD(le, device, list_entry);
3830 
3831         if (dev2->devitem.dev_id == devid)
3832             dev = dev2;
3833 
3834         if (!dev2->readonly)
3835             num_rw_devices++;
3836 
3837         le = le->Flink;
3838     }
3839 
3840     if (!dev) {
3841         ExReleaseResourceLite(&Vcb->tree_lock);
3842         WARN("device %I64x not found\n", devid);
3843         return STATUS_NOT_FOUND;
3844     }
3845 
3846     if (!dev->readonly) {
3847         if (num_rw_devices == 1) {
3848             ExReleaseResourceLite(&Vcb->tree_lock);
3849             WARN("not removing last non-readonly device\n");
3850             return STATUS_INVALID_PARAMETER;
3851         }
3852 
3853         if (num_rw_devices == 4 &&
3854             ((Vcb->data_flags & BLOCK_FLAG_RAID10 || Vcb->metadata_flags & BLOCK_FLAG_RAID10 || Vcb->system_flags & BLOCK_FLAG_RAID10) ||
3855              (Vcb->data_flags & BLOCK_FLAG_RAID6 || Vcb->metadata_flags & BLOCK_FLAG_RAID6 || Vcb->system_flags & BLOCK_FLAG_RAID6) ||
3856              (Vcb->data_flags & BLOCK_FLAG_RAID1C4 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C4 || Vcb->system_flags & BLOCK_FLAG_RAID1C4)
3857             )
3858         ) {
3859             ExReleaseResourceLite(&Vcb->tree_lock);
3860             ERR("would not be enough devices to satisfy RAID requirement (RAID6/10/1C4)\n");
3861             return STATUS_CANNOT_DELETE;
3862         }
3863 
3864         if (num_rw_devices == 3 &&
3865             ((Vcb->data_flags & BLOCK_FLAG_RAID5 || Vcb->metadata_flags & BLOCK_FLAG_RAID5 || Vcb->system_flags & BLOCK_FLAG_RAID5) ||
3866             (Vcb->data_flags & BLOCK_FLAG_RAID1C3 || Vcb->metadata_flags & BLOCK_FLAG_RAID1C3 || Vcb->system_flags & BLOCK_FLAG_RAID1C3))
3867             ) {
3868             ExReleaseResourceLite(&Vcb->tree_lock);
3869             ERR("would not be enough devices to satisfy RAID requirement (RAID5/1C3)\n");
3870             return STATUS_CANNOT_DELETE;
3871         }
3872 
3873         if (num_rw_devices == 2 &&
3874             ((Vcb->data_flags & BLOCK_FLAG_RAID0 || Vcb->metadata_flags & BLOCK_FLAG_RAID0 || Vcb->system_flags & BLOCK_FLAG_RAID0) ||
3875              (Vcb->data_flags & BLOCK_FLAG_RAID1 || Vcb->metadata_flags & BLOCK_FLAG_RAID1 || Vcb->system_flags & BLOCK_FLAG_RAID1))
3876         ) {
3877             ExReleaseResourceLite(&Vcb->tree_lock);
3878             ERR("would not be enough devices to satisfy RAID requirement (RAID0/1)\n");
3879             return STATUS_CANNOT_DELETE;
3880         }
3881     }
3882 
3883     ExReleaseResourceLite(&Vcb->tree_lock);
3884 
3885     if (Vcb->balance.thread) {
3886         WARN("balance already running\n");
3887         return STATUS_DEVICE_NOT_READY;
3888     }
3889 
3890     dev->reloc = true;
3891 
3892     RtlZeroMemory(Vcb->balance.opts, sizeof(btrfs_balance_opts) * 3);
3893 
3894     for (i = 0; i < 3; i++) {
3895         Vcb->balance.opts[i].flags = BTRFS_BALANCE_OPTS_ENABLED | BTRFS_BALANCE_OPTS_DEVID;
3896         Vcb->balance.opts[i].devid = devid;
3897     }
3898 
3899     Vcb->balance.paused = false;
3900     Vcb->balance.removing = true;
3901     Vcb->balance.shrinking = false;
3902     Vcb->balance.status = STATUS_SUCCESS;
3903     KeInitializeEvent(&Vcb->balance.event, NotificationEvent, !Vcb->balance.paused);
3904 
3905     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3906 
3907     Status = PsCreateSystemThread(&Vcb->balance.thread, 0, &oa, NULL, NULL, balance_thread, Vcb);
3908     if (!NT_SUCCESS(Status)) {
3909         ERR("PsCreateSystemThread returned %08lx\n", Status);
3910         dev->reloc = false;
3911         return Status;
3912     }
3913 
3914     return STATUS_SUCCESS;
3915 }
3916