xref: /reactos/drivers/filesystems/btrfs/scrub.c (revision 36873c49)
1 /* Copyright (c) Mark Harmstone 2017
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 
20 #define SCRUB_UNIT 0x100000 // 1 MB
21 
22 struct _scrub_context;
23 
/* Per-stripe state for one scrub read. Instances live in the array pointed
 * to by scrub_context.stripes and are handed to scrub_read_completion as its
 * context pointer. */
typedef struct {
    struct _scrub_context* context; // owning scrub_context; read back by scrub_read_completion
    PIRP Irp;
    uint64_t start;
    uint32_t length; // size of buf; divided by sector_size / node_size to count checksummed units
    IO_STATUS_BLOCK iosb; // copy of Irp->IoStatus, stored by scrub_read_completion
    uint8_t* buf; // data read from this stripe, verified by the scrub routines
    bool csum_error; // set when this stripe's data fails verification
    uint32_t* bad_csums; // checksums recomputed from buf once csum_error has been set
} scrub_context_stripe;
34 
/* Shared state for a batch of stripe reads issued together. */
typedef struct _scrub_context {
    KEVENT Event; // set by scrub_read_completion once stripes_left reaches zero
    scrub_context_stripe* stripes; // per-stripe state, indexed by stripe number
    LONG stripes_left; // outstanding reads; decremented with InterlockedDecrement on completion
} scrub_context;
40 
/* One component of a file path being reconstructed by log_file_checksum_error. */
typedef struct {
    ANSI_STRING name; // component name; Buffer points into the tree item it was read from
    bool orig_subvol; // true if the component was found in the subvolume the search started in
    LIST_ENTRY list_entry; // links the component into the local "parts" list
} path_part;
46 
/*
 * Records a data checksum error against the file that references the data.
 *
 * Finds the root whose id is `subvol` in Vcb->roots, then walks upwards from
 * `inode` collecting one path_part per name: INODE_REF / INODE_EXTREF items
 * inside a subvolume, and ROOT_BACKREF items (looked up in Vcb->root_root)
 * when the top of a subvolume is reached. The collected names are joined with
 * backslashes, logged, converted to UTF-16 and stored in a newly allocated
 * scrub_error which is appended to Vcb->scrub.errors under stats_lock.
 *
 * If a subvolume has no ROOT_BACKREF the path is cut back to the components
 * found in the original subvolume and the subvolume id is stored in the error
 * (otherwise data.subvol is 0).
 *
 * Vcb    - volume whose scrub error list is updated
 * addr   - address stored in the error record
 * devid  - device id stored in the error record
 * subvol - id of the root containing the inode
 * inode  - inode whose path is resolved
 * offset - offset within the file, stored in the error record
 *
 * Every failure is logged with ERR and the function returns without adding an
 * error record.
 */
static void log_file_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t subvol, uint64_t inode, uint64_t offset) {
    LIST_ENTRY *le, parts;
    root* r = NULL;
    KEY searchkey;
    traverse_ptr tp;
    uint64_t dir;
    bool orig_subvol = true, not_in_tree = false;
    ANSI_STRING fn;
    scrub_error* err;
    NTSTATUS Status;
    ULONG utf16len;

    // locate the root structure for the subvolume
    le = Vcb->roots.Flink;
    while (le != &Vcb->roots) {
        root* r2 = CONTAINING_RECORD(le, root, list_entry);

        if (r2->id == subvol) {
            r = r2;
            break;
        }

        le = le->Flink;
    }

    if (!r) {
        ERR("could not find subvol %I64x\n", subvol);
        return;
    }

    InitializeListHead(&parts);

    dir = inode;

    // walk from the inode towards the top of the tree; parts ends up ordered leaf-first
    while (true) {
        if (dir == r->root_item.objid) {
            // reached the top directory of this subvolume
            if (r == Vcb->root_fileref->fcb->subvol)
                break;

            // find where this subvolume is linked into its parent
            searchkey.obj_id = r->id;
            searchkey.obj_type = TYPE_ROOT_BACKREF;
            searchkey.offset = 0xffffffffffffffff;

            Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
            if (!NT_SUCCESS(Status)) {
                ERR("find_item returned %08x\n", Status);
                goto end;
            }

            if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
                ROOT_REF* rr = (ROOT_REF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(ROOT_REF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(ROOT_REF));
                    goto end;
                }

                if (tp.item->size < offsetof(ROOT_REF, name[0]) + rr->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
                        tp.item->size, offsetof(ROOT_REF, name[0]) + rr->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                // the name is not copied: it points into the item's data
                pp->name.Buffer = rr->name;
                pp->name.Length = pp->name.MaximumLength = rr->n;
                pp->orig_subvol = false;

                InsertTailList(&parts, &pp->list_entry);

                // continue the walk in the parent subvolume (its id is the key offset)
                r = NULL;

                le = Vcb->roots.Flink;
                while (le != &Vcb->roots) {
                    root* r2 = CONTAINING_RECORD(le, root, list_entry);

                    if (r2->id == tp.item->key.offset) {
                        r = r2;
                        break;
                    }

                    le = le->Flink;
                }

                if (!r) {
                    ERR("could not find subvol %I64x\n", tp.item->key.offset);
                    goto end;
                }

                dir = rr->dir;
                orig_subvol = false;
            } else {
                // no backref: the subvolume is not linked into the directory tree
                not_in_tree = true;
                break;
            }
        } else {
            // searching for the highest INODE_EXTREF key finds either an
            // INODE_EXTREF or, failing that, an earlier item such as INODE_REF
            searchkey.obj_id = dir;
            searchkey.obj_type = TYPE_INODE_EXTREF;
            searchkey.offset = 0xffffffffffffffff;

            Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
            if (!NT_SUCCESS(Status)) {
                ERR("find_item returned %08x\n", Status);
                goto end;
            }

            if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_REF) {
                INODE_REF* ir = (INODE_REF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(INODE_REF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_REF));
                    goto end;
                }

                if (tp.item->size < offsetof(INODE_REF, name[0]) + ir->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
                        tp.item->size, offsetof(INODE_REF, name[0]) + ir->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                pp->name.Buffer = ir->name;
                pp->name.Length = pp->name.MaximumLength = ir->n;
                pp->orig_subvol = orig_subvol;

                InsertTailList(&parts, &pp->list_entry);

                // an inode that is its own parent ends the walk
                if (dir == tp.item->key.offset)
                    break;

                // for INODE_REF the parent directory is the key offset
                dir = tp.item->key.offset;
            } else if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_EXTREF) {
                INODE_EXTREF* ier = (INODE_EXTREF*)tp.item->data;
                path_part* pp;

                if (tp.item->size < sizeof(INODE_EXTREF)) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
                                                                               tp.item->size, sizeof(INODE_EXTREF));
                    goto end;
                }

                if (tp.item->size < offsetof(INODE_EXTREF, name[0]) + ier->n) {
                    ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
                        tp.item->size, offsetof(INODE_EXTREF, name[0]) + ier->n);
                    goto end;
                }

                pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
                if (!pp) {
                    ERR("out of memory\n");
                    goto end;
                }

                pp->name.Buffer = ier->name;
                pp->name.Length = pp->name.MaximumLength = ier->n;
                pp->orig_subvol = orig_subvol;

                InsertTailList(&parts, &pp->list_entry);

                if (dir == ier->dir)
                    break;

                // for INODE_EXTREF the parent directory is stored in the item itself
                dir = ier->dir;
            } else {
                ERR("could not find INODE_REF for inode %I64x in subvol %I64x\n", dir, r->id);
                goto end;
            }
        }
    }

    fn.MaximumLength = 0;

    if (not_in_tree) {
        // drop, from the root end of the list, every component that was
        // found outside the original subvolume
        le = parts.Blink;
        while (le != &parts) {
            path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
            LIST_ENTRY* le2 = le->Blink;

            if (pp->orig_subvol)
                break;

            RemoveTailList(&parts);
            ExFreePool(pp);

            le = le2;
        }
    }

    // total length: each component plus one byte for its leading backslash
    le = parts.Flink;
    while (le != &parts) {
        path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);

        fn.MaximumLength += pp->name.Length + 1;

        le = le->Flink;
    }

    fn.Buffer = ExAllocatePoolWithTag(PagedPool, fn.MaximumLength, ALLOC_TAG);
    if (!fn.Buffer) {
        ERR("out of memory\n");
        goto end;
    }

    fn.Length = 0;

    // parts is leaf-first, so build the path by walking it backwards
    le = parts.Blink;
    while (le != &parts) {
        path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);

        fn.Buffer[fn.Length] = '\\';
        fn.Length++;

        RtlCopyMemory(&fn.Buffer[fn.Length], pp->name.Buffer, pp->name.Length);
        fn.Length += pp->name.Length;

        le = le->Blink;
    }

    if (not_in_tree)
        ERR("subvol %I64x, %.*s, offset %I64x\n", subvol, fn.Length, fn.Buffer, offset);
    else
        ERR("%.*s, offset %I64x\n", fn.Length, fn.Buffer, offset);

    // first call only measures the UTF-16 length needed
    Status = utf8_to_utf16(NULL, 0, &utf16len, fn.Buffer, fn.Length);
    if (!NT_SUCCESS(Status)) {
        ERR("utf8_to_utf16 1 returned %08x\n", Status);
        ExFreePool(fn.Buffer);
        goto end;
    }

    // the filename is stored inline at the end of the scrub_error
    err = ExAllocatePoolWithTag(PagedPool, offsetof(scrub_error, data.filename[0]) + utf16len, ALLOC_TAG);
    if (!err) {
        ERR("out of memory\n");
        ExFreePool(fn.Buffer);
        goto end;
    }

    err->address = addr;
    err->device = devid;
    err->recovered = false;
    err->is_metadata = false;
    err->parity = false;

    err->data.subvol = not_in_tree ? subvol : 0;
    err->data.offset = offset;
    err->data.filename_length = (uint16_t)utf16len;

    Status = utf8_to_utf16(err->data.filename, utf16len, &utf16len, fn.Buffer, fn.Length);
    if (!NT_SUCCESS(Status)) {
        ERR("utf8_to_utf16 2 returned %08x\n", Status);
        ExFreePool(fn.Buffer);
        ExFreePool(err);
        goto end;
    }

    ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);

    Vcb->scrub.num_errors++;
    InsertTailList(&Vcb->scrub.errors, &err->list_entry);

    ExReleaseResourceLite(&Vcb->scrub.stats_lock);

    ExFreePool(fn.Buffer);

end:
    // free whatever path components are still on the list
    while (!IsListEmpty(&parts)) {
        path_part* pp = CONTAINING_RECORD(RemoveHeadList(&parts), path_part, list_entry);

        ExFreePool(pp);
    }
}
329 
330 static void log_file_checksum_error_shared(device_extension* Vcb, uint64_t treeaddr, uint64_t addr, uint64_t devid, uint64_t extent) {
331     tree_header* tree;
332     NTSTATUS Status;
333     leaf_node* ln;
334     ULONG i;
335 
336     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
337     if (!tree) {
338         ERR("out of memory\n");
339         return;
340     }
341 
342     Status = read_data(Vcb, treeaddr, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
343     if (!NT_SUCCESS(Status)) {
344         ERR("read_data returned %08x\n", Status);
345         goto end;
346     }
347 
348     if (tree->level != 0) {
349         ERR("tree level was %x, expected 0\n", tree->level);
350         goto end;
351     }
352 
353     ln = (leaf_node*)&tree[1];
354 
355     for (i = 0; i < tree->num_items; i++) {
356         if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
357             EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)tree + sizeof(tree_header) + ln[i].offset);
358             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
359 
360             if (ed->type == EXTENT_TYPE_REGULAR && ed2->size != 0 && ed2->address == addr)
361                 log_file_checksum_error(Vcb, addr, devid, tree->tree_id, ln[i].key.obj_id, ln[i].key.offset + addr - extent);
362         }
363     }
364 
365 end:
366     ExFreePool(tree);
367 }
368 
369 static void log_tree_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t root, uint8_t level, KEY* firstitem) {
370     scrub_error* err;
371 
372     err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
373     if (!err) {
374         ERR("out of memory\n");
375         return;
376     }
377 
378     err->address = addr;
379     err->device = devid;
380     err->recovered = false;
381     err->is_metadata = true;
382     err->parity = false;
383 
384     err->metadata.root = root;
385     err->metadata.level = level;
386 
387     if (firstitem) {
388         ERR("root %I64x, level %u, first item (%I64x,%x,%I64x)\n", root, level, firstitem->obj_id,
389                                                                 firstitem->obj_type, firstitem->offset);
390 
391         err->metadata.firstitem = *firstitem;
392     } else {
393         ERR("root %I64x, level %u\n", root, level);
394 
395         RtlZeroMemory(&err->metadata.firstitem, sizeof(KEY));
396     }
397 
398     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
399 
400     Vcb->scrub.num_errors++;
401     InsertTailList(&Vcb->scrub.errors, &err->list_entry);
402 
403     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
404 }
405 
406 static void log_tree_checksum_error_shared(device_extension* Vcb, uint64_t offset, uint64_t address, uint64_t devid) {
407     tree_header* tree;
408     NTSTATUS Status;
409     internal_node* in;
410     ULONG i;
411 
412     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
413     if (!tree) {
414         ERR("out of memory\n");
415         return;
416     }
417 
418     Status = read_data(Vcb, offset, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
419     if (!NT_SUCCESS(Status)) {
420         ERR("read_data returned %08x\n", Status);
421         goto end;
422     }
423 
424     if (tree->level == 0) {
425         ERR("tree level was 0\n");
426         goto end;
427     }
428 
429     in = (internal_node*)&tree[1];
430 
431     for (i = 0; i < tree->num_items; i++) {
432         if (in[i].address == address) {
433             log_tree_checksum_error(Vcb, address, devid, tree->tree_id, tree->level - 1, &in[i].key);
434             break;
435         }
436     }
437 
438 end:
439     ExFreePool(tree);
440 }
441 
442 static void log_unrecoverable_error(device_extension* Vcb, uint64_t address, uint64_t devid) {
443     KEY searchkey;
444     traverse_ptr tp;
445     NTSTATUS Status;
446     EXTENT_ITEM* ei;
447     EXTENT_ITEM2* ei2 = NULL;
448     uint8_t* ptr;
449     ULONG len;
450     uint64_t rc;
451 
452     // FIXME - still log even if rest of this function fails
453 
454     searchkey.obj_id = address;
455     searchkey.obj_type = TYPE_METADATA_ITEM;
456     searchkey.offset = 0xffffffffffffffff;
457 
458     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
459     if (!NT_SUCCESS(Status)) {
460         ERR("find_item returned %08x\n", Status);
461         return;
462     }
463 
464     if ((tp.item->key.obj_type != TYPE_EXTENT_ITEM && tp.item->key.obj_type != TYPE_METADATA_ITEM) ||
465         tp.item->key.obj_id >= address + Vcb->superblock.sector_size ||
466         (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.obj_id + tp.item->key.offset <= address) ||
467         (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->key.obj_id + Vcb->superblock.node_size <= address)
468     )
469         return;
470 
471     if (tp.item->size < sizeof(EXTENT_ITEM)) {
472         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
473         return;
474     }
475 
476     ei = (EXTENT_ITEM*)tp.item->data;
477     ptr = (uint8_t*)&ei[1];
478     len = tp.item->size - sizeof(EXTENT_ITEM);
479 
480     if (tp.item->key.obj_id == TYPE_EXTENT_ITEM && ei->flags & EXTENT_ITEM_TREE_BLOCK) {
481         if (tp.item->size < sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2)) {
482             ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
483                                                                        tp.item->size, sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2));
484             return;
485         }
486 
487         ei2 = (EXTENT_ITEM2*)ptr;
488 
489         ptr += sizeof(EXTENT_ITEM2);
490         len -= sizeof(EXTENT_ITEM2);
491     }
492 
493     rc = 0;
494 
495     while (len > 0) {
496         uint8_t type = *ptr;
497 
498         ptr++;
499         len--;
500 
501         if (type == TYPE_TREE_BLOCK_REF) {
502             TREE_BLOCK_REF* tbr;
503 
504             if (len < sizeof(TREE_BLOCK_REF)) {
505                 ERR("TREE_BLOCK_REF takes up %u bytes, but only %u remaining\n", sizeof(TREE_BLOCK_REF), len);
506                 break;
507             }
508 
509             tbr = (TREE_BLOCK_REF*)ptr;
510 
511             log_tree_checksum_error(Vcb, address, devid, tbr->offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
512 
513             rc++;
514 
515             ptr += sizeof(TREE_BLOCK_REF);
516             len -= sizeof(TREE_BLOCK_REF);
517         } else if (type == TYPE_EXTENT_DATA_REF) {
518             EXTENT_DATA_REF* edr;
519 
520             if (len < sizeof(EXTENT_DATA_REF)) {
521                 ERR("EXTENT_DATA_REF takes up %u bytes, but only %u remaining\n", sizeof(EXTENT_DATA_REF), len);
522                 break;
523             }
524 
525             edr = (EXTENT_DATA_REF*)ptr;
526 
527             log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
528 
529             rc += edr->count;
530 
531             ptr += sizeof(EXTENT_DATA_REF);
532             len -= sizeof(EXTENT_DATA_REF);
533         } else if (type == TYPE_SHARED_BLOCK_REF) {
534             SHARED_BLOCK_REF* sbr;
535 
536             if (len < sizeof(SHARED_BLOCK_REF)) {
537                 ERR("SHARED_BLOCK_REF takes up %u bytes, but only %u remaining\n", sizeof(SHARED_BLOCK_REF), len);
538                 break;
539             }
540 
541             sbr = (SHARED_BLOCK_REF*)ptr;
542 
543             log_tree_checksum_error_shared(Vcb, sbr->offset, address, devid);
544 
545             rc++;
546 
547             ptr += sizeof(SHARED_BLOCK_REF);
548             len -= sizeof(SHARED_BLOCK_REF);
549         } else if (type == TYPE_SHARED_DATA_REF) {
550             SHARED_DATA_REF* sdr;
551 
552             if (len < sizeof(SHARED_DATA_REF)) {
553                 ERR("SHARED_DATA_REF takes up %u bytes, but only %u remaining\n", sizeof(SHARED_DATA_REF), len);
554                 break;
555             }
556 
557             sdr = (SHARED_DATA_REF*)ptr;
558 
559             log_file_checksum_error_shared(Vcb, sdr->offset, address, devid, tp.item->key.obj_id);
560 
561             rc += sdr->count;
562 
563             ptr += sizeof(SHARED_DATA_REF);
564             len -= sizeof(SHARED_DATA_REF);
565         } else {
566             ERR("unknown extent type %x\n", type);
567             break;
568         }
569     }
570 
571     if (rc < ei->refcount) {
572         do {
573             traverse_ptr next_tp;
574 
575             if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
576                 tp = next_tp;
577             else
578                 break;
579 
580             if (tp.item->key.obj_id == address) {
581                 if (tp.item->key.obj_type == TYPE_TREE_BLOCK_REF)
582                     log_tree_checksum_error(Vcb, address, devid, tp.item->key.offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
583                 else if (tp.item->key.obj_type == TYPE_EXTENT_DATA_REF) {
584                     EXTENT_DATA_REF* edr;
585 
586                     if (tp.item->size < sizeof(EXTENT_DATA_REF)) {
587                         ERR("(%I64x,%x,%I64x) was %u bytes, expected %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
588                                                                           tp.item->size, sizeof(EXTENT_DATA_REF));
589                         break;
590                     }
591 
592                     edr = (EXTENT_DATA_REF*)tp.item->data;
593 
594                     log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
595                 } else if (tp.item->key.obj_type == TYPE_SHARED_BLOCK_REF)
596                     log_tree_checksum_error_shared(Vcb, tp.item->key.offset, address, devid);
597                 else if (tp.item->key.obj_type == TYPE_SHARED_DATA_REF)
598                     log_file_checksum_error_shared(Vcb, tp.item->key.offset, address, devid, tp.item->key.obj_id);
599             } else
600                 break;
601         } while (true);
602     }
603 }
604 
605 static void log_error(device_extension* Vcb, uint64_t addr, uint64_t devid, bool metadata, bool recoverable, bool parity) {
606     if (recoverable) {
607         scrub_error* err;
608 
609         if (parity) {
610             ERR("recovering from parity error at %I64x on device %I64x\n", addr, devid);
611         } else {
612             if (metadata)
613                 ERR("recovering from metadata checksum error at %I64x on device %I64x\n", addr, devid);
614             else
615                 ERR("recovering from data checksum error at %I64x on device %I64x\n", addr, devid);
616         }
617 
618         err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
619         if (!err) {
620             ERR("out of memory\n");
621             return;
622         }
623 
624         err->address = addr;
625         err->device = devid;
626         err->recovered = true;
627         err->is_metadata = metadata;
628         err->parity = parity;
629 
630         if (metadata)
631             RtlZeroMemory(&err->metadata, sizeof(err->metadata));
632         else
633             RtlZeroMemory(&err->data, sizeof(err->data));
634 
635         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
636 
637         Vcb->scrub.num_errors++;
638         InsertTailList(&Vcb->scrub.errors, &err->list_entry);
639 
640         ExReleaseResourceLite(&Vcb->scrub.stats_lock);
641     } else {
642         if (metadata)
643             ERR("unrecoverable metadata checksum error at %I64x\n", addr);
644         else
645             ERR("unrecoverable data checksum error at %I64x\n", addr);
646 
647         log_unrecoverable_error(Vcb, addr, devid);
648     }
649 }
650 
651 _Function_class_(IO_COMPLETION_ROUTINE)
652 static NTSTATUS __stdcall scrub_read_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
653     scrub_context_stripe* stripe = conptr;
654     scrub_context* context = (scrub_context*)stripe->context;
655     ULONG left = InterlockedDecrement(&context->stripes_left);
656 
657     UNUSED(DeviceObject);
658 
659     stripe->iosb = Irp->IoStatus;
660 
661     if (left == 0)
662         KeSetEvent(&context->Event, 0, false);
663 
664     return STATUS_MORE_PROCESSING_REQUIRED;
665 }
666 
667 static NTSTATUS scrub_extent_dup(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t* csum, scrub_context* context) {
668     NTSTATUS Status;
669     bool csum_error = false;
670     ULONG i;
671     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
672     uint16_t present_devices = 0;
673 
674     if (csum) {
675         ULONG good_stripe = 0xffffffff;
676 
677         for (i = 0; i < c->chunk_item->num_stripes; i++) {
678             if (c->devices[i]->devobj) {
679                 present_devices++;
680 
681                 // if first stripe is okay, we only need to check that the others are identical to it
682                 if (good_stripe != 0xffffffff) {
683                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
684                                         context->stripes[good_stripe].length) != context->stripes[i].length) {
685                         context->stripes[i].csum_error = true;
686                         csum_error = true;
687                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
688                     }
689                 } else {
690                     Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, csum);
691                     if (Status == STATUS_CRC_ERROR) {
692                         context->stripes[i].csum_error = true;
693                         csum_error = true;
694                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
695                     } else if (!NT_SUCCESS(Status)) {
696                         ERR("check_csum returned %08x\n", Status);
697                         return Status;
698                     } else
699                         good_stripe = i;
700                 }
701             }
702         }
703     } else {
704         ULONG good_stripe = 0xffffffff;
705 
706         for (i = 0; i < c->chunk_item->num_stripes; i++) {
707             ULONG j;
708 
709             if (c->devices[i]->devobj) {
710                 // if first stripe is okay, we only need to check that the others are identical to it
711                 if (good_stripe != 0xffffffff) {
712                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
713                                          context->stripes[good_stripe].length) != context->stripes[i].length) {
714                         context->stripes[i].csum_error = true;
715                         csum_error = true;
716                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
717                     }
718                 } else {
719                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
720                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
721                         uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
722 
723                         if (crc32 != *((uint32_t*)th->csum) || th->address != offset + UInt32x32To64(j, Vcb->superblock.node_size)) {
724                             context->stripes[i].csum_error = true;
725                             csum_error = true;
726                             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
727                         }
728                     }
729 
730                     if (!context->stripes[i].csum_error)
731                         good_stripe = i;
732                 }
733             }
734         }
735     }
736 
737     if (!csum_error)
738         return STATUS_SUCCESS;
739 
740     // handle checksum error
741 
742     for (i = 0; i < c->chunk_item->num_stripes; i++) {
743         if (context->stripes[i].csum_error) {
744             if (csum) {
745                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * sizeof(uint32_t) / Vcb->superblock.sector_size, ALLOC_TAG);
746                 if (!context->stripes[i].bad_csums) {
747                     ERR("out of memory\n");
748                     return STATUS_INSUFFICIENT_RESOURCES;
749                 }
750 
751                 Status = calc_csum(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, context->stripes[i].bad_csums);
752                 if (!NT_SUCCESS(Status)) {
753                     ERR("calc_csum returned %08x\n", Status);
754                     return Status;
755                 }
756             } else {
757                 ULONG j;
758 
759                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * sizeof(uint32_t) / Vcb->superblock.node_size, ALLOC_TAG);
760                 if (!context->stripes[i].bad_csums) {
761                     ERR("out of memory\n");
762                     return STATUS_INSUFFICIENT_RESOURCES;
763                 }
764 
765                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
766                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
767                     uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
768 
769                     context->stripes[i].bad_csums[j] = crc32;
770                 }
771             }
772         }
773     }
774 
775     if (present_devices > 1) {
776         ULONG good_stripe = 0xffffffff;
777 
778         for (i = 0; i < c->chunk_item->num_stripes; i++) {
779             if (c->devices[i]->devobj && !context->stripes[i].csum_error) {
780                 good_stripe = i;
781                 break;
782             }
783         }
784 
785         if (good_stripe != 0xffffffff) {
786             // log
787 
788             for (i = 0; i < c->chunk_item->num_stripes; i++) {
789                 if (context->stripes[i].csum_error) {
790                     ULONG j;
791 
792                     if (csum) {
793                         for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
794                             if (context->stripes[i].bad_csums[j] != csum[j]) {
795                                 uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
796 
797                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
798                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
799                             }
800                         }
801                     } else {
802                         for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
803                             tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
804                             uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
805 
806                             if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr) {
807                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
808                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
809                             }
810                         }
811                     }
812                 }
813             }
814 
815             // write good data over bad
816 
817             for (i = 0; i < c->chunk_item->num_stripes; i++) {
818                 if (context->stripes[i].csum_error && !c->devices[i]->readonly) {
819                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
820                                              context->stripes[good_stripe].buf, context->stripes[i].length);
821 
822                     if (!NT_SUCCESS(Status)) {
823                         ERR("write_data_phys returned %08x\n", Status);
824                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
825                         return Status;
826                     }
827                 }
828             }
829 
830             return STATUS_SUCCESS;
831         }
832 
833         // if csum errors on all stripes, check sector by sector
834 
835         for (i = 0; i < c->chunk_item->num_stripes; i++) {
836             ULONG j;
837 
838             if (c->devices[i]->devobj) {
839                 if (csum) {
840                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
841                         if (context->stripes[i].bad_csums[j] != csum[j]) {
842                             ULONG k;
843                             uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
844                             bool recovered = false;
845 
846                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
847                                 if (i != k && c->devices[k]->devobj && context->stripes[k].bad_csums[j] == csum[j]) {
848                                     log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
849                                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
850 
851                                     RtlCopyMemory(context->stripes[i].buf + (j * Vcb->superblock.sector_size),
852                                                   context->stripes[k].buf + (j * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
853 
854                                     recovered = true;
855                                     break;
856                                 }
857                             }
858 
859                             if (!recovered) {
860                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
861                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
862                             }
863                         }
864                     }
865                 } else {
866                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
867                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
868                         uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
869 
870                         if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr) {
871                             ULONG k;
872                             bool recovered = false;
873 
874                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
875                                 if (i != k && c->devices[k]->devobj) {
876                                     tree_header* th2 = (tree_header*)&context->stripes[k].buf[j * Vcb->superblock.node_size];
877 
878                                     if (context->stripes[k].bad_csums[j] == *((uint32_t*)th2->csum) && th2->address == addr) {
879                                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
880                                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
881 
882                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
883 
884                                         recovered = true;
885                                         break;
886                                     }
887                                 }
888                             }
889 
890                             if (!recovered) {
891                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
892                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
893                             }
894                         }
895                     }
896                 }
897             }
898         }
899 
900         // write good data over bad
901 
902         for (i = 0; i < c->chunk_item->num_stripes; i++) {
903             if (c->devices[i]->devobj && !c->devices[i]->readonly) {
904                 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
905                                          context->stripes[i].buf, context->stripes[i].length);
906                 if (!NT_SUCCESS(Status)) {
907                     ERR("write_data_phys returned %08x\n", Status);
908                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
909                     return Status;
910                 }
911             }
912         }
913 
914         return STATUS_SUCCESS;
915     }
916 
917     for (i = 0; i < c->chunk_item->num_stripes; i++) {
918         if (c->devices[i]->devobj) {
919             ULONG j;
920 
921             if (csum) {
922                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
923                     if (context->stripes[i].bad_csums[j] != csum[j]) {
924                         uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
925 
926                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
927                     }
928                 }
929             } else {
930                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
931                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
932                     uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
933 
934                     if (context->stripes[i].bad_csums[j] != *((uint32_t*)th->csum) || th->address != addr)
935                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
936                 }
937             }
938         }
939     }
940 
941     return STATUS_SUCCESS;
942 }
943 
944 static NTSTATUS scrub_extent_raid0(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, uint32_t* csum, scrub_context* context) {
945     ULONG j;
946     uint16_t stripe;
947     uint32_t pos, *stripeoff;
948 
949     pos = 0;
950     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes, ALLOC_TAG);
951     if (!stripeoff) {
952         ERR("out of memory\n");
953         return STATUS_INSUFFICIENT_RESOURCES;
954     }
955 
956     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes);
957 
958     stripe = startoffstripe;
959     while (pos < length) {
960         uint32_t readlen;
961 
962         if (pos == 0)
963             readlen = (uint32_t)min(context->stripes[stripe].length, c->chunk_item->stripe_length - (context->stripes[stripe].start % c->chunk_item->stripe_length));
964         else
965             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
966 
967         if (csum) {
968             for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
969                 uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + stripeoff[stripe], Vcb->superblock.sector_size);
970 
971                 if (crc32 != csum[pos / Vcb->superblock.sector_size]) {
972                     uint64_t addr = offset + pos;
973 
974                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
975                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
976                 }
977 
978                 pos += Vcb->superblock.sector_size;
979                 stripeoff[stripe] += Vcb->superblock.sector_size;
980             }
981         } else {
982             for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
983                 tree_header* th = (tree_header*)(context->stripes[stripe].buf + stripeoff[stripe]);
984                 uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
985                 uint64_t addr = offset + pos;
986 
987                 if (crc32 != *((uint32_t*)th->csum) || th->address != addr) {
988                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
989                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
990                 }
991 
992                 pos += Vcb->superblock.node_size;
993                 stripeoff[stripe] += Vcb->superblock.node_size;
994             }
995         }
996 
997         stripe = (stripe + 1) % c->chunk_item->num_stripes;
998     }
999 
1000     ExFreePool(stripeoff);
1001 
1002     return STATUS_SUCCESS;
1003 }
1004 
1005 static NTSTATUS scrub_extent_raid10(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, uint32_t* csum, scrub_context* context) {
1006     ULONG j;
1007     uint16_t stripe, sub_stripes = max(c->chunk_item->sub_stripes, 1);
1008     uint32_t pos, *stripeoff;
1009     bool csum_error = false;
1010     NTSTATUS Status;
1011 
1012     pos = 0;
1013     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes, ALLOC_TAG);
1014     if (!stripeoff) {
1015         ERR("out of memory\n");
1016         return STATUS_INSUFFICIENT_RESOURCES;
1017     }
1018 
1019     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes);
1020 
1021     stripe = startoffstripe;
1022     while (pos < length) {
1023         uint32_t readlen;
1024 
1025         if (pos == 0)
1026             readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1027                                   c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1028         else
1029             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1030 
1031         if (csum) {
1032             ULONG good_stripe = 0xffffffff;
1033             uint16_t k;
1034 
1035             for (k = 0; k < sub_stripes; k++) {
1036                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1037                     // if first stripe is okay, we only need to check that the others are identical to it
1038                     if (good_stripe != 0xffffffff) {
1039                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1040                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1041                                             readlen) != readlen) {
1042                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1043                             csum_error = true;
1044                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1045                         }
1046                     } else {
1047                         for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
1048                             uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j, Vcb->superblock.sector_size);
1049 
1050                             if (crc32 != csum[(pos + j) / Vcb->superblock.sector_size]) {
1051                                 csum_error = true;
1052                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1053                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1054                                 break;
1055                             }
1056                         }
1057 
1058                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1059                             good_stripe = k;
1060                     }
1061                 }
1062             }
1063 
1064             pos += readlen;
1065             stripeoff[stripe] += readlen;
1066         } else {
1067             ULONG good_stripe = 0xffffffff;
1068             uint16_t k;
1069 
1070             for (k = 0; k < sub_stripes; k++) {
1071                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1072                     // if first stripe is okay, we only need to check that the others are identical to it
1073                     if (good_stripe != 0xffffffff) {
1074                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1075                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1076                                             readlen) != readlen) {
1077                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1078                             csum_error = true;
1079                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1080                         }
1081                     } else {
1082                         for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
1083                             tree_header* th = (tree_header*)(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j);
1084                             uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
1085                             uint64_t addr = offset + pos + j;
1086 
1087                             if (crc32 != *((uint32_t*)th->csum) || th->address != addr) {
1088                                 csum_error = true;
1089                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1090                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1091                                 break;
1092                             }
1093                         }
1094 
1095                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1096                             good_stripe = k;
1097                     }
1098                 }
1099             }
1100 
1101             pos += readlen;
1102             stripeoff[stripe] += readlen;
1103         }
1104 
1105         stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1106     }
1107 
1108     if (!csum_error) {
1109         Status = STATUS_SUCCESS;
1110         goto end;
1111     }
1112 
1113     for (j = 0; j < c->chunk_item->num_stripes; j += sub_stripes) {
1114         ULONG goodstripe = 0xffffffff;
1115         uint16_t k;
1116         bool hasbadstripe = false;
1117 
1118         if (context->stripes[j].length == 0)
1119             continue;
1120 
1121         for (k = 0; k < sub_stripes; k++) {
1122             if (c->devices[j + k]->devobj) {
1123                 if (!context->stripes[j + k].csum_error)
1124                     goodstripe = k;
1125                 else
1126                     hasbadstripe = true;
1127             }
1128         }
1129 
1130         if (hasbadstripe) {
1131             if (goodstripe != 0xffffffff) {
1132                 for (k = 0; k < sub_stripes; k++) {
1133                     if (c->devices[j + k]->devobj && context->stripes[j + k].csum_error) {
1134                         uint32_t so = 0;
1135                         bool recovered = false;
1136 
1137                         pos = 0;
1138 
1139                         stripe = startoffstripe;
1140                         while (pos < length) {
1141                             uint32_t readlen;
1142 
1143                             if (pos == 0)
1144                                 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1145                                               c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1146                             else
1147                                 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1148 
1149                             if (stripe == j / sub_stripes) {
1150                                 if (csum) {
1151                                     ULONG l;
1152 
1153                                     for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1154                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1155                                                              context->stripes[j + goodstripe].buf + so,
1156                                                              Vcb->superblock.sector_size) != Vcb->superblock.sector_size) {
1157                                             uint64_t addr = offset + pos;
1158 
1159                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1160 
1161                                             recovered = true;
1162                                         }
1163 
1164                                         pos += Vcb->superblock.sector_size;
1165                                         so += Vcb->superblock.sector_size;
1166                                     }
1167                                 } else {
1168                                     ULONG l;
1169 
1170                                     for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1171                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1172                                                             context->stripes[j + goodstripe].buf + so,
1173                                                             Vcb->superblock.node_size) != Vcb->superblock.node_size) {
1174                                             uint64_t addr = offset + pos;
1175 
1176                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1177 
1178                                             recovered = true;
1179                                         }
1180 
1181                                         pos += Vcb->superblock.node_size;
1182                                         so += Vcb->superblock.node_size;
1183                                     }
1184                                 }
1185                             } else
1186                                 pos += readlen;
1187 
1188                             stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1189                         }
1190 
1191                         if (recovered) {
1192                             // write good data over bad
1193 
1194                             if (!c->devices[j + k]->readonly) {
1195                                 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1196 
1197                                 Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1198                                                          context->stripes[j + goodstripe].buf, context->stripes[j + goodstripe].length);
1199 
1200                                 if (!NT_SUCCESS(Status)) {
1201                                     ERR("write_data_phys returned %08x\n", Status);
1202                                     log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1203                                     goto end;
1204                                 }
1205                             }
1206                         }
1207                     }
1208                 }
1209             } else {
1210                 uint32_t so = 0;
1211                 bool recovered = false;
1212 
1213                 if (csum) {
1214                     for (k = 0; k < sub_stripes; k++) {
1215                         if (c->devices[j + k]->devobj) {
1216                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * sizeof(uint32_t) / Vcb->superblock.sector_size, ALLOC_TAG);
1217                             if (!context->stripes[j + k].bad_csums) {
1218                                 ERR("out of memory\n");
1219                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1220                                 goto end;
1221                             }
1222 
1223                             Status = calc_csum(Vcb, context->stripes[j + k].buf, context->stripes[j + k].length / Vcb->superblock.sector_size, context->stripes[j + k].bad_csums);
1224                             if (!NT_SUCCESS(Status)) {
1225                                 ERR("calc_csum returned %08x\n", Status);
1226                                 goto end;
1227                             }
1228                         }
1229                     }
1230                 } else {
1231                     for (k = 0; k < sub_stripes; k++) {
1232                         if (c->devices[j + k]->devobj) {
1233                             ULONG l;
1234 
1235                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * sizeof(uint32_t) / Vcb->superblock.node_size, ALLOC_TAG);
1236                             if (!context->stripes[j + k].bad_csums) {
1237                                 ERR("out of memory\n");
1238                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1239                                 goto end;
1240                             }
1241 
1242                             for (l = 0; l < context->stripes[j + k].length / Vcb->superblock.node_size; l++) {
1243                                 tree_header* th = (tree_header*)&context->stripes[j + k].buf[l * Vcb->superblock.node_size];
1244                                 uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
1245 
1246                                 context->stripes[j + k].bad_csums[l] = crc32;
1247                             }
1248                         }
1249                     }
1250                 }
1251 
1252                 pos = 0;
1253 
1254                 stripe = startoffstripe;
1255                 while (pos < length) {
1256                     uint32_t readlen;
1257 
1258                     if (pos == 0)
1259                         readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1260                                       c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1261                     else
1262                         readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1263 
1264                     if (stripe == j / sub_stripes) {
1265                         ULONG l;
1266 
1267                         if (csum) {
1268                             for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1269                                 uint32_t crc32 = csum[pos / Vcb->superblock.sector_size];
1270                                 bool has_error = false;
1271 
1272                                 goodstripe = 0xffffffff;
1273                                 for (k = 0; k < sub_stripes; k++) {
1274                                     if (c->devices[j + k]->devobj) {
1275                                         if (context->stripes[j + k].bad_csums[so / Vcb->superblock.sector_size] != crc32)
1276                                             has_error = true;
1277                                         else
1278                                             goodstripe = k;
1279                                     }
1280                                 }
1281 
1282                                 if (has_error) {
1283                                     if (goodstripe != 0xffffffff) {
1284                                         for (k = 0; k < sub_stripes; k++) {
1285                                             if (c->devices[j + k]->devobj && context->stripes[j + k].bad_csums[so / Vcb->superblock.sector_size] != crc32) {
1286                                                 uint64_t addr = offset + pos;
1287 
1288                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1289 
1290                                                 recovered = true;
1291 
1292                                                 RtlCopyMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so,
1293                                                               Vcb->superblock.sector_size);
1294                                             }
1295                                         }
1296                                     } else {
1297                                         uint64_t addr = offset + pos;
1298 
1299                                         for (k = 0; k < sub_stripes; k++) {
1300                                             if (c->devices[j + j]->devobj) {
1301                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, false, false);
1302                                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1303                                             }
1304                                         }
1305                                     }
1306                                 }
1307 
1308                                 pos += Vcb->superblock.sector_size;
1309                                 so += Vcb->superblock.sector_size;
1310                             }
1311                         } else {
1312                             for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1313                                 for (k = 0; k < sub_stripes; k++) {
1314                                     if (c->devices[j + k]->devobj) {
1315                                         tree_header* th = (tree_header*)&context->stripes[j + k].buf[so];
1316                                         uint64_t addr = offset + pos;
1317 
1318                                         if (context->stripes[j + k].bad_csums[so / Vcb->superblock.node_size] != *((uint32_t*)th->csum) || th->address != addr) {
1319                                             ULONG m;
1320 
1321                                             recovered = false;
1322 
1323                                             for (m = 0; m < sub_stripes; m++) {
1324                                                 if (m != k) {
1325                                                     tree_header* th2 = (tree_header*)&context->stripes[j + m].buf[so];
1326 
1327                                                     if (context->stripes[j + m].bad_csums[so / Vcb->superblock.node_size] == *((uint32_t*)th2->csum) && th2->address == addr) {
1328                                                         log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1329 
1330                                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
1331 
1332                                                         recovered = true;
1333                                                         break;
1334                                                     } else
1335                                                         log_device_error(Vcb, c->devices[j + m], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1336                                                 }
1337                                             }
1338 
1339                                             if (!recovered)
1340                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, false, false);
1341                                         }
1342                                     }
1343                                 }
1344 
1345                                 pos += Vcb->superblock.node_size;
1346                                 so += Vcb->superblock.node_size;
1347                             }
1348                         }
1349                     } else
1350                         pos += readlen;
1351 
1352                     stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1353                 }
1354 
1355                 if (recovered) {
1356                     // write good data over bad
1357 
1358                     for (k = 0; k < sub_stripes; k++) {
1359                         if (c->devices[j + k]->devobj && !c->devices[j + k]->readonly) {
1360                             CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1361 
1362                             Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1363                                                      context->stripes[j + k].buf, context->stripes[j + k].length);
1364 
1365                             if (!NT_SUCCESS(Status)) {
1366                                 ERR("write_data_phys returned %08x\n", Status);
1367                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1368                                 goto end;
1369                             }
1370                         }
1371                     }
1372                 }
1373             }
1374         }
1375     }
1376 
1377     Status = STATUS_SUCCESS;
1378 
1379 end:
1380     ExFreePool(stripeoff);
1381 
1382     return Status;
1383 }
1384 
1385 static NTSTATUS scrub_extent(device_extension* Vcb, chunk* c, ULONG type, uint64_t offset, uint32_t size, uint32_t* csum) {
1386     ULONG i;
1387     scrub_context context;
1388     CHUNK_ITEM_STRIPE* cis;
1389     NTSTATUS Status;
1390     uint16_t startoffstripe, num_missing, allowed_missing;
1391 
1392     TRACE("(%p, %p, %I64x, %I64x, %p)\n", Vcb, c, offset, size, csum);
1393 
1394     context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
1395     if (!context.stripes) {
1396         ERR("out of memory\n");
1397         Status = STATUS_INSUFFICIENT_RESOURCES;
1398         goto end;
1399     }
1400 
1401     RtlZeroMemory(context.stripes, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes);
1402 
1403     context.stripes_left = 0;
1404 
1405     cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1406 
1407     if (type == BLOCK_FLAG_RAID0) {
1408         uint64_t startoff, endoff;
1409         uint16_t endoffstripe;
1410 
1411         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &startoff, &startoffstripe);
1412         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &endoff, &endoffstripe);
1413 
1414         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1415             if (startoffstripe > i)
1416                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1417             else if (startoffstripe == i)
1418                 context.stripes[i].start = startoff;
1419             else
1420                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1421 
1422             if (endoffstripe > i)
1423                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1424             else if (endoffstripe == i)
1425                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1426             else
1427                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1428         }
1429 
1430         allowed_missing = 0;
1431     } else if (type == BLOCK_FLAG_RAID10) {
1432         uint64_t startoff, endoff;
1433         uint16_t endoffstripe, j, sub_stripes = max(c->chunk_item->sub_stripes, 1);
1434 
1435         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &startoff, &startoffstripe);
1436         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &endoff, &endoffstripe);
1437 
1438         if ((c->chunk_item->num_stripes % sub_stripes) != 0) {
1439             ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", c->offset, c->chunk_item->num_stripes, sub_stripes);
1440             Status = STATUS_INTERNAL_ERROR;
1441             goto end;
1442         }
1443 
1444         startoffstripe *= sub_stripes;
1445         endoffstripe *= sub_stripes;
1446 
1447         for (i = 0; i < c->chunk_item->num_stripes; i += sub_stripes) {
1448             if (startoffstripe > i)
1449                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1450             else if (startoffstripe == i)
1451                 context.stripes[i].start = startoff;
1452             else
1453                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1454 
1455             if (endoffstripe > i)
1456                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1457             else if (endoffstripe == i)
1458                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1459             else
1460                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1461 
1462             for (j = 1; j < sub_stripes; j++) {
1463                 context.stripes[i+j].start = context.stripes[i].start;
1464                 context.stripes[i+j].length = context.stripes[i].length;
1465             }
1466         }
1467 
1468         startoffstripe /= sub_stripes;
1469         allowed_missing = 1;
1470     } else
1471         allowed_missing = c->chunk_item->num_stripes - 1;
1472 
1473     num_missing = 0;
1474 
1475     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1476         PIO_STACK_LOCATION IrpSp;
1477 
1478         context.stripes[i].context = (struct _scrub_context*)&context;
1479 
1480         if (type == BLOCK_FLAG_DUPLICATE) {
1481             context.stripes[i].start = offset - c->offset;
1482             context.stripes[i].length = size;
1483         } else if (type != BLOCK_FLAG_RAID0 && type != BLOCK_FLAG_RAID10) {
1484             ERR("unexpected chunk type %x\n", type);
1485             Status = STATUS_INTERNAL_ERROR;
1486             goto end;
1487         }
1488 
1489         if (!c->devices[i]->devobj) {
1490             num_missing++;
1491 
1492             if (num_missing > allowed_missing) {
1493                 ERR("too many missing devices (at least %u, maximum allowed %u)\n", num_missing, allowed_missing);
1494                 Status = STATUS_INTERNAL_ERROR;
1495                 goto end;
1496             }
1497         } else if (context.stripes[i].length > 0) {
1498             context.stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1499 
1500             if (!context.stripes[i].buf) {
1501                 ERR("out of memory\n");
1502                 Status = STATUS_INSUFFICIENT_RESOURCES;
1503                 goto end;
1504             }
1505 
1506             context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
1507 
1508             if (!context.stripes[i].Irp) {
1509                 ERR("IoAllocateIrp failed\n");
1510                 Status = STATUS_INSUFFICIENT_RESOURCES;
1511                 goto end;
1512             }
1513 
1514             IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
1515             IrpSp->MajorFunction = IRP_MJ_READ;
1516             IrpSp->FileObject = c->devices[i]->fileobj;
1517 
1518             if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
1519                 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1520                 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
1521                     ERR("out of memory\n");
1522                     Status = STATUS_INSUFFICIENT_RESOURCES;
1523                     goto end;
1524                 }
1525 
1526                 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
1527 
1528                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1529             } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
1530                 context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, context.stripes[i].length, false, false, NULL);
1531                 if (!context.stripes[i].Irp->MdlAddress) {
1532                     ERR("IoAllocateMdl failed\n");
1533                     Status = STATUS_INSUFFICIENT_RESOURCES;
1534                     goto end;
1535                 }
1536 
1537                 Status = STATUS_SUCCESS;
1538 
1539                 _SEH2_TRY {
1540                     MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
1541                 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1542                     Status = _SEH2_GetExceptionCode();
1543                 } _SEH2_END;
1544 
1545                 if (!NT_SUCCESS(Status)) {
1546                     ERR("MmProbeAndLockPages threw exception %08x\n", Status);
1547                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1548                     context.stripes[i].Irp->MdlAddress = NULL;
1549                     goto end;
1550                 }
1551             } else
1552                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1553 
1554             IrpSp->Parameters.Read.Length = context.stripes[i].length;
1555             IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].start + cis[i].offset;
1556 
1557             context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
1558 
1559             IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion, &context.stripes[i], true, true, true);
1560 
1561             context.stripes_left++;
1562 
1563             Vcb->scrub.data_scrubbed += context.stripes[i].length;
1564         }
1565     }
1566 
1567     if (context.stripes_left == 0) {
1568         ERR("error - not reading any stripes\n");
1569         Status = STATUS_INTERNAL_ERROR;
1570         goto end;
1571     }
1572 
1573     KeInitializeEvent(&context.Event, NotificationEvent, false);
1574 
1575     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1576         if (c->devices[i]->devobj && context.stripes[i].length > 0)
1577             IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
1578     }
1579 
1580     KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
1581 
1582     // return an error if any of the stripes returned an error
1583     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1584         if (!NT_SUCCESS(context.stripes[i].iosb.Status)) {
1585             Status = context.stripes[i].iosb.Status;
1586             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
1587             goto end;
1588         }
1589     }
1590 
1591     if (type == BLOCK_FLAG_DUPLICATE) {
1592         Status = scrub_extent_dup(Vcb, c, offset, csum, &context);
1593         if (!NT_SUCCESS(Status)) {
1594             ERR("scrub_extent_dup returned %08x\n", Status);
1595             goto end;
1596         }
1597     } else if (type == BLOCK_FLAG_RAID0) {
1598         Status = scrub_extent_raid0(Vcb, c, offset, size, startoffstripe, csum, &context);
1599         if (!NT_SUCCESS(Status)) {
1600             ERR("scrub_extent_raid0 returned %08x\n", Status);
1601             goto end;
1602         }
1603     } else if (type == BLOCK_FLAG_RAID10) {
1604         Status = scrub_extent_raid10(Vcb, c, offset, size, startoffstripe, csum, &context);
1605         if (!NT_SUCCESS(Status)) {
1606             ERR("scrub_extent_raid10 returned %08x\n", Status);
1607             goto end;
1608         }
1609     }
1610 
1611 end:
1612     if (context.stripes) {
1613         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1614             if (context.stripes[i].Irp) {
1615                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
1616                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
1617                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1618                 }
1619                 IoFreeIrp(context.stripes[i].Irp);
1620             }
1621 
1622             if (context.stripes[i].buf)
1623                 ExFreePool(context.stripes[i].buf);
1624 
1625             if (context.stripes[i].bad_csums)
1626                 ExFreePool(context.stripes[i].bad_csums);
1627         }
1628 
1629         ExFreePool(context.stripes);
1630     }
1631 
1632     return Status;
1633 }
1634 
1635 static NTSTATUS scrub_data_extent(device_extension* Vcb, chunk* c, uint64_t offset, ULONG type, uint32_t* csum, RTL_BITMAP* bmp, ULONG bmplen) {
1636     NTSTATUS Status;
1637     ULONG runlength, index;
1638 
1639     runlength = RtlFindFirstRunClear(bmp, &index);
1640 
1641     while (runlength != 0) {
1642         if (index >= bmplen)
1643             break;
1644 
1645         if (index + runlength >= bmplen) {
1646             runlength = bmplen - index;
1647 
1648             if (runlength == 0)
1649                 break;
1650         }
1651 
1652         do {
1653             ULONG rl;
1654 
1655             if (runlength * Vcb->superblock.sector_size > SCRUB_UNIT)
1656                 rl = SCRUB_UNIT / Vcb->superblock.sector_size;
1657             else
1658                 rl = runlength;
1659 
1660             Status = scrub_extent(Vcb, c, type, offset + UInt32x32To64(index, Vcb->superblock.sector_size), rl * Vcb->superblock.sector_size, &csum[index]);
1661             if (!NT_SUCCESS(Status)) {
1662                 ERR("scrub_data_extent_dup returned %08x\n", Status);
1663                 return Status;
1664             }
1665 
1666             runlength -= rl;
1667             index += rl;
1668         } while (runlength > 0);
1669 
1670         runlength = RtlFindNextForwardRunClear(bmp, index, &index);
1671     }
1672 
1673     return STATUS_SUCCESS;
1674 }
1675 
1676 typedef struct {
1677     uint8_t* buf;
1678     PIRP Irp;
1679     void* context;
1680     IO_STATUS_BLOCK iosb;
1681     uint64_t offset;
1682     bool rewrite, missing;
1683     RTL_BITMAP error;
1684     ULONG* errorarr;
1685 } scrub_context_raid56_stripe;
1686 
1687 typedef struct {
1688     scrub_context_raid56_stripe* stripes;
1689     LONG stripes_left;
1690     KEVENT Event;
1691     RTL_BITMAP alloc;
1692     RTL_BITMAP has_csum;
1693     RTL_BITMAP is_tree;
1694     uint32_t* csum;
1695     uint8_t* parity_scratch;
1696     uint8_t* parity_scratch2;
1697 } scrub_context_raid56;
1698 
1699 _Function_class_(IO_COMPLETION_ROUTINE)
1700 static NTSTATUS __stdcall scrub_read_completion_raid56(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
1701     scrub_context_raid56_stripe* stripe = conptr;
1702     scrub_context_raid56* context = (scrub_context_raid56*)stripe->context;
1703     LONG left = InterlockedDecrement(&context->stripes_left);
1704 
1705     UNUSED(DeviceObject);
1706 
1707     stripe->iosb = Irp->IoStatus;
1708 
1709     if (left == 0)
1710         KeSetEvent(&context->Event, 0, false);
1711 
1712     return STATUS_MORE_PROCESSING_REQUIRED;
1713 }
1714 
1715 static void scrub_raid5_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1716                                uint64_t num, uint16_t missing_devices) {
1717     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off;
1718     uint16_t stripe, parity = (bit_start + num + c->chunk_item->num_stripes - 1) % c->chunk_item->num_stripes;
1719     uint64_t stripeoff;
1720 
1721     stripe = (parity + 1) % c->chunk_item->num_stripes;
1722     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1);
1723     stripeoff = num * sectors_per_stripe;
1724 
1725     if (missing_devices == 0)
1726         RtlCopyMemory(context->parity_scratch, &context->stripes[parity].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1727 
1728     while (stripe != parity) {
1729         RtlClearAllBits(&context->stripes[stripe].error);
1730 
1731         for (i = 0; i < sectors_per_stripe; i++) {
1732             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1733                 if (RtlCheckBit(&context->is_tree, off)) {
1734                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size];
1735                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1736                     uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
1737 
1738                     if (crc32 != *((uint32_t*)th->csum) || th->address != addr) {
1739                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size);
1740                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1741 
1742                         if (missing_devices > 0)
1743                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1744                     }
1745 
1746                     off += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1747                     stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1748                     i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1;
1749 
1750                     continue;
1751                 } else if (RtlCheckBit(&context->has_csum, off)) {
1752                     uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1753 
1754                     if (crc32 != context->csum[off]) {
1755                         RtlSetBit(&context->stripes[stripe].error, i);
1756                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1757 
1758                         if (missing_devices > 0) {
1759                             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1760 
1761                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1762                         }
1763                     }
1764                 }
1765             }
1766 
1767             off++;
1768             stripeoff++;
1769         }
1770 
1771         if (missing_devices == 0)
1772             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1773 
1774         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1775         stripeoff = num * sectors_per_stripe;
1776     }
1777 
1778     // check parity
1779 
1780     if (missing_devices == 0) {
1781         RtlClearAllBits(&context->stripes[parity].error);
1782 
1783         for (i = 0; i < sectors_per_stripe; i++) {
1784             ULONG o, j;
1785 
1786             o = i * Vcb->superblock.sector_size;
1787             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1788                 if (context->parity_scratch[o] != 0) {
1789                     RtlSetBit(&context->stripes[parity].error, i);
1790                     break;
1791                 }
1792                 o++;
1793             }
1794         }
1795     }
1796 
1797     // log and fix errors
1798 
1799     if (missing_devices > 0)
1800         return;
1801 
1802     for (i = 0; i < sectors_per_stripe; i++) {
1803         ULONG num_errors = 0, bad_off;
1804         uint64_t bad_stripe;
1805         bool alloc = false;
1806 
1807         stripe = (parity + 1) % c->chunk_item->num_stripes;
1808         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1809 
1810         while (stripe != parity) {
1811             if (RtlCheckBit(&context->alloc, off)) {
1812                 alloc = true;
1813 
1814                 if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1815                     bad_stripe = stripe;
1816                     bad_off = off;
1817                     num_errors++;
1818                 }
1819             }
1820 
1821             off += sectors_per_stripe;
1822             stripe = (stripe + 1) % c->chunk_item->num_stripes;
1823         }
1824 
1825         if (!alloc)
1826             continue;
1827 
1828         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity].error, i)) // everything fine
1829             continue;
1830 
1831         if (num_errors == 0 && RtlCheckBit(&context->stripes[parity].error, i)) { // parity error
1832             uint64_t addr;
1833 
1834             do_xor(&context->stripes[parity].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1835                    &context->parity_scratch[i * Vcb->superblock.sector_size],
1836                    Vcb->superblock.sector_size);
1837 
1838             bad_off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1839             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size);
1840 
1841             context->stripes[parity].rewrite = true;
1842 
1843             log_error(Vcb, addr, c->devices[parity]->devitem.dev_id, false, true, true);
1844             log_device_error(Vcb, c->devices[parity], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1845         } else if (num_errors == 1) {
1846             uint32_t crc32;
1847             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size);
1848 
1849             if (RtlCheckBit(&context->is_tree, bad_off)) {
1850                 tree_header* th;
1851 
1852                 do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
1853                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1854                        Vcb->superblock.node_size);
1855 
1856                 th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
1857                 crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
1858 
1859                 if (crc32 == *((uint32_t*)th->csum) && th->address == addr) {
1860                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1861                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
1862 
1863                     context->stripes[bad_stripe].rewrite = true;
1864 
1865                     RtlClearBits(&context->stripes[bad_stripe].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
1866 
1867                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, true, false);
1868                 } else
1869                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, false, false);
1870             } else {
1871                 do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
1872                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1873                        Vcb->superblock.sector_size);
1874 
1875                 crc32 = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
1876 
1877                 if (crc32 == context->csum[bad_off]) {
1878                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1879                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
1880 
1881                     context->stripes[bad_stripe].rewrite = true;
1882 
1883                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, true, false);
1884                 } else
1885                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, false, false);
1886             }
1887         } else {
1888             stripe = (parity + 1) % c->chunk_item->num_stripes;
1889             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1890 
1891             while (stripe != parity) {
1892                 if (RtlCheckBit(&context->alloc, off)) {
1893                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1894                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1895 
1896                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
1897                     }
1898                 }
1899 
1900                 off += sectors_per_stripe;
1901                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
1902             }
1903         }
1904     }
1905 }
1906 
1907 static void scrub_raid6_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1908                                uint64_t num, uint16_t missing_devices) {
1909     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off;
1910     uint16_t stripe, parity1 = (bit_start + num + c->chunk_item->num_stripes - 2) % c->chunk_item->num_stripes;
1911     uint16_t parity2 = (parity1 + 1) % c->chunk_item->num_stripes;
1912     uint64_t stripeoff;
1913 
1914     stripe = (parity1 + 2) % c->chunk_item->num_stripes;
1915     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2);
1916     stripeoff = num * sectors_per_stripe;
1917 
1918     if (c->devices[parity1]->devobj)
1919         RtlCopyMemory(context->parity_scratch, &context->stripes[parity1].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1920 
1921     if (c->devices[parity2]->devobj)
1922         RtlZeroMemory(context->parity_scratch2, (ULONG)c->chunk_item->stripe_length);
1923 
1924     while (stripe != parity1) {
1925         RtlClearAllBits(&context->stripes[stripe].error);
1926 
1927         for (i = 0; i < sectors_per_stripe; i++) {
1928             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1929                 if (RtlCheckBit(&context->is_tree, off)) {
1930                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size];
1931                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1932                     uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
1933 
1934                     if (crc32 != *((uint32_t*)th->csum) || th->address != addr) {
1935                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size);
1936                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1937 
1938                         if (missing_devices == 2)
1939                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1940                     }
1941 
1942                     off += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1943                     stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1944                     i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1;
1945 
1946                     continue;
1947                 } else if (RtlCheckBit(&context->has_csum, off)) {
1948                     uint32_t crc32 = ~calc_crc32c(0xffffffff, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1949 
1950                     if (crc32 != context->csum[off]) {
1951                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1952 
1953                         RtlSetBit(&context->stripes[stripe].error, i);
1954                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1955 
1956                         if (missing_devices == 2)
1957                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1958                     }
1959                 }
1960             }
1961 
1962             off++;
1963             stripeoff++;
1964         }
1965 
1966         if (c->devices[parity1]->devobj)
1967             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
1968 
1969         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1970         stripeoff = num * sectors_per_stripe;
1971     }
1972 
1973     RtlClearAllBits(&context->stripes[parity1].error);
1974 
1975     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity2]->devobj)) {
1976         // check parity 1
1977 
1978         for (i = 0; i < sectors_per_stripe; i++) {
1979             ULONG o, j;
1980 
1981             o = i * Vcb->superblock.sector_size;
1982             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1983                 if (context->parity_scratch[o] != 0) {
1984                     RtlSetBit(&context->stripes[parity1].error, i);
1985                     break;
1986                 }
1987                 o++;
1988             }
1989         }
1990     }
1991 
1992     RtlClearAllBits(&context->stripes[parity2].error);
1993 
1994     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity1]->devobj)) {
1995         // check parity 2
1996 
1997         stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
1998 
1999         while (stripe != parity2) {
2000             galois_double(context->parity_scratch2, (uint32_t)c->chunk_item->stripe_length);
2001             do_xor(context->parity_scratch2, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
2002 
2003             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2004         }
2005 
2006         for (i = 0; i < sectors_per_stripe; i++) {
2007             if (RtlCompareMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2008                                 &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size) != Vcb->superblock.sector_size)
2009                 RtlSetBit(&context->stripes[parity2].error, i);
2010         }
2011     }
2012 
2013     if (missing_devices == 2)
2014         return;
2015 
2016     // log and fix errors
2017 
2018     for (i = 0; i < sectors_per_stripe; i++) {
2019         ULONG num_errors = 0;
2020         uint64_t bad_stripe1, bad_stripe2;
2021         ULONG bad_off1, bad_off2;
2022         bool alloc = false;
2023 
2024         stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2025         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2026 
2027         while (stripe != parity1) {
2028             if (RtlCheckBit(&context->alloc, off)) {
2029                 alloc = true;
2030 
2031                 if (!c->devices[stripe]->devobj || RtlCheckBit(&context->stripes[stripe].error, i)) {
2032                     if (num_errors == 0) {
2033                         bad_stripe1 = stripe;
2034                         bad_off1 = off;
2035                     } else if (num_errors == 1) {
2036                         bad_stripe2 = stripe;
2037                         bad_off2 = off;
2038                     }
2039                     num_errors++;
2040                 }
2041             }
2042 
2043             off += sectors_per_stripe;
2044             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2045         }
2046 
2047         if (!alloc)
2048             continue;
2049 
2050         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity1].error, i) && !RtlCheckBit(&context->stripes[parity2].error, i)) // everything fine
2051             continue;
2052 
2053         if (num_errors == 0) { // parity error
2054             uint64_t addr;
2055 
2056             if (RtlCheckBit(&context->stripes[parity1].error, i)) {
2057                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2058                        &context->parity_scratch[i * Vcb->superblock.sector_size],
2059                        Vcb->superblock.sector_size);
2060 
2061                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2062                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2063 
2064                 context->stripes[parity1].rewrite = true;
2065 
2066                 log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2067                 log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2068             }
2069 
2070             if (RtlCheckBit(&context->stripes[parity2].error, i)) {
2071                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2072                               &context->parity_scratch2[i * Vcb->superblock.sector_size],
2073                               Vcb->superblock.sector_size);
2074 
2075                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2076                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2077 
2078                 context->stripes[parity2].rewrite = true;
2079 
2080                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2081                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2082             }
2083         } else if (num_errors == 1) {
2084             uint32_t crc32a, crc32b, len;
2085             uint16_t stripe_num, bad_stripe_num;
2086             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2087             uint8_t* scratch;
2088 
2089             len = RtlCheckBit(&context->is_tree, bad_off1)? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2090 
2091             scratch = ExAllocatePoolWithTag(PagedPool, len, ALLOC_TAG);
2092             if (!scratch) {
2093                 ERR("out of memory\n");
2094                 return;
2095             }
2096 
2097             RtlZeroMemory(scratch, len);
2098 
2099             do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
2100                    &context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2101 
2102             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2103 
2104             if (c->devices[parity2]->devobj) {
2105                 stripe_num = c->chunk_item->num_stripes - 3;
2106                 while (stripe != parity2) {
2107                     galois_double(scratch, len);
2108 
2109                     if (stripe != bad_stripe1)
2110                         do_xor(scratch, &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2111                     else
2112                         bad_stripe_num = stripe_num;
2113 
2114                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2115                     stripe_num--;
2116                 }
2117 
2118                 do_xor(scratch, &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2119 
2120                 if (bad_stripe_num != 0)
2121                     galois_divpower(scratch, (uint8_t)bad_stripe_num, len);
2122             }
2123 
2124             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2125                 tree_header *th1 = NULL, *th2 = NULL;
2126 
2127                 if (c->devices[parity1]->devobj) {
2128                     th1 = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
2129                     crc32a = ~calc_crc32c(0xffffffff, (uint8_t*)&th1->fs_uuid, Vcb->superblock.node_size - sizeof(th1->csum));
2130                 }
2131 
2132                 if (c->devices[parity2]->devobj) {
2133                     th2 = (tree_header*)scratch;
2134                     crc32b = ~calc_crc32c(0xffffffff, (uint8_t*)&th2->fs_uuid, Vcb->superblock.node_size - sizeof(th2->csum));
2135                 }
2136 
2137                 if ((c->devices[parity1]->devobj && crc32a == *((uint32_t*)th1->csum) && th1->address == addr) ||
2138                     (c->devices[parity2]->devobj && crc32b == *((uint32_t*)th2->csum) && th2->address == addr)) {
2139                     if (!c->devices[parity1]->devobj || crc32a != *((uint32_t*)th1->csum) || th1->address != addr) {
2140                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2141                                       scratch, Vcb->superblock.node_size);
2142 
2143                         if (c->devices[parity1]->devobj) {
2144                             // fix parity 1
2145 
2146                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2147 
2148                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2149                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2150                                           Vcb->superblock.node_size);
2151 
2152                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2153 
2154                             while (stripe != parity1) {
2155                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2156                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2157                                        Vcb->superblock.node_size);
2158 
2159                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2160                             }
2161 
2162                             context->stripes[parity1].rewrite = true;
2163 
2164                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2165                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2166                         }
2167                     } else {
2168                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2169                                       &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2170 
2171                         if (!c->devices[parity2]->devobj || crc32b != *((uint32_t*)th2->csum) || th2->address != addr) {
2172                             // fix parity 2
2173                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2174 
2175                             if (c->devices[parity2]->devobj) {
2176                                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2177                                             &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2178                                             Vcb->superblock.node_size);
2179 
2180                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2181 
2182                                 while (stripe != parity2) {
2183                                     galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size);
2184 
2185                                     do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2186                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2187                                         Vcb->superblock.node_size);
2188 
2189                                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2190                                 }
2191 
2192                                 context->stripes[parity2].rewrite = true;
2193 
2194                                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2195                                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2196                             }
2197                         }
2198                     }
2199 
2200                     context->stripes[bad_stripe1].rewrite = true;
2201 
2202                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2203 
2204                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2205                 } else
2206                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2207             } else {
2208                 if (c->devices[parity1]->devobj)
2209                     crc32a = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2210 
2211                 if (c->devices[parity2]->devobj)
2212                     crc32b = ~calc_crc32c(0xffffffff, scratch, Vcb->superblock.sector_size);
2213 
2214                 if ((c->devices[parity1]->devobj && crc32a == context->csum[bad_off1]) || (c->devices[parity2]->devobj && crc32b == context->csum[bad_off1])) {
2215                     if (c->devices[parity2]->devobj && crc32b == context->csum[bad_off1]) {
2216                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2217                                       scratch, Vcb->superblock.sector_size);
2218 
2219                         if (c->devices[parity1]->devobj && crc32a != context->csum[bad_off1]) {
2220                             // fix parity 1
2221 
2222                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2223 
2224                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2225                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2226                                         Vcb->superblock.sector_size);
2227 
2228                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2229 
2230                             while (stripe != parity1) {
2231                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2232                                     &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2233                                     Vcb->superblock.sector_size);
2234 
2235                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2236                             }
2237 
2238                             context->stripes[parity1].rewrite = true;
2239 
2240                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2241                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2242                         }
2243                     } else {
2244                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2245                                       &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2246 
2247                         if (c->devices[parity2]->devobj && crc32b != context->csum[bad_off1]) {
2248                             // fix parity 2
2249                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2250 
2251                             RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2252                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2253                                         Vcb->superblock.sector_size);
2254 
2255                             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2256 
2257                             while (stripe != parity2) {
2258                                 galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
2259 
2260                                 do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2261                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2262                                        Vcb->superblock.sector_size);
2263 
2264                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2265                             }
2266 
2267                             context->stripes[parity2].rewrite = true;
2268 
2269                             log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2270                             log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2271                         }
2272                     }
2273 
2274                     context->stripes[bad_stripe1].rewrite = true;
2275 
2276                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2277                 } else
2278                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2279             }
2280 
2281             ExFreePool(scratch);
2282         } else if (num_errors == 2 && missing_devices == 0) {
2283             uint16_t x, y, k;
2284             uint64_t addr;
2285             uint32_t len = (RtlCheckBit(&context->is_tree, bad_off1) || RtlCheckBit(&context->is_tree, bad_off2)) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2286             uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
2287             uint32_t j;
2288 
2289             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2290 
2291             // put qxy in parity_scratch
2292             // put pxy in parity_scratch2
2293 
2294             k = c->chunk_item->num_stripes - 3;
2295             if (stripe == bad_stripe1 || stripe == bad_stripe2) {
2296                 RtlZeroMemory(&context->parity_scratch[i * Vcb->superblock.sector_size], len);
2297                 RtlZeroMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size], len);
2298 
2299                 if (stripe == bad_stripe1)
2300                     x = k;
2301                 else
2302                     y = k;
2303             } else {
2304                 RtlCopyMemory(&context->parity_scratch[i * Vcb->superblock.sector_size],
2305                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2306                 RtlCopyMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size],
2307                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2308             }
2309 
2310             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2311 
2312             k--;
2313             do {
2314                 galois_double(&context->parity_scratch[i * Vcb->superblock.sector_size], len);
2315 
2316                 if (stripe != bad_stripe1 && stripe != bad_stripe2) {
2317                     do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
2318                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2319                     do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size],
2320                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2321                 } else if (stripe == bad_stripe1)
2322                     x = k;
2323                 else if (stripe == bad_stripe2)
2324                     y = k;
2325 
2326                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2327                 k--;
2328             } while (stripe != parity2);
2329 
2330             gyx = gpow2(y > x ? (y-x) : (255-x+y));
2331             gx = gpow2(255-x);
2332 
2333             denom = gdiv(1, gyx ^ 1);
2334             a = gmul(gyx, denom);
2335             b = gmul(gx, denom);
2336 
2337             p = &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)];
2338             q = &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)];
2339             pxy = &context->parity_scratch2[i * Vcb->superblock.sector_size];
2340             qxy = &context->parity_scratch[i * Vcb->superblock.sector_size];
2341 
2342             for (j = 0; j < len; j++) {
2343                 *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);
2344 
2345                 p++;
2346                 q++;
2347                 pxy++;
2348                 qxy++;
2349             }
2350 
2351             do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->parity_scratch[i * Vcb->superblock.sector_size], len);
2352             do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2353 
2354             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2355 
2356             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2357                 tree_header* th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
2358                 uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
2359 
2360                 if (crc32 == *((uint32_t*)th->csum) && th->address == addr) {
2361                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2362                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2363 
2364                     context->stripes[bad_stripe1].rewrite = true;
2365 
2366                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2367 
2368                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2369                 } else
2370                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2371             } else {
2372                 uint32_t crc32 = ~calc_crc32c(0xffffffff, &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2373 
2374                 if (crc32 == context->csum[bad_off1]) {
2375                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2376                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2377 
2378                     context->stripes[bad_stripe1].rewrite = true;
2379 
2380                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2381                 } else
2382                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2383             }
2384 
2385             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off2 * Vcb->superblock.sector_size);
2386 
2387             if (RtlCheckBit(&context->is_tree, bad_off2)) {
2388                 tree_header* th = (tree_header*)&context->parity_scratch2[i * Vcb->superblock.sector_size];
2389                 uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
2390 
2391                 if (crc32 == *((uint32_t*)th->csum) && th->address == addr) {
2392                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2393                                   &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2394 
2395                     context->stripes[bad_stripe2].rewrite = true;
2396 
2397                     RtlClearBits(&context->stripes[bad_stripe2].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2398 
2399                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, true, false);
2400                 } else
2401                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, false, false);
2402             } else {
2403                 uint32_t crc32 = ~calc_crc32c(0xffffffff, &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2404 
2405                 if (crc32 == context->csum[bad_off2]) {
2406                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2407                                   &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2408 
2409                     context->stripes[bad_stripe2].rewrite = true;
2410 
2411                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, true, false);
2412                 } else
2413                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, false, false);
2414             }
2415         } else {
2416             stripe = (parity2 + 1) % c->chunk_item->num_stripes;
2417             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2418 
2419             while (stripe != parity1) {
2420                 if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
2421                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
2422                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
2423 
2424                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
2425                     }
2426                 }
2427 
2428                 off += sectors_per_stripe;
2429                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2430             }
2431         }
2432     }
2433 }
2434 
2435 static NTSTATUS scrub_chunk_raid56_stripe_run(device_extension* Vcb, chunk* c, uint64_t stripe_start, uint64_t stripe_end) {
2436     NTSTATUS Status;
2437     KEY searchkey;
2438     traverse_ptr tp;
2439     bool b;
2440     uint64_t run_start, run_end, full_stripe_len, stripe;
2441     uint32_t max_read, num_sectors;
2442     ULONG arrlen, *allocarr, *csumarr = NULL, *treearr, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2443     scrub_context_raid56 context;
2444     uint16_t i;
2445     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2446 
2447     TRACE("(%p, %p, %I64x, %I64x)\n", Vcb, c, stripe_start, stripe_end);
2448 
2449     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2450     run_start = c->offset + (stripe_start * full_stripe_len);
2451     run_end = c->offset + ((stripe_end + 1) * full_stripe_len);
2452 
2453     searchkey.obj_id = run_start;
2454     searchkey.obj_type = TYPE_METADATA_ITEM;
2455     searchkey.offset = 0xffffffffffffffff;
2456 
2457     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2458     if (!NT_SUCCESS(Status)) {
2459         ERR("find_item returned %08x\n", Status);
2460         return Status;
2461     }
2462 
2463     num_sectors = (uint32_t)((stripe_end - stripe_start + 1) * full_stripe_len / Vcb->superblock.sector_size);
2464     arrlen = (ULONG)sector_align((num_sectors / 8) + 1, sizeof(ULONG));
2465 
2466     allocarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2467     if (!allocarr) {
2468         ERR("out of memory\n");
2469         return STATUS_INSUFFICIENT_RESOURCES;
2470     }
2471 
2472     treearr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2473     if (!treearr) {
2474         ERR("out of memory\n");
2475         ExFreePool(allocarr);
2476         return STATUS_INSUFFICIENT_RESOURCES;
2477     }
2478 
2479     RtlInitializeBitMap(&context.alloc, allocarr, num_sectors);
2480     RtlClearAllBits(&context.alloc);
2481 
2482     RtlInitializeBitMap(&context.is_tree, treearr, num_sectors);
2483     RtlClearAllBits(&context.is_tree);
2484 
2485     context.parity_scratch = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2486     if (!context.parity_scratch) {
2487         ERR("out of memory\n");
2488         ExFreePool(allocarr);
2489         ExFreePool(treearr);
2490         return STATUS_INSUFFICIENT_RESOURCES;
2491     }
2492 
2493     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2494         csumarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2495         if (!csumarr) {
2496             ERR("out of memory\n");
2497             ExFreePool(allocarr);
2498             ExFreePool(treearr);
2499             ExFreePool(context.parity_scratch);
2500             return STATUS_INSUFFICIENT_RESOURCES;
2501         }
2502 
2503         RtlInitializeBitMap(&context.has_csum, csumarr, num_sectors);
2504         RtlClearAllBits(&context.has_csum);
2505 
2506         context.csum = ExAllocatePoolWithTag(PagedPool, num_sectors * sizeof(uint32_t), ALLOC_TAG);
2507         if (!context.csum) {
2508             ERR("out of memory\n");
2509             ExFreePool(allocarr);
2510             ExFreePool(treearr);
2511             ExFreePool(context.parity_scratch);
2512             ExFreePool(csumarr);
2513             return STATUS_INSUFFICIENT_RESOURCES;
2514         }
2515     }
2516 
2517     if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2518         context.parity_scratch2 = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2519         if (!context.parity_scratch2) {
2520             ERR("out of memory\n");
2521             ExFreePool(allocarr);
2522             ExFreePool(treearr);
2523             ExFreePool(context.parity_scratch);
2524 
2525             if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2526                 ExFreePool(csumarr);
2527                 ExFreePool(context.csum);
2528             }
2529 
2530             return STATUS_INSUFFICIENT_RESOURCES;
2531         }
2532     }
2533 
2534     do {
2535         traverse_ptr next_tp;
2536 
2537         if (tp.item->key.obj_id >= run_end)
2538             break;
2539 
2540         if (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM) {
2541             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2542 
2543             if (tp.item->key.obj_id + size > run_start) {
2544                 uint64_t extent_start = max(run_start, tp.item->key.obj_id);
2545                 uint64_t extent_end = min(tp.item->key.obj_id + size, run_end);
2546                 bool extent_is_tree = false;
2547 
2548                 RtlSetBits(&context.alloc, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size));
2549 
2550                 if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2551                     extent_is_tree = true;
2552                 else {
2553                     EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2554 
2555                     if (tp.item->size < sizeof(EXTENT_ITEM)) {
2556                         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2557                         Status = STATUS_INTERNAL_ERROR;
2558                         goto end;
2559                     }
2560 
2561                     if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2562                         extent_is_tree = true;
2563                 }
2564 
2565                 if (extent_is_tree)
2566                     RtlSetBits(&context.is_tree, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size));
2567                 else if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2568                     traverse_ptr tp2;
2569                     bool b2;
2570 
2571                     searchkey.obj_id = EXTENT_CSUM_ID;
2572                     searchkey.obj_type = TYPE_EXTENT_CSUM;
2573                     searchkey.offset = extent_start;
2574 
2575                     Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
2576                     if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
2577                         ERR("find_item returned %08x\n", Status);
2578                         goto end;
2579                     }
2580 
2581                     do {
2582                         traverse_ptr next_tp2;
2583 
2584                         if (tp2.item->key.offset >= extent_end)
2585                             break;
2586 
2587                         if (tp2.item->key.offset >= extent_start) {
2588                             uint64_t csum_start = max(extent_start, tp2.item->key.offset);
2589                             uint64_t csum_end = min(extent_end, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
2590 
2591                             RtlSetBits(&context.has_csum, (ULONG)((csum_start - run_start) / Vcb->superblock.sector_size), (ULONG)((csum_end - csum_start) / Vcb->superblock.sector_size));
2592 
2593                             RtlCopyMemory(&context.csum[(csum_start - run_start) / Vcb->superblock.sector_size],
2594                                           tp2.item->data + ((csum_start - tp2.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
2595                                           (ULONG)((csum_end - csum_start) * sizeof(uint32_t) / Vcb->superblock.sector_size));
2596                         }
2597 
2598                         b2 = find_next_item(Vcb, &tp2, &next_tp2, false, NULL);
2599 
2600                         if (b2)
2601                             tp2 = next_tp2;
2602                     } while (b2);
2603                 }
2604             }
2605         }
2606 
2607         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2608 
2609         if (b)
2610             tp = next_tp;
2611     } while (b);
2612 
2613     context.stripes = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_context_raid56_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
2614     if (!context.stripes) {
2615         ERR("out of memory\n");
2616         Status = STATUS_INSUFFICIENT_RESOURCES;
2617         goto end;
2618     }
2619 
2620     max_read = (uint32_t)min(1048576 / c->chunk_item->stripe_length, stripe_end - stripe_start + 1); // only process 1 MB of data at a time
2621 
2622     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2623         context.stripes[i].buf = ExAllocatePoolWithTag(PagedPool, (ULONG)(max_read * c->chunk_item->stripe_length), ALLOC_TAG);
2624         if (!context.stripes[i].buf) {
2625             uint64_t j;
2626 
2627             ERR("out of memory\n");
2628 
2629             for (j = 0; j < i; j++) {
2630                 ExFreePool(context.stripes[j].buf);
2631             }
2632             ExFreePool(context.stripes);
2633 
2634             Status = STATUS_INSUFFICIENT_RESOURCES;
2635             goto end;
2636         }
2637 
2638         context.stripes[i].errorarr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(((c->chunk_item->stripe_length / Vcb->superblock.sector_size) / 8) + 1, sizeof(ULONG)), ALLOC_TAG);
2639         if (!context.stripes[i].errorarr) {
2640             uint64_t j;
2641 
2642             ERR("out of memory\n");
2643 
2644             ExFreePool(context.stripes[i].buf);
2645 
2646             for (j = 0; j < i; j++) {
2647                 ExFreePool(context.stripes[j].buf);
2648             }
2649             ExFreePool(context.stripes);
2650 
2651             Status = STATUS_INSUFFICIENT_RESOURCES;
2652             goto end;
2653         }
2654 
2655         RtlInitializeBitMap(&context.stripes[i].error, context.stripes[i].errorarr, (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size));
2656 
2657         context.stripes[i].context = &context;
2658         context.stripes[i].rewrite = false;
2659     }
2660 
2661     stripe = stripe_start;
2662 
2663     Status = STATUS_SUCCESS;
2664 
2665     chunk_lock_range(Vcb, c, run_start, run_end - run_start);
2666 
2667     do {
2668         ULONG read_stripes;
2669         uint16_t missing_devices = 0;
2670         bool need_wait = false;
2671 
2672         if (max_read < stripe_end + 1 - stripe)
2673             read_stripes = max_read;
2674         else
2675             read_stripes = (ULONG)(stripe_end + 1 - stripe);
2676 
2677         context.stripes_left = c->chunk_item->num_stripes;
2678 
2679         // read megabyte by megabyte
2680         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2681             if (c->devices[i]->devobj) {
2682                 PIO_STACK_LOCATION IrpSp;
2683 
2684                 context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
2685 
2686                 if (!context.stripes[i].Irp) {
2687                     ERR("IoAllocateIrp failed\n");
2688                     Status = STATUS_INSUFFICIENT_RESOURCES;
2689                     goto end3;
2690                 }
2691 
2692                 context.stripes[i].Irp->MdlAddress = NULL;
2693 
2694                 IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2695                 IrpSp->MajorFunction = IRP_MJ_READ;
2696                 IrpSp->FileObject = c->devices[i]->fileobj;
2697 
2698                 if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2699                     context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(read_stripes * c->chunk_item->stripe_length), ALLOC_TAG);
2700                     if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2701                         ERR("out of memory\n");
2702                         Status = STATUS_INSUFFICIENT_RESOURCES;
2703                         goto end3;
2704                     }
2705 
2706                     context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2707 
2708                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2709                 } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
2710                     context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, (ULONG)(read_stripes * c->chunk_item->stripe_length), false, false, NULL);
2711                     if (!context.stripes[i].Irp->MdlAddress) {
2712                         ERR("IoAllocateMdl failed\n");
2713                         Status = STATUS_INSUFFICIENT_RESOURCES;
2714                         goto end3;
2715                     }
2716 
2717                     Status = STATUS_SUCCESS;
2718 
2719                     _SEH2_TRY {
2720                         MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
2721                     } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2722                         Status = _SEH2_GetExceptionCode();
2723                     } _SEH2_END;
2724 
2725                     if (!NT_SUCCESS(Status)) {
2726                         ERR("MmProbeAndLockPages threw exception %08x\n", Status);
2727                         IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2728                         goto end3;
2729                     }
2730                 } else
2731                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2732 
2733                 context.stripes[i].offset = stripe * c->chunk_item->stripe_length;
2734 
2735                 IrpSp->Parameters.Read.Length = (ULONG)(read_stripes * c->chunk_item->stripe_length);
2736                 IrpSp->Parameters.Read.ByteOffset.QuadPart = cis[i].offset + context.stripes[i].offset;
2737 
2738                 context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2739                 context.stripes[i].missing = false;
2740 
2741                 IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion_raid56, &context.stripes[i], true, true, true);
2742 
2743                 Vcb->scrub.data_scrubbed += read_stripes * c->chunk_item->stripe_length;
2744                 need_wait = true;
2745             } else {
2746                 context.stripes[i].Irp = NULL;
2747                 context.stripes[i].missing = true;
2748                 missing_devices++;
2749                 InterlockedDecrement(&context.stripes_left);
2750             }
2751         }
2752 
2753         if (c->chunk_item->type & BLOCK_FLAG_RAID5 && missing_devices > 1) {
2754             ERR("too many missing devices (%u, maximum 1)\n", missing_devices);
2755             Status = STATUS_UNEXPECTED_IO_ERROR;
2756             goto end3;
2757         } else if (c->chunk_item->type & BLOCK_FLAG_RAID6 && missing_devices > 2) {
2758             ERR("too many missing devices (%u, maximum 2)\n", missing_devices);
2759             Status = STATUS_UNEXPECTED_IO_ERROR;
2760             goto end3;
2761         }
2762 
2763         if (need_wait) {
2764             KeInitializeEvent(&context.Event, NotificationEvent, false);
2765 
2766             for (i = 0; i < c->chunk_item->num_stripes; i++) {
2767                 if (c->devices[i]->devobj)
2768                     IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
2769             }
2770 
2771             KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2772         }
2773 
2774         // return an error if any of the stripes returned an error
2775         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2776             if (!context.stripes[i].missing && !NT_SUCCESS(context.stripes[i].iosb.Status)) {
2777                 Status = context.stripes[i].iosb.Status;
2778                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
2779                 goto end3;
2780             }
2781         }
2782 
2783         if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2784             for (i = 0; i < read_stripes; i++) {
2785                 scrub_raid6_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2786             }
2787         } else {
2788             for (i = 0; i < read_stripes; i++) {
2789                 scrub_raid5_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2790             }
2791         }
2792         stripe += read_stripes;
2793 
2794 end3:
2795         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2796             if (context.stripes[i].Irp) {
2797                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
2798                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
2799                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2800                 }
2801                 IoFreeIrp(context.stripes[i].Irp);
2802                 context.stripes[i].Irp = NULL;
2803 
2804                 if (context.stripes[i].rewrite) {
2805                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + context.stripes[i].offset,
2806                                              context.stripes[i].buf, (uint32_t)(read_stripes * c->chunk_item->stripe_length));
2807 
2808                     if (!NT_SUCCESS(Status)) {
2809                         ERR("write_data_phys returned %08x\n", Status);
2810                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
2811                         goto end2;
2812                     }
2813                 }
2814             }
2815         }
2816 
2817         if (!NT_SUCCESS(Status))
2818             break;
2819     } while (stripe < stripe_end);
2820 
2821 end2:
2822     chunk_unlock_range(Vcb, c, run_start, run_end - run_start);
2823 
2824     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2825         ExFreePool(context.stripes[i].buf);
2826         ExFreePool(context.stripes[i].errorarr);
2827     }
2828     ExFreePool(context.stripes);
2829 
2830 end:
2831     ExFreePool(treearr);
2832     ExFreePool(allocarr);
2833     ExFreePool(context.parity_scratch);
2834 
2835     if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2836         ExFreePool(context.parity_scratch2);
2837 
2838     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2839         ExFreePool(csumarr);
2840         ExFreePool(context.csum);
2841     }
2842 
2843     return Status;
2844 }
2845 
2846 static NTSTATUS scrub_chunk_raid56(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2847     NTSTATUS Status;
2848     KEY searchkey;
2849     traverse_ptr tp;
2850     bool b;
2851     uint64_t full_stripe_len, stripe, stripe_start, stripe_end, total_data = 0;
2852     ULONG num_extents = 0, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2853 
2854     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2855     stripe = (*offset - c->offset) / full_stripe_len;
2856 
2857     *offset = c->offset + (stripe * full_stripe_len);
2858 
2859     searchkey.obj_id = *offset;
2860     searchkey.obj_type = TYPE_METADATA_ITEM;
2861     searchkey.offset = 0xffffffffffffffff;
2862 
2863     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2864     if (!NT_SUCCESS(Status)) {
2865         ERR("find_item returned %08x\n", Status);
2866         return Status;
2867     }
2868 
2869     *changed = false;
2870 
2871     do {
2872         traverse_ptr next_tp;
2873 
2874         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2875             break;
2876 
2877         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2878             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2879 
2880             TRACE("%I64x\n", tp.item->key.obj_id);
2881 
2882             if (size < Vcb->superblock.sector_size) {
2883                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, Vcb->superblock.sector_size);
2884                 return STATUS_INTERNAL_ERROR;
2885             }
2886 
2887             stripe = (tp.item->key.obj_id - c->offset) / full_stripe_len;
2888 
2889             if (*changed) {
2890                 if (stripe > stripe_end + 1) {
2891                     Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2892                     if (!NT_SUCCESS(Status)) {
2893                         ERR("scrub_chunk_raid56_stripe_run returned %08x\n", Status);
2894                         return Status;
2895                     }
2896 
2897                     stripe_start = stripe;
2898                 }
2899             } else
2900                 stripe_start = stripe;
2901 
2902             stripe_end = (tp.item->key.obj_id + size - 1 - c->offset) / full_stripe_len;
2903 
2904             *changed = true;
2905 
2906             total_data += size;
2907             num_extents++;
2908 
2909             // only do so much at a time
2910             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
2911                 break;
2912         }
2913 
2914         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2915 
2916         if (b)
2917             tp = next_tp;
2918     } while (b);
2919 
2920     if (*changed) {
2921         Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2922         if (!NT_SUCCESS(Status)) {
2923             ERR("scrub_chunk_raid56_stripe_run returned %08x\n", Status);
2924             return Status;
2925         }
2926 
2927         *offset = c->offset + ((stripe_end + 1) * full_stripe_len);
2928     }
2929 
2930     return STATUS_SUCCESS;
2931 }
2932 
2933 static NTSTATUS scrub_chunk(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2934     NTSTATUS Status;
2935     KEY searchkey;
2936     traverse_ptr tp;
2937     bool b = false, tree_run = false;
2938     ULONG type, num_extents = 0;
2939     uint64_t total_data = 0, tree_run_start, tree_run_end;
2940 
2941     TRACE("chunk %I64x\n", c->offset);
2942 
2943     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2944 
2945     if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2946         type = BLOCK_FLAG_DUPLICATE;
2947     else if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2948         type = BLOCK_FLAG_RAID0;
2949     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2950         type = BLOCK_FLAG_DUPLICATE;
2951     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2952         type = BLOCK_FLAG_RAID10;
2953     else if (c->chunk_item->type & BLOCK_FLAG_RAID5) {
2954         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2955         goto end;
2956     } else if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2957         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2958         goto end;
2959     } else // SINGLE
2960         type = BLOCK_FLAG_DUPLICATE;
2961 
2962     searchkey.obj_id = *offset;
2963     searchkey.obj_type = TYPE_METADATA_ITEM;
2964     searchkey.offset = 0xffffffffffffffff;
2965 
2966     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2967     if (!NT_SUCCESS(Status)) {
2968         ERR("error - find_item returned %08x\n", Status);
2969         goto end;
2970     }
2971 
2972     do {
2973         traverse_ptr next_tp;
2974 
2975         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2976             break;
2977 
2978         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2979             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2980             bool is_tree;
2981             uint32_t* csum = NULL;
2982             RTL_BITMAP bmp;
2983             ULONG* bmparr = NULL, bmplen;
2984 
2985             TRACE("%I64x\n", tp.item->key.obj_id);
2986 
2987             is_tree = false;
2988 
2989             if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2990                 is_tree = true;
2991             else {
2992                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2993 
2994                 if (tp.item->size < sizeof(EXTENT_ITEM)) {
2995                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %u\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2996                     Status = STATUS_INTERNAL_ERROR;
2997                     goto end;
2998                 }
2999 
3000                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
3001                     is_tree = true;
3002             }
3003 
3004             if (size < Vcb->superblock.sector_size) {
3005                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, Vcb->superblock.sector_size);
3006                 Status = STATUS_INTERNAL_ERROR;
3007                 goto end;
3008             }
3009 
3010             // load csum
3011             if (!is_tree) {
3012                 traverse_ptr tp2;
3013 
3014                 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(sizeof(uint32_t) * size / Vcb->superblock.sector_size), ALLOC_TAG);
3015                 if (!csum) {
3016                     ERR("out of memory\n");
3017                     Status = STATUS_INSUFFICIENT_RESOURCES;
3018                     goto end;
3019                 }
3020 
3021                 bmplen = (ULONG)(size / Vcb->superblock.sector_size);
3022 
3023                 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)(sector_align((bmplen >> 3) + 1, sizeof(ULONG))), ALLOC_TAG);
3024                 if (!bmparr) {
3025                     ERR("out of memory\n");
3026                     ExFreePool(csum);
3027                     Status = STATUS_INSUFFICIENT_RESOURCES;
3028                     goto end;
3029                 }
3030 
3031                 RtlInitializeBitMap(&bmp, bmparr, bmplen);
3032                 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
3033 
3034                 searchkey.obj_id = EXTENT_CSUM_ID;
3035                 searchkey.obj_type = TYPE_EXTENT_CSUM;
3036                 searchkey.offset = tp.item->key.obj_id;
3037 
3038                 Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
3039                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
3040                     ERR("find_item returned %08x\n", Status);
3041                     ExFreePool(csum);
3042                     ExFreePool(bmparr);
3043                     goto end;
3044                 }
3045 
3046                 if (Status != STATUS_NOT_FOUND) {
3047                     do {
3048                         traverse_ptr next_tp2;
3049 
3050                         if (tp2.item->key.obj_type == TYPE_EXTENT_CSUM) {
3051                             if (tp2.item->key.offset >= tp.item->key.obj_id + size)
3052                                 break;
3053                             else if (tp2.item->size >= sizeof(uint32_t) && tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)) >= tp.item->key.obj_id) {
3054                                 uint64_t cs = max(tp.item->key.obj_id, tp2.item->key.offset);
3055                                 uint64_t ce = min(tp.item->key.obj_id + size, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / sizeof(uint32_t)));
3056 
3057                                 RtlCopyMemory(csum + ((cs - tp.item->key.obj_id) / Vcb->superblock.sector_size),
3058                                               tp2.item->data + ((cs - tp2.item->key.offset) * sizeof(uint32_t) / Vcb->superblock.sector_size),
3059                                               (ULONG)((ce - cs) * sizeof(uint32_t) / Vcb->superblock.sector_size));
3060 
3061                                 RtlClearBits(&bmp, (ULONG)((cs - tp.item->key.obj_id) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
3062 
3063                                 if (ce == tp.item->key.obj_id + size)
3064                                     break;
3065                             }
3066                         }
3067 
3068                         if (find_next_item(Vcb, &tp2, &next_tp2, false, NULL))
3069                             tp2 = next_tp2;
3070                         else
3071                             break;
3072                     } while (true);
3073                 }
3074             }
3075 
3076             if (tree_run) {
3077                 if (!is_tree || tp.item->key.obj_id > tree_run_end) {
3078                     Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3079                     if (!NT_SUCCESS(Status)) {
3080                         ERR("scrub_extent returned %08x\n", Status);
3081                         goto end;
3082                     }
3083 
3084                     if (!is_tree)
3085                         tree_run = false;
3086                     else {
3087                         tree_run_start = tp.item->key.obj_id;
3088                         tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3089                     }
3090                 } else
3091                     tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3092             } else if (is_tree) {
3093                 tree_run = true;
3094                 tree_run_start = tp.item->key.obj_id;
3095                 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3096             }
3097 
3098             if (!is_tree) {
3099                 Status = scrub_data_extent(Vcb, c, tp.item->key.obj_id, type, csum, &bmp, bmplen);
3100                 if (!NT_SUCCESS(Status)) {
3101                     ERR("scrub_data_extent returned %08x\n", Status);
3102                     ExFreePool(csum);
3103                     ExFreePool(bmparr);
3104                     goto end;
3105                 }
3106 
3107                 ExFreePool(csum);
3108                 ExFreePool(bmparr);
3109             }
3110 
3111             *offset = tp.item->key.obj_id + size;
3112             *changed = true;
3113 
3114             total_data += size;
3115             num_extents++;
3116 
3117             // only do so much at a time
3118             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
3119                 break;
3120         }
3121 
3122         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
3123 
3124         if (b)
3125             tp = next_tp;
3126     } while (b);
3127 
3128     if (tree_run) {
3129         Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3130         if (!NT_SUCCESS(Status)) {
3131             ERR("scrub_extent returned %08x\n", Status);
3132             goto end;
3133         }
3134     }
3135 
3136     Status = STATUS_SUCCESS;
3137 
3138 end:
3139     ExReleaseResourceLite(&Vcb->tree_lock);
3140 
3141     return Status;
3142 }
3143 
3144 _Function_class_(KSTART_ROUTINE)
3145 static void __stdcall scrub_thread(void* context) {
3146     device_extension* Vcb = context;
3147     LIST_ENTRY chunks, *le;
3148     NTSTATUS Status;
3149     LARGE_INTEGER time;
3150 
3151     KeInitializeEvent(&Vcb->scrub.finished, NotificationEvent, false);
3152 
3153     InitializeListHead(&chunks);
3154 
3155     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3156 
3157     if (Vcb->need_write && !Vcb->readonly)
3158         Status = do_write(Vcb, NULL);
3159     else
3160         Status = STATUS_SUCCESS;
3161 
3162     free_trees(Vcb);
3163 
3164     if (!NT_SUCCESS(Status)) {
3165         ExReleaseResourceLite(&Vcb->tree_lock);
3166         ERR("do_write returned %08x\n", Status);
3167         Vcb->scrub.error = Status;
3168         goto end;
3169     }
3170 
3171     ExConvertExclusiveToSharedLite(&Vcb->tree_lock);
3172 
3173     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3174 
3175     KeQuerySystemTime(&Vcb->scrub.start_time);
3176     Vcb->scrub.finish_time.QuadPart = 0;
3177     Vcb->scrub.resume_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3178     Vcb->scrub.duration.QuadPart = 0;
3179     Vcb->scrub.total_chunks = 0;
3180     Vcb->scrub.chunks_left = 0;
3181     Vcb->scrub.data_scrubbed = 0;
3182     Vcb->scrub.num_errors = 0;
3183 
3184     while (!IsListEmpty(&Vcb->scrub.errors)) {
3185         scrub_error* err = CONTAINING_RECORD(RemoveHeadList(&Vcb->scrub.errors), scrub_error, list_entry);
3186         ExFreePool(err);
3187     }
3188 
3189     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3190 
3191     le = Vcb->chunks.Flink;
3192     while (le != &Vcb->chunks) {
3193         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3194 
3195         acquire_chunk_lock(c, Vcb);
3196 
3197         if (!c->readonly) {
3198             InsertTailList(&chunks, &c->list_entry_balance);
3199             Vcb->scrub.total_chunks++;
3200             Vcb->scrub.chunks_left++;
3201         }
3202 
3203         release_chunk_lock(c, Vcb);
3204 
3205         le = le->Flink;
3206     }
3207 
3208     ExReleaseResourceLite(&Vcb->chunk_lock);
3209 
3210     ExReleaseResource(&Vcb->scrub.stats_lock);
3211 
3212     ExReleaseResourceLite(&Vcb->tree_lock);
3213 
3214     while (!IsListEmpty(&chunks)) {
3215         chunk* c = CONTAINING_RECORD(RemoveHeadList(&chunks), chunk, list_entry_balance);
3216         uint64_t offset = c->offset;
3217         bool changed;
3218 
3219         c->reloc = true;
3220 
3221         KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3222 
3223         if (!Vcb->scrub.stopping) {
3224             do {
3225                 changed = false;
3226 
3227                 Status = scrub_chunk(Vcb, c, &offset, &changed);
3228                 if (!NT_SUCCESS(Status)) {
3229                     ERR("scrub_chunk returned %08x\n", Status);
3230                     Vcb->scrub.stopping = true;
3231                     Vcb->scrub.error = Status;
3232                     break;
3233                 }
3234 
3235                 if (offset == c->offset + c->chunk_item->size || Vcb->scrub.stopping)
3236                     break;
3237 
3238                 KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3239             } while (changed);
3240         }
3241 
3242         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3243 
3244         if (!Vcb->scrub.stopping)
3245             Vcb->scrub.chunks_left--;
3246 
3247         if (IsListEmpty(&chunks))
3248             KeQuerySystemTime(&Vcb->scrub.finish_time);
3249 
3250         ExReleaseResource(&Vcb->scrub.stats_lock);
3251 
3252         c->reloc = false;
3253         c->list_entry_balance.Flink = NULL;
3254     }
3255 
3256     KeQuerySystemTime(&time);
3257     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3258 
3259 end:
3260     ZwClose(Vcb->scrub.thread);
3261     Vcb->scrub.thread = NULL;
3262 
3263     KeSetEvent(&Vcb->scrub.finished, 0, false);
3264 }
3265 
3266 NTSTATUS start_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3267     NTSTATUS Status;
3268     OBJECT_ATTRIBUTES oa;
3269 
3270     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3271         return STATUS_PRIVILEGE_NOT_HELD;
3272 
3273     if (Vcb->locked) {
3274         WARN("cannot start scrub while locked\n");
3275         return STATUS_DEVICE_NOT_READY;
3276     }
3277 
3278     if (Vcb->balance.thread) {
3279         WARN("cannot start scrub while balance running\n");
3280         return STATUS_DEVICE_NOT_READY;
3281     }
3282 
3283     if (Vcb->scrub.thread) {
3284         WARN("scrub already running\n");
3285         return STATUS_DEVICE_NOT_READY;
3286     }
3287 
3288     if (Vcb->readonly)
3289         return STATUS_MEDIA_WRITE_PROTECTED;
3290 
3291     Vcb->scrub.stopping = false;
3292     Vcb->scrub.paused = false;
3293     Vcb->scrub.error = STATUS_SUCCESS;
3294     KeInitializeEvent(&Vcb->scrub.event, NotificationEvent, !Vcb->scrub.paused);
3295 
3296     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3297 
3298     Status = PsCreateSystemThread(&Vcb->scrub.thread, 0, &oa, NULL, NULL, scrub_thread, Vcb);
3299     if (!NT_SUCCESS(Status)) {
3300         ERR("PsCreateSystemThread returned %08x\n", Status);
3301         return Status;
3302     }
3303 
3304     return STATUS_SUCCESS;
3305 }
3306 
3307 NTSTATUS query_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode, void* data, ULONG length) {
3308     btrfs_query_scrub* bqs = (btrfs_query_scrub*)data;
3309     ULONG len;
3310     NTSTATUS Status;
3311     LIST_ENTRY* le;
3312     btrfs_scrub_error* bse = NULL;
3313 
3314     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3315         return STATUS_PRIVILEGE_NOT_HELD;
3316 
3317     if (length < offsetof(btrfs_query_scrub, errors))
3318         return STATUS_BUFFER_TOO_SMALL;
3319 
3320     ExAcquireResourceSharedLite(&Vcb->scrub.stats_lock, true);
3321 
3322     if (Vcb->scrub.thread && Vcb->scrub.chunks_left > 0)
3323         bqs->status = Vcb->scrub.paused ? BTRFS_SCRUB_PAUSED : BTRFS_SCRUB_RUNNING;
3324     else
3325         bqs->status = BTRFS_SCRUB_STOPPED;
3326 
3327     bqs->start_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3328     bqs->finish_time.QuadPart = Vcb->scrub.finish_time.QuadPart;
3329     bqs->chunks_left = Vcb->scrub.chunks_left;
3330     bqs->total_chunks = Vcb->scrub.total_chunks;
3331     bqs->data_scrubbed = Vcb->scrub.data_scrubbed;
3332 
3333     bqs->duration = Vcb->scrub.duration.QuadPart;
3334 
3335     if (bqs->status == BTRFS_SCRUB_RUNNING) {
3336         LARGE_INTEGER time;
3337 
3338         KeQuerySystemTime(&time);
3339         bqs->duration += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3340     }
3341 
3342     bqs->error = Vcb->scrub.error;
3343 
3344     bqs->num_errors = Vcb->scrub.num_errors;
3345 
3346     len = length - offsetof(btrfs_query_scrub, errors);
3347 
3348     le = Vcb->scrub.errors.Flink;
3349     while (le != &Vcb->scrub.errors) {
3350         scrub_error* err = CONTAINING_RECORD(le, scrub_error, list_entry);
3351         ULONG errlen;
3352 
3353         if (err->is_metadata)
3354             errlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3355         else
3356             errlen = offsetof(btrfs_scrub_error, data.filename) + err->data.filename_length;
3357 
3358         if (len < errlen) {
3359             Status = STATUS_BUFFER_OVERFLOW;
3360             goto end;
3361         }
3362 
3363         if (!bse)
3364             bse = &bqs->errors;
3365         else {
3366             ULONG lastlen;
3367 
3368             if (bse->is_metadata)
3369                 lastlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3370             else
3371                 lastlen = offsetof(btrfs_scrub_error, data.filename) + bse->data.filename_length;
3372 
3373             bse->next_entry = lastlen;
3374             bse = (btrfs_scrub_error*)(((uint8_t*)bse) + lastlen);
3375         }
3376 
3377         bse->next_entry = 0;
3378         bse->address = err->address;
3379         bse->device = err->device;
3380         bse->recovered = err->recovered;
3381         bse->is_metadata = err->is_metadata;
3382         bse->parity = err->parity;
3383 
3384         if (err->is_metadata) {
3385             bse->metadata.root = err->metadata.root;
3386             bse->metadata.level = err->metadata.level;
3387             bse->metadata.firstitem = err->metadata.firstitem;
3388         } else {
3389             bse->data.subvol = err->data.subvol;
3390             bse->data.offset = err->data.offset;
3391             bse->data.filename_length = err->data.filename_length;
3392             RtlCopyMemory(bse->data.filename, err->data.filename, err->data.filename_length);
3393         }
3394 
3395         len -= errlen;
3396         le = le->Flink;
3397     }
3398 
3399     Status = STATUS_SUCCESS;
3400 
3401 end:
3402     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
3403 
3404     return Status;
3405 }
3406 
3407 NTSTATUS pause_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3408     LARGE_INTEGER time;
3409 
3410     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3411         return STATUS_PRIVILEGE_NOT_HELD;
3412 
3413     if (!Vcb->scrub.thread)
3414         return STATUS_DEVICE_NOT_READY;
3415 
3416     if (Vcb->scrub.paused)
3417         return STATUS_DEVICE_NOT_READY;
3418 
3419     Vcb->scrub.paused = true;
3420     KeClearEvent(&Vcb->scrub.event);
3421 
3422     KeQuerySystemTime(&time);
3423     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3424 
3425     return STATUS_SUCCESS;
3426 }
3427 
3428 NTSTATUS resume_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3429     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3430         return STATUS_PRIVILEGE_NOT_HELD;
3431 
3432     if (!Vcb->scrub.thread)
3433         return STATUS_DEVICE_NOT_READY;
3434 
3435     if (!Vcb->scrub.paused)
3436         return STATUS_DEVICE_NOT_READY;
3437 
3438     Vcb->scrub.paused = false;
3439     KeSetEvent(&Vcb->scrub.event, 0, false);
3440 
3441     KeQuerySystemTime(&Vcb->scrub.resume_time);
3442 
3443     return STATUS_SUCCESS;
3444 }
3445 
3446 NTSTATUS stop_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3447     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3448         return STATUS_PRIVILEGE_NOT_HELD;
3449 
3450     if (!Vcb->scrub.thread)
3451         return STATUS_DEVICE_NOT_READY;
3452 
3453     Vcb->scrub.paused = false;
3454     Vcb->scrub.stopping = true;
3455     KeSetEvent(&Vcb->scrub.event, 0, false);
3456 
3457     return STATUS_SUCCESS;
3458 }
3459