xref: /reactos/drivers/filesystems/btrfs/scrub.c (revision d8c6ef5e)
1 /* Copyright (c) Mark Harmstone 2017
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 
20 #define SCRUB_UNIT 0x100000 // 1 MB
21 
22 struct _scrub_context;
23 
24 typedef struct {
25     struct _scrub_context* context;
26     PIRP Irp;
27     uint64_t start;
28     uint32_t length;
29     IO_STATUS_BLOCK iosb;
30     uint8_t* buf;
31     bool csum_error;
32     void* bad_csums;
33 } scrub_context_stripe;
34 
35 typedef struct _scrub_context {
36     KEVENT Event;
37     scrub_context_stripe* stripes;
38     LONG stripes_left;
39 } scrub_context;
40 
41 typedef struct {
42     ANSI_STRING name;
43     bool orig_subvol;
44     LIST_ENTRY list_entry;
45 } path_part;
46 
47 static void log_file_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t subvol, uint64_t inode, uint64_t offset) {
48     LIST_ENTRY *le, parts;
49     root* r = NULL;
50     KEY searchkey;
51     traverse_ptr tp;
52     uint64_t dir;
53     bool orig_subvol = true, not_in_tree = false;
54     ANSI_STRING fn;
55     scrub_error* err;
56     NTSTATUS Status;
57     ULONG utf16len;
58 
59     le = Vcb->roots.Flink;
60     while (le != &Vcb->roots) {
61         root* r2 = CONTAINING_RECORD(le, root, list_entry);
62 
63         if (r2->id == subvol) {
64             r = r2;
65             break;
66         }
67 
68         le = le->Flink;
69     }
70 
71     if (!r) {
72         ERR("could not find subvol %I64x\n", subvol);
73         return;
74     }
75 
76     InitializeListHead(&parts);
77 
78     dir = inode;
79 
80     while (true) {
81         if (dir == r->root_item.objid) {
82             if (r == Vcb->root_fileref->fcb->subvol)
83                 break;
84 
85             searchkey.obj_id = r->id;
86             searchkey.obj_type = TYPE_ROOT_BACKREF;
87             searchkey.offset = 0xffffffffffffffff;
88 
89             Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
90             if (!NT_SUCCESS(Status)) {
91                 ERR("find_item returned %08lx\n", Status);
92                 goto end;
93             }
94 
95             if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
96                 ROOT_REF* rr = (ROOT_REF*)tp.item->data;
97                 path_part* pp;
98 
99                 if (tp.item->size < sizeof(ROOT_REF)) {
100                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(ROOT_REF));
101                     goto end;
102                 }
103 
104                 if (tp.item->size < offsetof(ROOT_REF, name[0]) + rr->n) {
105                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
106                         tp.item->size, offsetof(ROOT_REF, name[0]) + rr->n);
107                     goto end;
108                 }
109 
110                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
111                 if (!pp) {
112                     ERR("out of memory\n");
113                     goto end;
114                 }
115 
116                 pp->name.Buffer = rr->name;
117                 pp->name.Length = pp->name.MaximumLength = rr->n;
118                 pp->orig_subvol = false;
119 
120                 InsertTailList(&parts, &pp->list_entry);
121 
122                 r = NULL;
123 
124                 le = Vcb->roots.Flink;
125                 while (le != &Vcb->roots) {
126                     root* r2 = CONTAINING_RECORD(le, root, list_entry);
127 
128                     if (r2->id == tp.item->key.offset) {
129                         r = r2;
130                         break;
131                     }
132 
133                     le = le->Flink;
134                 }
135 
136                 if (!r) {
137                     ERR("could not find subvol %I64x\n", tp.item->key.offset);
138                     goto end;
139                 }
140 
141                 dir = rr->dir;
142                 orig_subvol = false;
143             } else {
144                 not_in_tree = true;
145                 break;
146             }
147         } else {
148             searchkey.obj_id = dir;
149             searchkey.obj_type = TYPE_INODE_EXTREF;
150             searchkey.offset = 0xffffffffffffffff;
151 
152             Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
153             if (!NT_SUCCESS(Status)) {
154                 ERR("find_item returned %08lx\n", Status);
155                 goto end;
156             }
157 
158             if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_REF) {
159                 INODE_REF* ir = (INODE_REF*)tp.item->data;
160                 path_part* pp;
161 
162                 if (tp.item->size < sizeof(INODE_REF)) {
163                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_REF));
164                     goto end;
165                 }
166 
167                 if (tp.item->size < offsetof(INODE_REF, name[0]) + ir->n) {
168                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
169                         tp.item->size, offsetof(INODE_REF, name[0]) + ir->n);
170                     goto end;
171                 }
172 
173                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
174                 if (!pp) {
175                     ERR("out of memory\n");
176                     goto end;
177                 }
178 
179                 pp->name.Buffer = ir->name;
180                 pp->name.Length = pp->name.MaximumLength = ir->n;
181                 pp->orig_subvol = orig_subvol;
182 
183                 InsertTailList(&parts, &pp->list_entry);
184 
185                 if (dir == tp.item->key.offset)
186                     break;
187 
188                 dir = tp.item->key.offset;
189             } else if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_EXTREF) {
190                 INODE_EXTREF* ier = (INODE_EXTREF*)tp.item->data;
191                 path_part* pp;
192 
193                 if (tp.item->size < sizeof(INODE_EXTREF)) {
194                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
195                                                                                   tp.item->size, sizeof(INODE_EXTREF));
196                     goto end;
197                 }
198 
199                 if (tp.item->size < offsetof(INODE_EXTREF, name[0]) + ier->n) {
200                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
201                         tp.item->size, offsetof(INODE_EXTREF, name[0]) + ier->n);
202                     goto end;
203                 }
204 
205                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
206                 if (!pp) {
207                     ERR("out of memory\n");
208                     goto end;
209                 }
210 
211                 pp->name.Buffer = ier->name;
212                 pp->name.Length = pp->name.MaximumLength = ier->n;
213                 pp->orig_subvol = orig_subvol;
214 
215                 InsertTailList(&parts, &pp->list_entry);
216 
217                 if (dir == ier->dir)
218                     break;
219 
220                 dir = ier->dir;
221             } else {
222                 ERR("could not find INODE_REF for inode %I64x in subvol %I64x\n", dir, r->id);
223                 goto end;
224             }
225         }
226     }
227 
228     fn.MaximumLength = 0;
229 
230     if (not_in_tree) {
231         le = parts.Blink;
232         while (le != &parts) {
233             path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
234             LIST_ENTRY* le2 = le->Blink;
235 
236             if (pp->orig_subvol)
237                 break;
238 
239             RemoveTailList(&parts);
240             ExFreePool(pp);
241 
242             le = le2;
243         }
244     }
245 
246     le = parts.Flink;
247     while (le != &parts) {
248         path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
249 
250         fn.MaximumLength += pp->name.Length + 1;
251 
252         le = le->Flink;
253     }
254 
255     fn.Buffer = ExAllocatePoolWithTag(PagedPool, fn.MaximumLength, ALLOC_TAG);
256     if (!fn.Buffer) {
257         ERR("out of memory\n");
258         goto end;
259     }
260 
261     fn.Length = 0;
262 
263     le = parts.Blink;
264     while (le != &parts) {
265         path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
266 
267         fn.Buffer[fn.Length] = '\\';
268         fn.Length++;
269 
270         RtlCopyMemory(&fn.Buffer[fn.Length], pp->name.Buffer, pp->name.Length);
271         fn.Length += pp->name.Length;
272 
273         le = le->Blink;
274     }
275 
276     if (not_in_tree)
277         ERR("subvol %I64x, %.*s, offset %I64x\n", subvol, fn.Length, fn.Buffer, offset);
278     else
279         ERR("%.*s, offset %I64x\n", fn.Length, fn.Buffer, offset);
280 
281     Status = utf8_to_utf16(NULL, 0, &utf16len, fn.Buffer, fn.Length);
282     if (!NT_SUCCESS(Status)) {
283         ERR("utf8_to_utf16 1 returned %08lx\n", Status);
284         ExFreePool(fn.Buffer);
285         goto end;
286     }
287 
288     err = ExAllocatePoolWithTag(PagedPool, offsetof(scrub_error, data.filename[0]) + utf16len, ALLOC_TAG);
289     if (!err) {
290         ERR("out of memory\n");
291         ExFreePool(fn.Buffer);
292         goto end;
293     }
294 
295     err->address = addr;
296     err->device = devid;
297     err->recovered = false;
298     err->is_metadata = false;
299     err->parity = false;
300 
301     err->data.subvol = not_in_tree ? subvol : 0;
302     err->data.offset = offset;
303     err->data.filename_length = (uint16_t)utf16len;
304 
305     Status = utf8_to_utf16(err->data.filename, utf16len, &utf16len, fn.Buffer, fn.Length);
306     if (!NT_SUCCESS(Status)) {
307         ERR("utf8_to_utf16 2 returned %08lx\n", Status);
308         ExFreePool(fn.Buffer);
309         ExFreePool(err);
310         goto end;
311     }
312 
313     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
314 
315     Vcb->scrub.num_errors++;
316     InsertTailList(&Vcb->scrub.errors, &err->list_entry);
317 
318     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
319 
320     ExFreePool(fn.Buffer);
321 
322 end:
323     while (!IsListEmpty(&parts)) {
324         path_part* pp = CONTAINING_RECORD(RemoveHeadList(&parts), path_part, list_entry);
325 
326         ExFreePool(pp);
327     }
328 }
329 
330 static void log_file_checksum_error_shared(device_extension* Vcb, uint64_t treeaddr, uint64_t addr, uint64_t devid, uint64_t extent) {
331     tree_header* tree;
332     NTSTATUS Status;
333     leaf_node* ln;
334     ULONG i;
335 
336     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
337     if (!tree) {
338         ERR("out of memory\n");
339         return;
340     }
341 
342     Status = read_data(Vcb, treeaddr, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
343     if (!NT_SUCCESS(Status)) {
344         ERR("read_data returned %08lx\n", Status);
345         goto end;
346     }
347 
348     if (tree->level != 0) {
349         ERR("tree level was %x, expected 0\n", tree->level);
350         goto end;
351     }
352 
353     ln = (leaf_node*)&tree[1];
354 
355     for (i = 0; i < tree->num_items; i++) {
356         if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
357             EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)tree + sizeof(tree_header) + ln[i].offset);
358             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
359 
360             if (ed->type == EXTENT_TYPE_REGULAR && ed2->size != 0 && ed2->address == addr)
361                 log_file_checksum_error(Vcb, addr, devid, tree->tree_id, ln[i].key.obj_id, ln[i].key.offset + addr - extent);
362         }
363     }
364 
365 end:
366     ExFreePool(tree);
367 }
368 
369 static void log_tree_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t root, uint8_t level, KEY* firstitem) {
370     scrub_error* err;
371 
372     err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
373     if (!err) {
374         ERR("out of memory\n");
375         return;
376     }
377 
378     err->address = addr;
379     err->device = devid;
380     err->recovered = false;
381     err->is_metadata = true;
382     err->parity = false;
383 
384     err->metadata.root = root;
385     err->metadata.level = level;
386 
387     if (firstitem) {
388         ERR("root %I64x, level %u, first item (%I64x,%x,%I64x)\n", root, level, firstitem->obj_id,
389                                                                 firstitem->obj_type, firstitem->offset);
390 
391         err->metadata.firstitem = *firstitem;
392     } else {
393         ERR("root %I64x, level %u\n", root, level);
394 
395         RtlZeroMemory(&err->metadata.firstitem, sizeof(KEY));
396     }
397 
398     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
399 
400     Vcb->scrub.num_errors++;
401     InsertTailList(&Vcb->scrub.errors, &err->list_entry);
402 
403     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
404 }
405 
406 static void log_tree_checksum_error_shared(device_extension* Vcb, uint64_t offset, uint64_t address, uint64_t devid) {
407     tree_header* tree;
408     NTSTATUS Status;
409     internal_node* in;
410     ULONG i;
411 
412     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
413     if (!tree) {
414         ERR("out of memory\n");
415         return;
416     }
417 
418     Status = read_data(Vcb, offset, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
419     if (!NT_SUCCESS(Status)) {
420         ERR("read_data returned %08lx\n", Status);
421         goto end;
422     }
423 
424     if (tree->level == 0) {
425         ERR("tree level was 0\n");
426         goto end;
427     }
428 
429     in = (internal_node*)&tree[1];
430 
431     for (i = 0; i < tree->num_items; i++) {
432         if (in[i].address == address) {
433             log_tree_checksum_error(Vcb, address, devid, tree->tree_id, tree->level - 1, &in[i].key);
434             break;
435         }
436     }
437 
438 end:
439     ExFreePool(tree);
440 }
441 
442 static void log_unrecoverable_error(device_extension* Vcb, uint64_t address, uint64_t devid) {
443     KEY searchkey;
444     traverse_ptr tp;
445     NTSTATUS Status;
446     EXTENT_ITEM* ei;
447     EXTENT_ITEM2* ei2 = NULL;
448     uint8_t* ptr;
449     ULONG len;
450     uint64_t rc;
451 
452     // FIXME - still log even if rest of this function fails
453 
454     searchkey.obj_id = address;
455     searchkey.obj_type = TYPE_METADATA_ITEM;
456     searchkey.offset = 0xffffffffffffffff;
457 
458     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
459     if (!NT_SUCCESS(Status)) {
460         ERR("find_item returned %08lx\n", Status);
461         return;
462     }
463 
464     if ((tp.item->key.obj_type != TYPE_EXTENT_ITEM && tp.item->key.obj_type != TYPE_METADATA_ITEM) ||
465         tp.item->key.obj_id >= address + Vcb->superblock.sector_size ||
466         (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.obj_id + tp.item->key.offset <= address) ||
467         (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->key.obj_id + Vcb->superblock.node_size <= address)
468     )
469         return;
470 
471     if (tp.item->size < sizeof(EXTENT_ITEM)) {
472         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
473         return;
474     }
475 
476     ei = (EXTENT_ITEM*)tp.item->data;
477     ptr = (uint8_t*)&ei[1];
478     len = tp.item->size - sizeof(EXTENT_ITEM);
479 
480     if (tp.item->key.obj_id == TYPE_EXTENT_ITEM && ei->flags & EXTENT_ITEM_TREE_BLOCK) {
481         if (tp.item->size < sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2)) {
482             ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
483                                                                           tp.item->size, sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2));
484             return;
485         }
486 
487         ei2 = (EXTENT_ITEM2*)ptr;
488 
489         ptr += sizeof(EXTENT_ITEM2);
490         len -= sizeof(EXTENT_ITEM2);
491     }
492 
493     rc = 0;
494 
495     while (len > 0) {
496         uint8_t type = *ptr;
497 
498         ptr++;
499         len--;
500 
501         if (type == TYPE_TREE_BLOCK_REF) {
502             TREE_BLOCK_REF* tbr;
503 
504             if (len < sizeof(TREE_BLOCK_REF)) {
505                 ERR("TREE_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(TREE_BLOCK_REF), len);
506                 break;
507             }
508 
509             tbr = (TREE_BLOCK_REF*)ptr;
510 
511             log_tree_checksum_error(Vcb, address, devid, tbr->offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
512 
513             rc++;
514 
515             ptr += sizeof(TREE_BLOCK_REF);
516             len -= sizeof(TREE_BLOCK_REF);
517         } else if (type == TYPE_EXTENT_DATA_REF) {
518             EXTENT_DATA_REF* edr;
519 
520             if (len < sizeof(EXTENT_DATA_REF)) {
521                 ERR("EXTENT_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(EXTENT_DATA_REF), len);
522                 break;
523             }
524 
525             edr = (EXTENT_DATA_REF*)ptr;
526 
527             log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
528 
529             rc += edr->count;
530 
531             ptr += sizeof(EXTENT_DATA_REF);
532             len -= sizeof(EXTENT_DATA_REF);
533         } else if (type == TYPE_SHARED_BLOCK_REF) {
534             SHARED_BLOCK_REF* sbr;
535 
536             if (len < sizeof(SHARED_BLOCK_REF)) {
537                 ERR("SHARED_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_BLOCK_REF), len);
538                 break;
539             }
540 
541             sbr = (SHARED_BLOCK_REF*)ptr;
542 
543             log_tree_checksum_error_shared(Vcb, sbr->offset, address, devid);
544 
545             rc++;
546 
547             ptr += sizeof(SHARED_BLOCK_REF);
548             len -= sizeof(SHARED_BLOCK_REF);
549         } else if (type == TYPE_SHARED_DATA_REF) {
550             SHARED_DATA_REF* sdr;
551 
552             if (len < sizeof(SHARED_DATA_REF)) {
553                 ERR("SHARED_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_DATA_REF), len);
554                 break;
555             }
556 
557             sdr = (SHARED_DATA_REF*)ptr;
558 
559             log_file_checksum_error_shared(Vcb, sdr->offset, address, devid, tp.item->key.obj_id);
560 
561             rc += sdr->count;
562 
563             ptr += sizeof(SHARED_DATA_REF);
564             len -= sizeof(SHARED_DATA_REF);
565         } else {
566             ERR("unknown extent type %x\n", type);
567             break;
568         }
569     }
570 
571     if (rc < ei->refcount) {
572         do {
573             traverse_ptr next_tp;
574 
575             if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
576                 tp = next_tp;
577             else
578                 break;
579 
580             if (tp.item->key.obj_id == address) {
581                 if (tp.item->key.obj_type == TYPE_TREE_BLOCK_REF)
582                     log_tree_checksum_error(Vcb, address, devid, tp.item->key.offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
583                 else if (tp.item->key.obj_type == TYPE_EXTENT_DATA_REF) {
584                     EXTENT_DATA_REF* edr;
585 
586                     if (tp.item->size < sizeof(EXTENT_DATA_REF)) {
587                         ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
588                                                                              tp.item->size, sizeof(EXTENT_DATA_REF));
589                         break;
590                     }
591 
592                     edr = (EXTENT_DATA_REF*)tp.item->data;
593 
594                     log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
595                 } else if (tp.item->key.obj_type == TYPE_SHARED_BLOCK_REF)
596                     log_tree_checksum_error_shared(Vcb, tp.item->key.offset, address, devid);
597                 else if (tp.item->key.obj_type == TYPE_SHARED_DATA_REF)
598                     log_file_checksum_error_shared(Vcb, tp.item->key.offset, address, devid, tp.item->key.obj_id);
599             } else
600                 break;
601         } while (true);
602     }
603 }
604 
605 static void log_error(device_extension* Vcb, uint64_t addr, uint64_t devid, bool metadata, bool recoverable, bool parity) {
606     if (recoverable) {
607         scrub_error* err;
608 
609         if (parity) {
610             ERR("recovering from parity error at %I64x on device %I64x\n", addr, devid);
611         } else {
612             if (metadata)
613                 ERR("recovering from metadata checksum error at %I64x on device %I64x\n", addr, devid);
614             else
615                 ERR("recovering from data checksum error at %I64x on device %I64x\n", addr, devid);
616         }
617 
618         err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
619         if (!err) {
620             ERR("out of memory\n");
621             return;
622         }
623 
624         err->address = addr;
625         err->device = devid;
626         err->recovered = true;
627         err->is_metadata = metadata;
628         err->parity = parity;
629 
630         if (metadata)
631             RtlZeroMemory(&err->metadata, sizeof(err->metadata));
632         else
633             RtlZeroMemory(&err->data, sizeof(err->data));
634 
635         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
636 
637         Vcb->scrub.num_errors++;
638         InsertTailList(&Vcb->scrub.errors, &err->list_entry);
639 
640         ExReleaseResourceLite(&Vcb->scrub.stats_lock);
641     } else {
642         if (metadata)
643             ERR("unrecoverable metadata checksum error at %I64x\n", addr);
644         else
645             ERR("unrecoverable data checksum error at %I64x\n", addr);
646 
647         log_unrecoverable_error(Vcb, addr, devid);
648     }
649 }
650 
651 _Function_class_(IO_COMPLETION_ROUTINE)
652 static NTSTATUS __stdcall scrub_read_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
653     scrub_context_stripe* stripe = conptr;
654     scrub_context* context = (scrub_context*)stripe->context;
655     ULONG left = InterlockedDecrement(&context->stripes_left);
656 
657     UNUSED(DeviceObject);
658 
659     stripe->iosb = Irp->IoStatus;
660 
661     if (left == 0)
662         KeSetEvent(&context->Event, 0, false);
663 
664     return STATUS_MORE_PROCESSING_REQUIRED;
665 }
666 
667 static NTSTATUS scrub_extent_dup(device_extension* Vcb, chunk* c, uint64_t offset, void* csum, scrub_context* context) {
668     NTSTATUS Status;
669     bool csum_error = false;
670     ULONG i;
671     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
672     uint16_t present_devices = 0;
673 
674     if (csum) {
675         ULONG good_stripe = 0xffffffff;
676 
677         for (i = 0; i < c->chunk_item->num_stripes; i++) {
678             if (c->devices[i]->devobj) {
679                 present_devices++;
680 
681                 // if first stripe is okay, we only need to check that the others are identical to it
682                 if (good_stripe != 0xffffffff) {
683                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
684                                         context->stripes[good_stripe].length) != context->stripes[i].length) {
685                         context->stripes[i].csum_error = true;
686                         csum_error = true;
687                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
688                     }
689                 } else {
690                     Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, csum);
691                     if (Status == STATUS_CRC_ERROR) {
692                         context->stripes[i].csum_error = true;
693                         csum_error = true;
694                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
695                     } else if (!NT_SUCCESS(Status)) {
696                         ERR("check_csum returned %08lx\n", Status);
697                         return Status;
698                     } else
699                         good_stripe = i;
700                 }
701             }
702         }
703     } else {
704         ULONG good_stripe = 0xffffffff;
705 
706         for (i = 0; i < c->chunk_item->num_stripes; i++) {
707             ULONG j;
708 
709             if (c->devices[i]->devobj) {
710                 // if first stripe is okay, we only need to check that the others are identical to it
711                 if (good_stripe != 0xffffffff) {
712                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
713                                          context->stripes[good_stripe].length) != context->stripes[i].length) {
714                         context->stripes[i].csum_error = true;
715                         csum_error = true;
716                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
717                     }
718                 } else {
719                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
720                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
721 
722                         if (!check_tree_checksum(Vcb, th) || th->address != offset + UInt32x32To64(j, Vcb->superblock.node_size)) {
723                             context->stripes[i].csum_error = true;
724                             csum_error = true;
725                             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
726                         }
727                     }
728 
729                     if (!context->stripes[i].csum_error)
730                         good_stripe = i;
731                 }
732             }
733         }
734     }
735 
736     if (!csum_error)
737         return STATUS_SUCCESS;
738 
739     // handle checksum error
740 
741     for (i = 0; i < c->chunk_item->num_stripes; i++) {
742         if (context->stripes[i].csum_error) {
743             if (csum) {
744                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * Vcb->csum_size / Vcb->superblock.sector_size, ALLOC_TAG);
745                 if (!context->stripes[i].bad_csums) {
746                     ERR("out of memory\n");
747                     return STATUS_INSUFFICIENT_RESOURCES;
748                 }
749 
750                 do_calc_job(Vcb, context->stripes[i].buf, context->stripes[i].length / Vcb->superblock.sector_size, context->stripes[i].bad_csums);
751             } else {
752                 ULONG j;
753 
754                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[i].length * Vcb->csum_size / Vcb->superblock.node_size, ALLOC_TAG);
755                 if (!context->stripes[i].bad_csums) {
756                     ERR("out of memory\n");
757                     return STATUS_INSUFFICIENT_RESOURCES;
758                 }
759 
760                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
761                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
762 
763                     get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[i].bad_csums + (Vcb->csum_size * j));
764                 }
765             }
766         }
767     }
768 
769     if (present_devices > 1) {
770         ULONG good_stripe = 0xffffffff;
771 
772         for (i = 0; i < c->chunk_item->num_stripes; i++) {
773             if (c->devices[i]->devobj && !context->stripes[i].csum_error) {
774                 good_stripe = i;
775                 break;
776             }
777         }
778 
779         if (good_stripe != 0xffffffff) {
780             // log
781 
782             for (i = 0; i < c->chunk_item->num_stripes; i++) {
783                 if (context->stripes[i].csum_error) {
784                     ULONG j;
785 
786                     if (csum) {
787                         for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
788                             if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
789                                 uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
790 
791                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
792                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
793                             }
794                         }
795                     } else {
796                         for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
797                             tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
798                             uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
799 
800                             if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
801                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
802                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
803                             }
804                         }
805                     }
806                 }
807             }
808 
809             // write good data over bad
810 
811             for (i = 0; i < c->chunk_item->num_stripes; i++) {
812                 if (context->stripes[i].csum_error && !c->devices[i]->readonly) {
813                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
814                                              context->stripes[good_stripe].buf, context->stripes[i].length);
815 
816                     if (!NT_SUCCESS(Status)) {
817                         ERR("write_data_phys returned %08lx\n", Status);
818                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
819                         return Status;
820                     }
821                 }
822             }
823 
824             return STATUS_SUCCESS;
825         }
826 
827         // if csum errors on all stripes, check sector by sector
828 
829         for (i = 0; i < c->chunk_item->num_stripes; i++) {
830             ULONG j;
831 
832             if (c->devices[i]->devobj) {
833                 if (csum) {
834                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
835                         if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
836                             ULONG k;
837                             uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
838                             bool recovered = false;
839 
840                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
841                                 if (i != k && c->devices[k]->devobj &&
842                                     RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size),
843                                                      (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
844                                     log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
845                                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
846 
847                                     RtlCopyMemory(context->stripes[i].buf + (j * Vcb->superblock.sector_size),
848                                                   context->stripes[k].buf + (j * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
849 
850                                     recovered = true;
851                                     break;
852                                 }
853                             }
854 
855                             if (!recovered) {
856                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
857                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
858                             }
859                         }
860                     }
861                 } else {
862                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
863                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
864                         uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
865 
866                         if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
867                             ULONG k;
868                             bool recovered = false;
869 
870                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
871                                 if (i != k && c->devices[k]->devobj) {
872                                     tree_header* th2 = (tree_header*)&context->stripes[k].buf[j * Vcb->superblock.node_size];
873 
874                                     if (RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
875                                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
876                                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
877 
878                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
879 
880                                         recovered = true;
881                                         break;
882                                     }
883                                 }
884                             }
885 
886                             if (!recovered) {
887                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
888                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
889                             }
890                         }
891                     }
892                 }
893             }
894         }
895 
896         // write good data over bad
897 
898         for (i = 0; i < c->chunk_item->num_stripes; i++) {
899             if (c->devices[i]->devobj && !c->devices[i]->readonly) {
900                 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
901                                          context->stripes[i].buf, context->stripes[i].length);
902                 if (!NT_SUCCESS(Status)) {
903                     ERR("write_data_phys returned %08lx\n", Status);
904                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
905                     return Status;
906                 }
907             }
908         }
909 
910         return STATUS_SUCCESS;
911     }
912 
913     for (i = 0; i < c->chunk_item->num_stripes; i++) {
914         if (c->devices[i]->devobj) {
915             ULONG j;
916 
917             if (csum) {
918                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.sector_size; j++) {
919                     if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
920                         uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.sector_size);
921 
922                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
923                     }
924                 }
925             } else {
926                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
927                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
928                     uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
929 
930                     if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr)
931                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
932                 }
933             }
934         }
935     }
936 
937     return STATUS_SUCCESS;
938 }
939 
940 static NTSTATUS scrub_extent_raid0(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
941     ULONG j;
942     uint16_t stripe;
943     uint32_t pos, *stripeoff;
944 
945     pos = 0;
946     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes, ALLOC_TAG);
947     if (!stripeoff) {
948         ERR("out of memory\n");
949         return STATUS_INSUFFICIENT_RESOURCES;
950     }
951 
952     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes);
953 
954     stripe = startoffstripe;
955     while (pos < length) {
956         uint32_t readlen;
957 
958         if (pos == 0)
959             readlen = (uint32_t)min(context->stripes[stripe].length, c->chunk_item->stripe_length - (context->stripes[stripe].start % c->chunk_item->stripe_length));
960         else
961             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
962 
963         if (csum) {
964             for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
965                 if (!check_sector_csum(Vcb, context->stripes[stripe].buf + stripeoff[stripe], (uint8_t*)csum + (pos * Vcb->csum_size / Vcb->superblock.sector_size))) {
966                     uint64_t addr = offset + pos;
967 
968                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
969                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
970                 }
971 
972                 pos += Vcb->superblock.sector_size;
973                 stripeoff[stripe] += Vcb->superblock.sector_size;
974             }
975         } else {
976             for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
977                 tree_header* th = (tree_header*)(context->stripes[stripe].buf + stripeoff[stripe]);
978                 uint64_t addr = offset + pos;
979 
980                 if (!check_tree_checksum(Vcb, th) || th->address != addr) {
981                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
982                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
983                 }
984 
985                 pos += Vcb->superblock.node_size;
986                 stripeoff[stripe] += Vcb->superblock.node_size;
987             }
988         }
989 
990         stripe = (stripe + 1) % c->chunk_item->num_stripes;
991     }
992 
993     ExFreePool(stripeoff);
994 
995     return STATUS_SUCCESS;
996 }
997 
998 static NTSTATUS scrub_extent_raid10(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
999     ULONG j;
1000     uint16_t stripe, sub_stripes = max(c->chunk_item->sub_stripes, 1);
1001     uint32_t pos, *stripeoff;
1002     bool csum_error = false;
1003     NTSTATUS Status;
1004 
1005     pos = 0;
1006     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes, ALLOC_TAG);
1007     if (!stripeoff) {
1008         ERR("out of memory\n");
1009         return STATUS_INSUFFICIENT_RESOURCES;
1010     }
1011 
1012     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes);
1013 
1014     stripe = startoffstripe;
1015     while (pos < length) {
1016         uint32_t readlen;
1017 
1018         if (pos == 0)
1019             readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1020                                   c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1021         else
1022             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1023 
1024         if (csum) {
1025             ULONG good_stripe = 0xffffffff;
1026             uint16_t k;
1027 
1028             for (k = 0; k < sub_stripes; k++) {
1029                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1030                     // if first stripe is okay, we only need to check that the others are identical to it
1031                     if (good_stripe != 0xffffffff) {
1032                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1033                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1034                                             readlen) != readlen) {
1035                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1036                             csum_error = true;
1037                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1038                         }
1039                     } else {
1040                         for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
1041                             if (!check_sector_csum(Vcb, context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j,
1042                                                    (uint8_t*)csum + ((pos + j) * Vcb->csum_size / Vcb->superblock.sector_size))) {
1043                                 csum_error = true;
1044                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1045                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1046                                 break;
1047                             }
1048                         }
1049 
1050                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1051                             good_stripe = k;
1052                     }
1053                 }
1054             }
1055 
1056             pos += readlen;
1057             stripeoff[stripe] += readlen;
1058         } else {
1059             ULONG good_stripe = 0xffffffff;
1060             uint16_t k;
1061 
1062             for (k = 0; k < sub_stripes; k++) {
1063                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1064                     // if first stripe is okay, we only need to check that the others are identical to it
1065                     if (good_stripe != 0xffffffff) {
1066                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1067                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1068                                             readlen) != readlen) {
1069                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1070                             csum_error = true;
1071                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1072                         }
1073                     } else {
1074                         for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
1075                             tree_header* th = (tree_header*)(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j);
1076                             uint64_t addr = offset + pos + j;
1077 
1078                             if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1079                                 csum_error = true;
1080                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1081                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1082                                 break;
1083                             }
1084                         }
1085 
1086                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1087                             good_stripe = k;
1088                     }
1089                 }
1090             }
1091 
1092             pos += readlen;
1093             stripeoff[stripe] += readlen;
1094         }
1095 
1096         stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1097     }
1098 
1099     if (!csum_error) {
1100         Status = STATUS_SUCCESS;
1101         goto end;
1102     }
1103 
1104     for (j = 0; j < c->chunk_item->num_stripes; j += sub_stripes) {
1105         ULONG goodstripe = 0xffffffff;
1106         uint16_t k;
1107         bool hasbadstripe = false;
1108 
1109         if (context->stripes[j].length == 0)
1110             continue;
1111 
1112         for (k = 0; k < sub_stripes; k++) {
1113             if (c->devices[j + k]->devobj) {
1114                 if (!context->stripes[j + k].csum_error)
1115                     goodstripe = k;
1116                 else
1117                     hasbadstripe = true;
1118             }
1119         }
1120 
1121         if (hasbadstripe) {
1122             if (goodstripe != 0xffffffff) {
1123                 for (k = 0; k < sub_stripes; k++) {
1124                     if (c->devices[j + k]->devobj && context->stripes[j + k].csum_error) {
1125                         uint32_t so = 0;
1126                         bool recovered = false;
1127 
1128                         pos = 0;
1129 
1130                         stripe = startoffstripe;
1131                         while (pos < length) {
1132                             uint32_t readlen;
1133 
1134                             if (pos == 0)
1135                                 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1136                                               c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1137                             else
1138                                 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1139 
1140                             if (stripe == j / sub_stripes) {
1141                                 if (csum) {
1142                                     ULONG l;
1143 
1144                                     for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1145                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1146                                                              context->stripes[j + goodstripe].buf + so,
1147                                                              Vcb->superblock.sector_size) != Vcb->superblock.sector_size) {
1148                                             uint64_t addr = offset + pos;
1149 
1150                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1151 
1152                                             recovered = true;
1153                                         }
1154 
1155                                         pos += Vcb->superblock.sector_size;
1156                                         so += Vcb->superblock.sector_size;
1157                                     }
1158                                 } else {
1159                                     ULONG l;
1160 
1161                                     for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1162                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1163                                                             context->stripes[j + goodstripe].buf + so,
1164                                                             Vcb->superblock.node_size) != Vcb->superblock.node_size) {
1165                                             uint64_t addr = offset + pos;
1166 
1167                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1168 
1169                                             recovered = true;
1170                                         }
1171 
1172                                         pos += Vcb->superblock.node_size;
1173                                         so += Vcb->superblock.node_size;
1174                                     }
1175                                 }
1176                             } else
1177                                 pos += readlen;
1178 
1179                             stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1180                         }
1181 
1182                         if (recovered) {
1183                             // write good data over bad
1184 
1185                             if (!c->devices[j + k]->readonly) {
1186                                 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1187 
1188                                 Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1189                                                          context->stripes[j + goodstripe].buf, context->stripes[j + goodstripe].length);
1190 
1191                                 if (!NT_SUCCESS(Status)) {
1192                                     ERR("write_data_phys returned %08lx\n", Status);
1193                                     log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1194                                     goto end;
1195                                 }
1196                             }
1197                         }
1198                     }
1199                 }
1200             } else {
1201                 uint32_t so = 0;
1202                 bool recovered = false;
1203 
1204                 if (csum) {
1205                     for (k = 0; k < sub_stripes; k++) {
1206                         if (c->devices[j + k]->devobj) {
1207                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * Vcb->csum_size / Vcb->superblock.sector_size,
1208                                                                                       ALLOC_TAG);
1209                             if (!context->stripes[j + k].bad_csums) {
1210                                 ERR("out of memory\n");
1211                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1212                                 goto end;
1213                             }
1214 
1215                             do_calc_job(Vcb, context->stripes[j + k].buf, context->stripes[j + k].length / Vcb->superblock.sector_size, context->stripes[j + k].bad_csums);
1216                         }
1217                     }
1218                 } else {
1219                     for (k = 0; k < sub_stripes; k++) {
1220                         if (c->devices[j + k]->devobj) {
1221                             ULONG l;
1222 
1223                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * Vcb->csum_size / Vcb->superblock.node_size,
1224                                                                                       ALLOC_TAG);
1225                             if (!context->stripes[j + k].bad_csums) {
1226                                 ERR("out of memory\n");
1227                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1228                                 goto end;
1229                             }
1230 
1231                             for (l = 0; l < context->stripes[j + k].length / Vcb->superblock.node_size; l++) {
1232                                 tree_header* th = (tree_header*)&context->stripes[j + k].buf[l * Vcb->superblock.node_size];
1233 
1234                                 get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[j + k].bad_csums + (Vcb->csum_size * l));
1235                             }
1236                         }
1237                     }
1238                 }
1239 
1240                 pos = 0;
1241 
1242                 stripe = startoffstripe;
1243                 while (pos < length) {
1244                     uint32_t readlen;
1245 
1246                     if (pos == 0)
1247                         readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1248                                       c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1249                     else
1250                         readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1251 
1252                     if (stripe == j / sub_stripes) {
1253                         ULONG l;
1254 
1255                         if (csum) {
1256                             for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1257                                 bool has_error = false;
1258 
1259                                 goodstripe = 0xffffffff;
1260                                 for (k = 0; k < sub_stripes; k++) {
1261                                     if (c->devices[j + k]->devobj) {
1262                                         if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + (so * Vcb->csum_size / Vcb->superblock.sector_size),
1263                                             (uint8_t*)csum + (pos * Vcb->csum_size / Vcb->superblock.sector_size),
1264                                             Vcb->csum_size) != Vcb->csum_size) {
1265                                             has_error = true;
1266                                         } else
1267                                             goodstripe = k;
1268                                     }
1269                                 }
1270 
1271                                 if (has_error) {
1272                                     if (goodstripe != 0xffffffff) {
1273                                         for (k = 0; k < sub_stripes; k++) {
1274                                             if (c->devices[j + k]->devobj &&
1275                                                 RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + (so * Vcb->csum_size / Vcb->superblock.sector_size),
1276                                                                  (uint8_t*)csum + (pos * Vcb->csum_size / Vcb->superblock.sector_size),
1277                                                                  Vcb->csum_size) != Vcb->csum_size) {
1278                                                 uint64_t addr = offset + pos;
1279 
1280                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1281 
1282                                                 recovered = true;
1283 
1284                                                 RtlCopyMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so,
1285                                                               Vcb->superblock.sector_size);
1286                                             }
1287                                         }
1288                                     } else {
1289                                         uint64_t addr = offset + pos;
1290 
1291                                         for (k = 0; k < sub_stripes; k++) {
1292                                             if (c->devices[j + j]->devobj) {
1293                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, false, false);
1294                                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1295                                             }
1296                                         }
1297                                     }
1298                                 }
1299 
1300                                 pos += Vcb->superblock.sector_size;
1301                                 so += Vcb->superblock.sector_size;
1302                             }
1303                         } else {
1304                             for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1305                                 for (k = 0; k < sub_stripes; k++) {
1306                                     if (c->devices[j + k]->devobj) {
1307                                         tree_header* th = (tree_header*)&context->stripes[j + k].buf[so];
1308                                         uint64_t addr = offset + pos;
1309 
1310                                         if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
1311                                             ULONG m;
1312 
1313                                             recovered = false;
1314 
1315                                             for (m = 0; m < sub_stripes; m++) {
1316                                                 if (m != k) {
1317                                                     tree_header* th2 = (tree_header*)&context->stripes[j + m].buf[so];
1318 
1319                                                     if (RtlCompareMemory((uint8_t*)context->stripes[j + m].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
1320                                                         log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1321 
1322                                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
1323 
1324                                                         recovered = true;
1325                                                         break;
1326                                                     } else
1327                                                         log_device_error(Vcb, c->devices[j + m], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1328                                                 }
1329                                             }
1330 
1331                                             if (!recovered)
1332                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, false, false);
1333                                         }
1334                                     }
1335                                 }
1336 
1337                                 pos += Vcb->superblock.node_size;
1338                                 so += Vcb->superblock.node_size;
1339                             }
1340                         }
1341                     } else
1342                         pos += readlen;
1343 
1344                     stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1345                 }
1346 
1347                 if (recovered) {
1348                     // write good data over bad
1349 
1350                     for (k = 0; k < sub_stripes; k++) {
1351                         if (c->devices[j + k]->devobj && !c->devices[j + k]->readonly) {
1352                             CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1353 
1354                             Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1355                                                      context->stripes[j + k].buf, context->stripes[j + k].length);
1356 
1357                             if (!NT_SUCCESS(Status)) {
1358                                 ERR("write_data_phys returned %08lx\n", Status);
1359                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1360                                 goto end;
1361                             }
1362                         }
1363                     }
1364                 }
1365             }
1366         }
1367     }
1368 
1369     Status = STATUS_SUCCESS;
1370 
1371 end:
1372     ExFreePool(stripeoff);
1373 
1374     return Status;
1375 }
1376 
1377 static NTSTATUS scrub_extent(device_extension* Vcb, chunk* c, ULONG type, uint64_t offset, uint32_t size, void* csum) {
1378     ULONG i;
1379     scrub_context context;
1380     CHUNK_ITEM_STRIPE* cis;
1381     NTSTATUS Status;
1382     uint16_t startoffstripe, num_missing, allowed_missing;
1383 
1384     TRACE("(%p, %p, %lx, %I64x, %x, %p)\n", Vcb, c, type, offset, size, csum);
1385 
1386     context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
1387     if (!context.stripes) {
1388         ERR("out of memory\n");
1389         Status = STATUS_INSUFFICIENT_RESOURCES;
1390         goto end;
1391     }
1392 
1393     RtlZeroMemory(context.stripes, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes);
1394 
1395     context.stripes_left = 0;
1396 
1397     cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1398 
1399     if (type == BLOCK_FLAG_RAID0) {
1400         uint64_t startoff, endoff;
1401         uint16_t endoffstripe;
1402 
1403         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &startoff, &startoffstripe);
1404         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &endoff, &endoffstripe);
1405 
1406         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1407             if (startoffstripe > i)
1408                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1409             else if (startoffstripe == i)
1410                 context.stripes[i].start = startoff;
1411             else
1412                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1413 
1414             if (endoffstripe > i)
1415                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1416             else if (endoffstripe == i)
1417                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1418             else
1419                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1420         }
1421 
1422         allowed_missing = 0;
1423     } else if (type == BLOCK_FLAG_RAID10) {
1424         uint64_t startoff, endoff;
1425         uint16_t endoffstripe, j, sub_stripes = max(c->chunk_item->sub_stripes, 1);
1426 
1427         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &startoff, &startoffstripe);
1428         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &endoff, &endoffstripe);
1429 
1430         if ((c->chunk_item->num_stripes % sub_stripes) != 0) {
1431             ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", c->offset, c->chunk_item->num_stripes, sub_stripes);
1432             Status = STATUS_INTERNAL_ERROR;
1433             goto end;
1434         }
1435 
1436         startoffstripe *= sub_stripes;
1437         endoffstripe *= sub_stripes;
1438 
1439         for (i = 0; i < c->chunk_item->num_stripes; i += sub_stripes) {
1440             if (startoffstripe > i)
1441                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1442             else if (startoffstripe == i)
1443                 context.stripes[i].start = startoff;
1444             else
1445                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1446 
1447             if (endoffstripe > i)
1448                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1449             else if (endoffstripe == i)
1450                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1451             else
1452                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1453 
1454             for (j = 1; j < sub_stripes; j++) {
1455                 context.stripes[i+j].start = context.stripes[i].start;
1456                 context.stripes[i+j].length = context.stripes[i].length;
1457             }
1458         }
1459 
1460         startoffstripe /= sub_stripes;
1461         allowed_missing = 1;
1462     } else
1463         allowed_missing = c->chunk_item->num_stripes - 1;
1464 
1465     num_missing = 0;
1466 
1467     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1468         PIO_STACK_LOCATION IrpSp;
1469 
1470         context.stripes[i].context = (struct _scrub_context*)&context;
1471 
1472         if (type == BLOCK_FLAG_DUPLICATE) {
1473             context.stripes[i].start = offset - c->offset;
1474             context.stripes[i].length = size;
1475         } else if (type != BLOCK_FLAG_RAID0 && type != BLOCK_FLAG_RAID10) {
1476             ERR("unexpected chunk type %lx\n", type);
1477             Status = STATUS_INTERNAL_ERROR;
1478             goto end;
1479         }
1480 
1481         if (!c->devices[i]->devobj) {
1482             num_missing++;
1483 
1484             if (num_missing > allowed_missing) {
1485                 ERR("too many missing devices (at least %u, maximum allowed %u)\n", num_missing, allowed_missing);
1486                 Status = STATUS_INTERNAL_ERROR;
1487                 goto end;
1488             }
1489         } else if (context.stripes[i].length > 0) {
1490             context.stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1491 
1492             if (!context.stripes[i].buf) {
1493                 ERR("out of memory\n");
1494                 Status = STATUS_INSUFFICIENT_RESOURCES;
1495                 goto end;
1496             }
1497 
1498             context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
1499 
1500             if (!context.stripes[i].Irp) {
1501                 ERR("IoAllocateIrp failed\n");
1502                 Status = STATUS_INSUFFICIENT_RESOURCES;
1503                 goto end;
1504             }
1505 
1506             IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
1507             IrpSp->MajorFunction = IRP_MJ_READ;
1508             IrpSp->FileObject = c->devices[i]->fileobj;
1509 
1510             if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
1511                 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1512                 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
1513                     ERR("out of memory\n");
1514                     Status = STATUS_INSUFFICIENT_RESOURCES;
1515                     goto end;
1516                 }
1517 
1518                 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
1519 
1520                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1521             } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
1522                 context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, context.stripes[i].length, false, false, NULL);
1523                 if (!context.stripes[i].Irp->MdlAddress) {
1524                     ERR("IoAllocateMdl failed\n");
1525                     Status = STATUS_INSUFFICIENT_RESOURCES;
1526                     goto end;
1527                 }
1528 
1529                 Status = STATUS_SUCCESS;
1530 
1531                 _SEH2_TRY {
1532                     MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
1533                 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1534                     Status = _SEH2_GetExceptionCode();
1535                 } _SEH2_END;
1536 
1537                 if (!NT_SUCCESS(Status)) {
1538                     ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1539                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1540                     context.stripes[i].Irp->MdlAddress = NULL;
1541                     goto end;
1542                 }
1543             } else
1544                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1545 
1546             IrpSp->Parameters.Read.Length = context.stripes[i].length;
1547             IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].start + cis[i].offset;
1548 
1549             context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
1550 
1551             IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion, &context.stripes[i], true, true, true);
1552 
1553             context.stripes_left++;
1554 
1555             Vcb->scrub.data_scrubbed += context.stripes[i].length;
1556         }
1557     }
1558 
1559     if (context.stripes_left == 0) {
1560         ERR("error - not reading any stripes\n");
1561         Status = STATUS_INTERNAL_ERROR;
1562         goto end;
1563     }
1564 
1565     KeInitializeEvent(&context.Event, NotificationEvent, false);
1566 
1567     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1568         if (c->devices[i]->devobj && context.stripes[i].length > 0)
1569             IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
1570     }
1571 
1572     KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
1573 
1574     // return an error if any of the stripes returned an error
1575     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1576         if (!NT_SUCCESS(context.stripes[i].iosb.Status)) {
1577             Status = context.stripes[i].iosb.Status;
1578             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
1579             goto end;
1580         }
1581     }
1582 
1583     if (type == BLOCK_FLAG_DUPLICATE) {
1584         Status = scrub_extent_dup(Vcb, c, offset, csum, &context);
1585         if (!NT_SUCCESS(Status)) {
1586             ERR("scrub_extent_dup returned %08lx\n", Status);
1587             goto end;
1588         }
1589     } else if (type == BLOCK_FLAG_RAID0) {
1590         Status = scrub_extent_raid0(Vcb, c, offset, size, startoffstripe, csum, &context);
1591         if (!NT_SUCCESS(Status)) {
1592             ERR("scrub_extent_raid0 returned %08lx\n", Status);
1593             goto end;
1594         }
1595     } else if (type == BLOCK_FLAG_RAID10) {
1596         Status = scrub_extent_raid10(Vcb, c, offset, size, startoffstripe, csum, &context);
1597         if (!NT_SUCCESS(Status)) {
1598             ERR("scrub_extent_raid10 returned %08lx\n", Status);
1599             goto end;
1600         }
1601     }
1602 
1603 end:
1604     if (context.stripes) {
1605         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1606             if (context.stripes[i].Irp) {
1607                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
1608                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
1609                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1610                 }
1611                 IoFreeIrp(context.stripes[i].Irp);
1612             }
1613 
1614             if (context.stripes[i].buf)
1615                 ExFreePool(context.stripes[i].buf);
1616 
1617             if (context.stripes[i].bad_csums)
1618                 ExFreePool(context.stripes[i].bad_csums);
1619         }
1620 
1621         ExFreePool(context.stripes);
1622     }
1623 
1624     return Status;
1625 }
1626 
1627 static NTSTATUS scrub_data_extent(device_extension* Vcb, chunk* c, uint64_t offset, ULONG type, void* csum, RTL_BITMAP* bmp, ULONG bmplen) {
1628     NTSTATUS Status;
1629     ULONG runlength, index;
1630 
1631     runlength = RtlFindFirstRunClear(bmp, &index);
1632 
1633     while (runlength != 0) {
1634         if (index >= bmplen)
1635             break;
1636 
1637         if (index + runlength >= bmplen) {
1638             runlength = bmplen - index;
1639 
1640             if (runlength == 0)
1641                 break;
1642         }
1643 
1644         do {
1645             ULONG rl;
1646 
1647             if (runlength * Vcb->superblock.sector_size > SCRUB_UNIT)
1648                 rl = SCRUB_UNIT / Vcb->superblock.sector_size;
1649             else
1650                 rl = runlength;
1651 
1652             Status = scrub_extent(Vcb, c, type, offset + UInt32x32To64(index, Vcb->superblock.sector_size),
1653                                   rl * Vcb->superblock.sector_size, (uint8_t*)csum + (index * Vcb->csum_size));
1654             if (!NT_SUCCESS(Status)) {
1655                 ERR("scrub_data_extent_dup returned %08lx\n", Status);
1656                 return Status;
1657             }
1658 
1659             runlength -= rl;
1660             index += rl;
1661         } while (runlength > 0);
1662 
1663         runlength = RtlFindNextForwardRunClear(bmp, index, &index);
1664     }
1665 
1666     return STATUS_SUCCESS;
1667 }
1668 
1669 typedef struct {
1670     uint8_t* buf;
1671     PIRP Irp;
1672     void* context;
1673     IO_STATUS_BLOCK iosb;
1674     uint64_t offset;
1675     bool rewrite, missing;
1676     RTL_BITMAP error;
1677     ULONG* errorarr;
1678 } scrub_context_raid56_stripe;
1679 
1680 typedef struct {
1681     scrub_context_raid56_stripe* stripes;
1682     LONG stripes_left;
1683     KEVENT Event;
1684     RTL_BITMAP alloc;
1685     RTL_BITMAP has_csum;
1686     RTL_BITMAP is_tree;
1687     void* csum;
1688     uint8_t* parity_scratch;
1689     uint8_t* parity_scratch2;
1690 } scrub_context_raid56;
1691 
1692 _Function_class_(IO_COMPLETION_ROUTINE)
1693 static NTSTATUS __stdcall scrub_read_completion_raid56(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
1694     scrub_context_raid56_stripe* stripe = conptr;
1695     scrub_context_raid56* context = (scrub_context_raid56*)stripe->context;
1696     LONG left = InterlockedDecrement(&context->stripes_left);
1697 
1698     UNUSED(DeviceObject);
1699 
1700     stripe->iosb = Irp->IoStatus;
1701 
1702     if (left == 0)
1703         KeSetEvent(&context->Event, 0, false);
1704 
1705     return STATUS_MORE_PROCESSING_REQUIRED;
1706 }
1707 
1708 static void scrub_raid5_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1709                                uint64_t num, uint16_t missing_devices) {
1710     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off;
1711     uint16_t stripe, parity = (bit_start + num + c->chunk_item->num_stripes - 1) % c->chunk_item->num_stripes;
1712     uint64_t stripeoff;
1713 
1714     stripe = (parity + 1) % c->chunk_item->num_stripes;
1715     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1);
1716     stripeoff = num * sectors_per_stripe;
1717 
1718     if (missing_devices == 0)
1719         RtlCopyMemory(context->parity_scratch, &context->stripes[parity].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1720 
1721     while (stripe != parity) {
1722         RtlClearAllBits(&context->stripes[stripe].error);
1723 
1724         for (i = 0; i < sectors_per_stripe; i++) {
1725             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1726                 if (RtlCheckBit(&context->is_tree, off)) {
1727                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size];
1728                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1729 
1730                     if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1731                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size);
1732                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1733 
1734                         if (missing_devices > 0)
1735                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1736                     }
1737 
1738                     off += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1739                     stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1740                     i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1;
1741 
1742                     continue;
1743                 } else if (RtlCheckBit(&context->has_csum, off)) {
1744                     if (!check_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), (uint8_t*)context->csum + (Vcb->csum_size * off))) {
1745                         RtlSetBit(&context->stripes[stripe].error, i);
1746                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1747 
1748                         if (missing_devices > 0) {
1749                             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1750 
1751                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1752                         }
1753                     }
1754                 }
1755             }
1756 
1757             off++;
1758             stripeoff++;
1759         }
1760 
1761         if (missing_devices == 0)
1762             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1763 
1764         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1765         stripeoff = num * sectors_per_stripe;
1766     }
1767 
1768     // check parity
1769 
1770     if (missing_devices == 0) {
1771         RtlClearAllBits(&context->stripes[parity].error);
1772 
1773         for (i = 0; i < sectors_per_stripe; i++) {
1774             ULONG o, j;
1775 
1776             o = i * Vcb->superblock.sector_size;
1777             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1778                 if (context->parity_scratch[o] != 0) {
1779                     RtlSetBit(&context->stripes[parity].error, i);
1780                     break;
1781                 }
1782                 o++;
1783             }
1784         }
1785     }
1786 
1787     // log and fix errors
1788 
1789     if (missing_devices > 0)
1790         return;
1791 
1792     for (i = 0; i < sectors_per_stripe; i++) {
1793         ULONG num_errors = 0, bad_off;
1794         uint64_t bad_stripe;
1795         bool alloc = false;
1796 
1797         stripe = (parity + 1) % c->chunk_item->num_stripes;
1798         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1799 
1800         while (stripe != parity) {
1801             if (RtlCheckBit(&context->alloc, off)) {
1802                 alloc = true;
1803 
1804                 if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1805                     bad_stripe = stripe;
1806                     bad_off = off;
1807                     num_errors++;
1808                 }
1809             }
1810 
1811             off += sectors_per_stripe;
1812             stripe = (stripe + 1) % c->chunk_item->num_stripes;
1813         }
1814 
1815         if (!alloc)
1816             continue;
1817 
1818         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity].error, i)) // everything fine
1819             continue;
1820 
1821         if (num_errors == 0 && RtlCheckBit(&context->stripes[parity].error, i)) { // parity error
1822             uint64_t addr;
1823 
1824             do_xor(&context->stripes[parity].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1825                    &context->parity_scratch[i * Vcb->superblock.sector_size],
1826                    Vcb->superblock.sector_size);
1827 
1828             bad_off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1829             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size);
1830 
1831             context->stripes[parity].rewrite = true;
1832 
1833             log_error(Vcb, addr, c->devices[parity]->devitem.dev_id, false, true, true);
1834             log_device_error(Vcb, c->devices[parity], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1835         } else if (num_errors == 1) {
1836             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off * Vcb->superblock.sector_size);
1837 
1838             if (RtlCheckBit(&context->is_tree, bad_off)) {
1839                 tree_header* th;
1840 
1841                 do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
1842                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1843                        Vcb->superblock.node_size);
1844 
1845                 th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
1846 
1847                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
1848                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1849                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
1850 
1851                     context->stripes[bad_stripe].rewrite = true;
1852 
1853                     RtlClearBits(&context->stripes[bad_stripe].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
1854 
1855                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, true, false);
1856                 } else
1857                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, false, false);
1858             } else {
1859                 uint8_t hash[MAX_HASH_SIZE];
1860 
1861                 do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
1862                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1863                        Vcb->superblock.sector_size);
1864 
1865                 get_sector_csum(Vcb, &context->parity_scratch[i * Vcb->superblock.sector_size], hash);
1866 
1867                 if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * bad_off), Vcb->csum_size) == Vcb->csum_size) {
1868                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1869                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
1870 
1871                     context->stripes[bad_stripe].rewrite = true;
1872 
1873                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, true, false);
1874                 } else
1875                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, false, false);
1876             }
1877         } else {
1878             stripe = (parity + 1) % c->chunk_item->num_stripes;
1879             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1880 
1881             while (stripe != parity) {
1882                 if (RtlCheckBit(&context->alloc, off)) {
1883                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1884                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1885 
1886                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
1887                     }
1888                 }
1889 
1890                 off += sectors_per_stripe;
1891                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
1892             }
1893         }
1894     }
1895 }
1896 
1897 static void scrub_raid6_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1898                                uint64_t num, uint16_t missing_devices) {
1899     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size), i, off;
1900     uint16_t stripe, parity1 = (bit_start + num + c->chunk_item->num_stripes - 2) % c->chunk_item->num_stripes;
1901     uint16_t parity2 = (parity1 + 1) % c->chunk_item->num_stripes;
1902     uint64_t stripeoff;
1903 
1904     stripe = (parity1 + 2) % c->chunk_item->num_stripes;
1905     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2);
1906     stripeoff = num * sectors_per_stripe;
1907 
1908     if (c->devices[parity1]->devobj)
1909         RtlCopyMemory(context->parity_scratch, &context->stripes[parity1].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1910 
1911     if (c->devices[parity2]->devobj)
1912         RtlZeroMemory(context->parity_scratch2, (ULONG)c->chunk_item->stripe_length);
1913 
1914     while (stripe != parity1) {
1915         RtlClearAllBits(&context->stripes[stripe].error);
1916 
1917         for (i = 0; i < sectors_per_stripe; i++) {
1918             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1919                 if (RtlCheckBit(&context->is_tree, off)) {
1920                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff * Vcb->superblock.sector_size];
1921                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1922 
1923                     if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1924                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size / Vcb->superblock.sector_size);
1925                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1926 
1927                         if (missing_devices == 2)
1928                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1929                     }
1930 
1931                     off += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1932                     stripeoff += Vcb->superblock.node_size / Vcb->superblock.sector_size;
1933                     i += (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1;
1934 
1935                     continue;
1936                 } else if (RtlCheckBit(&context->has_csum, off)) {
1937                     uint8_t hash[MAX_HASH_SIZE];
1938 
1939                     get_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff * Vcb->superblock.sector_size), hash);
1940 
1941                     if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * off), Vcb->csum_size) != Vcb->csum_size) {
1942                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
1943 
1944                         RtlSetBit(&context->stripes[stripe].error, i);
1945                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1946 
1947                         if (missing_devices == 2)
1948                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1949                     }
1950                 }
1951             }
1952 
1953             off++;
1954             stripeoff++;
1955         }
1956 
1957         if (c->devices[parity1]->devobj)
1958             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
1959 
1960         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1961         stripeoff = num * sectors_per_stripe;
1962     }
1963 
1964     RtlClearAllBits(&context->stripes[parity1].error);
1965 
1966     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity2]->devobj)) {
1967         // check parity 1
1968 
1969         for (i = 0; i < sectors_per_stripe; i++) {
1970             ULONG o, j;
1971 
1972             o = i * Vcb->superblock.sector_size;
1973             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1974                 if (context->parity_scratch[o] != 0) {
1975                     RtlSetBit(&context->stripes[parity1].error, i);
1976                     break;
1977                 }
1978                 o++;
1979             }
1980         }
1981     }
1982 
1983     RtlClearAllBits(&context->stripes[parity2].error);
1984 
1985     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity1]->devobj)) {
1986         // check parity 2
1987 
1988         stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
1989 
1990         while (stripe != parity2) {
1991             galois_double(context->parity_scratch2, (uint32_t)c->chunk_item->stripe_length);
1992             do_xor(context->parity_scratch2, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
1993 
1994             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
1995         }
1996 
1997         for (i = 0; i < sectors_per_stripe; i++) {
1998             if (RtlCompareMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
1999                                 &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size) != Vcb->superblock.sector_size)
2000                 RtlSetBit(&context->stripes[parity2].error, i);
2001         }
2002     }
2003 
2004     if (missing_devices == 2)
2005         return;
2006 
2007     // log and fix errors
2008 
2009     for (i = 0; i < sectors_per_stripe; i++) {
2010         ULONG num_errors = 0;
2011         uint64_t bad_stripe1, bad_stripe2;
2012         ULONG bad_off1, bad_off2;
2013         bool alloc = false;
2014 
2015         stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2016         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2017 
2018         while (stripe != parity1) {
2019             if (RtlCheckBit(&context->alloc, off)) {
2020                 alloc = true;
2021 
2022                 if (!c->devices[stripe]->devobj || RtlCheckBit(&context->stripes[stripe].error, i)) {
2023                     if (num_errors == 0) {
2024                         bad_stripe1 = stripe;
2025                         bad_off1 = off;
2026                     } else if (num_errors == 1) {
2027                         bad_stripe2 = stripe;
2028                         bad_off2 = off;
2029                     }
2030                     num_errors++;
2031                 }
2032             }
2033 
2034             off += sectors_per_stripe;
2035             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2036         }
2037 
2038         if (!alloc)
2039             continue;
2040 
2041         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity1].error, i) && !RtlCheckBit(&context->stripes[parity2].error, i)) // everything fine
2042             continue;
2043 
2044         if (num_errors == 0) { // parity error
2045             uint64_t addr;
2046 
2047             if (RtlCheckBit(&context->stripes[parity1].error, i)) {
2048                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2049                        &context->parity_scratch[i * Vcb->superblock.sector_size],
2050                        Vcb->superblock.sector_size);
2051 
2052                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2053                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2054 
2055                 context->stripes[parity1].rewrite = true;
2056 
2057                 log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2058                 log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2059             }
2060 
2061             if (RtlCheckBit(&context->stripes[parity2].error, i)) {
2062                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2063                               &context->parity_scratch2[i * Vcb->superblock.sector_size],
2064                               Vcb->superblock.sector_size);
2065 
2066                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2067                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2068 
2069                 context->stripes[parity2].rewrite = true;
2070 
2071                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2072                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2073             }
2074         } else if (num_errors == 1) {
2075             uint32_t len;
2076             uint16_t stripe_num, bad_stripe_num;
2077             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2078             uint8_t* scratch;
2079 
2080             len = RtlCheckBit(&context->is_tree, bad_off1)? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2081 
2082             scratch = ExAllocatePoolWithTag(PagedPool, len, ALLOC_TAG);
2083             if (!scratch) {
2084                 ERR("out of memory\n");
2085                 return;
2086             }
2087 
2088             RtlZeroMemory(scratch, len);
2089 
2090             do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
2091                    &context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2092 
2093             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2094 
2095             if (c->devices[parity2]->devobj) {
2096                 stripe_num = c->chunk_item->num_stripes - 3;
2097                 while (stripe != parity2) {
2098                     galois_double(scratch, len);
2099 
2100                     if (stripe != bad_stripe1)
2101                         do_xor(scratch, &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2102                     else
2103                         bad_stripe_num = stripe_num;
2104 
2105                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2106                     stripe_num--;
2107                 }
2108 
2109                 do_xor(scratch, &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2110 
2111                 if (bad_stripe_num != 0)
2112                     galois_divpower(scratch, (uint8_t)bad_stripe_num, len);
2113             }
2114 
2115             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2116                 uint8_t hash1[MAX_HASH_SIZE];
2117                 uint8_t hash2[MAX_HASH_SIZE];
2118                 tree_header *th1 = NULL, *th2 = NULL;
2119 
2120                 if (c->devices[parity1]->devobj) {
2121                     th1 = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
2122                     get_tree_checksum(Vcb, th1, hash1);
2123                 }
2124 
2125                 if (c->devices[parity2]->devobj) {
2126                     th2 = (tree_header*)scratch;
2127                     get_tree_checksum(Vcb, th2, hash2);
2128                 }
2129 
2130                 if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, th1, Vcb->csum_size) == Vcb->csum_size && th1->address == addr) ||
2131                     (c->devices[parity2]->devobj && RtlCompareMemory(hash2, th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr)) {
2132                     if (!c->devices[parity1]->devobj || RtlCompareMemory(hash1, th1, Vcb->csum_size) != Vcb->csum_size || th1->address != addr) {
2133                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2134                                       scratch, Vcb->superblock.node_size);
2135 
2136                         if (c->devices[parity1]->devobj) {
2137                             // fix parity 1
2138 
2139                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2140 
2141                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2142                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2143                                           Vcb->superblock.node_size);
2144 
2145                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2146 
2147                             while (stripe != parity1) {
2148                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2149                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2150                                        Vcb->superblock.node_size);
2151 
2152                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2153                             }
2154 
2155                             context->stripes[parity1].rewrite = true;
2156 
2157                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2158                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2159                         }
2160                     } else {
2161                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2162                                       &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2163 
2164                         if (!c->devices[parity2]->devobj || RtlCompareMemory(hash2, th2, Vcb->csum_size) != Vcb->csum_size || th2->address != addr) {
2165                             // fix parity 2
2166                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2167 
2168                             if (c->devices[parity2]->devobj) {
2169                                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2170                                             &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2171                                             Vcb->superblock.node_size);
2172 
2173                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2174 
2175                                 while (stripe != parity2) {
2176                                     galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.node_size);
2177 
2178                                     do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2179                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2180                                         Vcb->superblock.node_size);
2181 
2182                                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2183                                 }
2184 
2185                                 context->stripes[parity2].rewrite = true;
2186 
2187                                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2188                                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2189                             }
2190                         }
2191                     }
2192 
2193                     context->stripes[bad_stripe1].rewrite = true;
2194 
2195                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2196 
2197                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2198                 } else
2199                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2200             } else {
2201                 uint8_t hash1[MAX_HASH_SIZE];
2202                 uint8_t hash2[MAX_HASH_SIZE];
2203 
2204                 if (c->devices[parity1]->devobj)
2205                     get_sector_csum(Vcb, &context->parity_scratch[i * Vcb->superblock.sector_size], hash1);
2206 
2207                 if (c->devices[parity2]->devobj)
2208                     get_sector_csum(Vcb, scratch, hash2);
2209 
2210                 if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) ||
2211                     (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size)) {
2212                     if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
2213                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2214                                       scratch, Vcb->superblock.sector_size);
2215 
2216                         if (c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
2217                             // fix parity 1
2218 
2219                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2220 
2221                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2222                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2223                                         Vcb->superblock.sector_size);
2224 
2225                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2226 
2227                             while (stripe != parity1) {
2228                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2229                                     &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2230                                     Vcb->superblock.sector_size);
2231 
2232                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2233                             }
2234 
2235                             context->stripes[parity1].rewrite = true;
2236 
2237                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2238                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2239                         }
2240                     } else {
2241                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2242                                       &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2243 
2244                         if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
2245                             // fix parity 2
2246                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2247 
2248                             RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2249                                         &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2250                                         Vcb->superblock.sector_size);
2251 
2252                             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2253 
2254                             while (stripe != parity2) {
2255                                 galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], Vcb->superblock.sector_size);
2256 
2257                                 do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2258                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2259                                        Vcb->superblock.sector_size);
2260 
2261                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2262                             }
2263 
2264                             context->stripes[parity2].rewrite = true;
2265 
2266                             log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2267                             log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2268                         }
2269                     }
2270 
2271                     context->stripes[bad_stripe1].rewrite = true;
2272 
2273                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2274                 } else
2275                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2276             }
2277 
2278             ExFreePool(scratch);
2279         } else if (num_errors == 2 && missing_devices == 0) {
2280             uint16_t x, y, k;
2281             uint64_t addr;
2282             uint32_t len = (RtlCheckBit(&context->is_tree, bad_off1) || RtlCheckBit(&context->is_tree, bad_off2)) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2283             uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
2284             uint32_t j;
2285 
2286             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2287 
2288             // put qxy in parity_scratch
2289             // put pxy in parity_scratch2
2290 
2291             k = c->chunk_item->num_stripes - 3;
2292             if (stripe == bad_stripe1 || stripe == bad_stripe2) {
2293                 RtlZeroMemory(&context->parity_scratch[i * Vcb->superblock.sector_size], len);
2294                 RtlZeroMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size], len);
2295 
2296                 if (stripe == bad_stripe1)
2297                     x = k;
2298                 else
2299                     y = k;
2300             } else {
2301                 RtlCopyMemory(&context->parity_scratch[i * Vcb->superblock.sector_size],
2302                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2303                 RtlCopyMemory(&context->parity_scratch2[i * Vcb->superblock.sector_size],
2304                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2305             }
2306 
2307             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2308 
2309             k--;
2310             do {
2311                 galois_double(&context->parity_scratch[i * Vcb->superblock.sector_size], len);
2312 
2313                 if (stripe != bad_stripe1 && stripe != bad_stripe2) {
2314                     do_xor(&context->parity_scratch[i * Vcb->superblock.sector_size],
2315                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2316                     do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size],
2317                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2318                 } else if (stripe == bad_stripe1)
2319                     x = k;
2320                 else if (stripe == bad_stripe2)
2321                     y = k;
2322 
2323                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2324                 k--;
2325             } while (stripe != parity2);
2326 
2327             gyx = gpow2(y > x ? (y-x) : (255-x+y));
2328             gx = gpow2(255-x);
2329 
2330             denom = gdiv(1, gyx ^ 1);
2331             a = gmul(gyx, denom);
2332             b = gmul(gx, denom);
2333 
2334             p = &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)];
2335             q = &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)];
2336             pxy = &context->parity_scratch2[i * Vcb->superblock.sector_size];
2337             qxy = &context->parity_scratch[i * Vcb->superblock.sector_size];
2338 
2339             for (j = 0; j < len; j++) {
2340                 *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);
2341 
2342                 p++;
2343                 q++;
2344                 pxy++;
2345                 qxy++;
2346             }
2347 
2348             do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->parity_scratch[i * Vcb->superblock.sector_size], len);
2349             do_xor(&context->parity_scratch2[i * Vcb->superblock.sector_size], &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)], len);
2350 
2351             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 * Vcb->superblock.sector_size);
2352 
2353             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2354                 tree_header* th = (tree_header*)&context->parity_scratch[i * Vcb->superblock.sector_size];
2355 
2356                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
2357                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2358                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2359 
2360                     context->stripes[bad_stripe1].rewrite = true;
2361 
2362                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2363 
2364                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2365                 } else
2366                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2367             } else {
2368                 if (check_sector_csum(Vcb, &context->parity_scratch[i * Vcb->superblock.sector_size], (uint8_t*)context->csum + (Vcb->csum_size * bad_off1))) {
2369                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2370                                   &context->parity_scratch[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2371 
2372                     context->stripes[bad_stripe1].rewrite = true;
2373 
2374                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2375                 } else
2376                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2377             }
2378 
2379             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off2 * Vcb->superblock.sector_size);
2380 
2381             if (RtlCheckBit(&context->is_tree, bad_off2)) {
2382                 tree_header* th = (tree_header*)&context->parity_scratch2[i * Vcb->superblock.sector_size];
2383 
2384                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
2385                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2386                                   &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.node_size);
2387 
2388                     context->stripes[bad_stripe2].rewrite = true;
2389 
2390                     RtlClearBits(&context->stripes[bad_stripe2].error, i + 1, (Vcb->superblock.node_size / Vcb->superblock.sector_size) - 1);
2391 
2392                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, true, false);
2393                 } else
2394                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, false, false);
2395             } else {
2396                 if (check_sector_csum(Vcb, &context->parity_scratch2[i * Vcb->superblock.sector_size], (uint8_t*)context->csum + (Vcb->csum_size * bad_off2))) {
2397                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i * Vcb->superblock.sector_size)],
2398                                   &context->parity_scratch2[i * Vcb->superblock.sector_size], Vcb->superblock.sector_size);
2399 
2400                     context->stripes[bad_stripe2].rewrite = true;
2401 
2402                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, true, false);
2403                 } else
2404                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, false, false);
2405             }
2406         } else {
2407             stripe = (parity2 + 1) % c->chunk_item->num_stripes;
2408             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2409 
2410             while (stripe != parity1) {
2411                 if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
2412                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
2413                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off * Vcb->superblock.sector_size);
2414 
2415                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
2416                     }
2417                 }
2418 
2419                 off += sectors_per_stripe;
2420                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2421             }
2422         }
2423     }
2424 }
2425 
2426 static NTSTATUS scrub_chunk_raid56_stripe_run(device_extension* Vcb, chunk* c, uint64_t stripe_start, uint64_t stripe_end) {
2427     NTSTATUS Status;
2428     KEY searchkey;
2429     traverse_ptr tp;
2430     bool b;
2431     uint64_t run_start, run_end, full_stripe_len, stripe;
2432     uint32_t max_read, num_sectors;
2433     ULONG arrlen, *allocarr, *csumarr = NULL, *treearr, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2434     scrub_context_raid56 context;
2435     uint16_t i;
2436     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2437 
2438     TRACE("(%p, %p, %I64x, %I64x)\n", Vcb, c, stripe_start, stripe_end);
2439 
2440     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2441     run_start = c->offset + (stripe_start * full_stripe_len);
2442     run_end = c->offset + ((stripe_end + 1) * full_stripe_len);
2443 
2444     searchkey.obj_id = run_start;
2445     searchkey.obj_type = TYPE_METADATA_ITEM;
2446     searchkey.offset = 0xffffffffffffffff;
2447 
2448     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2449     if (!NT_SUCCESS(Status)) {
2450         ERR("find_item returned %08lx\n", Status);
2451         return Status;
2452     }
2453 
2454     num_sectors = (uint32_t)((stripe_end - stripe_start + 1) * full_stripe_len / Vcb->superblock.sector_size);
2455     arrlen = (ULONG)sector_align((num_sectors / 8) + 1, sizeof(ULONG));
2456 
2457     allocarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2458     if (!allocarr) {
2459         ERR("out of memory\n");
2460         return STATUS_INSUFFICIENT_RESOURCES;
2461     }
2462 
2463     treearr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2464     if (!treearr) {
2465         ERR("out of memory\n");
2466         ExFreePool(allocarr);
2467         return STATUS_INSUFFICIENT_RESOURCES;
2468     }
2469 
2470     RtlInitializeBitMap(&context.alloc, allocarr, num_sectors);
2471     RtlClearAllBits(&context.alloc);
2472 
2473     RtlInitializeBitMap(&context.is_tree, treearr, num_sectors);
2474     RtlClearAllBits(&context.is_tree);
2475 
2476     context.parity_scratch = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2477     if (!context.parity_scratch) {
2478         ERR("out of memory\n");
2479         ExFreePool(allocarr);
2480         ExFreePool(treearr);
2481         return STATUS_INSUFFICIENT_RESOURCES;
2482     }
2483 
2484     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2485         csumarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2486         if (!csumarr) {
2487             ERR("out of memory\n");
2488             ExFreePool(allocarr);
2489             ExFreePool(treearr);
2490             ExFreePool(context.parity_scratch);
2491             return STATUS_INSUFFICIENT_RESOURCES;
2492         }
2493 
2494         RtlInitializeBitMap(&context.has_csum, csumarr, num_sectors);
2495         RtlClearAllBits(&context.has_csum);
2496 
2497         context.csum = ExAllocatePoolWithTag(PagedPool, num_sectors * Vcb->csum_size, ALLOC_TAG);
2498         if (!context.csum) {
2499             ERR("out of memory\n");
2500             ExFreePool(allocarr);
2501             ExFreePool(treearr);
2502             ExFreePool(context.parity_scratch);
2503             ExFreePool(csumarr);
2504             return STATUS_INSUFFICIENT_RESOURCES;
2505         }
2506     }
2507 
2508     if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2509         context.parity_scratch2 = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2510         if (!context.parity_scratch2) {
2511             ERR("out of memory\n");
2512             ExFreePool(allocarr);
2513             ExFreePool(treearr);
2514             ExFreePool(context.parity_scratch);
2515 
2516             if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2517                 ExFreePool(csumarr);
2518                 ExFreePool(context.csum);
2519             }
2520 
2521             return STATUS_INSUFFICIENT_RESOURCES;
2522         }
2523     }
2524 
2525     do {
2526         traverse_ptr next_tp;
2527 
2528         if (tp.item->key.obj_id >= run_end)
2529             break;
2530 
2531         if (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM) {
2532             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2533 
2534             if (tp.item->key.obj_id + size > run_start) {
2535                 uint64_t extent_start = max(run_start, tp.item->key.obj_id);
2536                 uint64_t extent_end = min(tp.item->key.obj_id + size, run_end);
2537                 bool extent_is_tree = false;
2538 
2539                 RtlSetBits(&context.alloc, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size));
2540 
2541                 if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2542                     extent_is_tree = true;
2543                 else {
2544                     EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2545 
2546                     if (tp.item->size < sizeof(EXTENT_ITEM)) {
2547                         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2548                         Status = STATUS_INTERNAL_ERROR;
2549                         goto end;
2550                     }
2551 
2552                     if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2553                         extent_is_tree = true;
2554                 }
2555 
2556                 if (extent_is_tree)
2557                     RtlSetBits(&context.is_tree, (ULONG)((extent_start - run_start) / Vcb->superblock.sector_size), (ULONG)((extent_end - extent_start) / Vcb->superblock.sector_size));
2558                 else if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2559                     traverse_ptr tp2;
2560                     bool b2;
2561 
2562                     searchkey.obj_id = EXTENT_CSUM_ID;
2563                     searchkey.obj_type = TYPE_EXTENT_CSUM;
2564                     searchkey.offset = extent_start;
2565 
2566                     Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
2567                     if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
2568                         ERR("find_item returned %08lx\n", Status);
2569                         goto end;
2570                     }
2571 
2572                     do {
2573                         traverse_ptr next_tp2;
2574 
2575                         if (tp2.item->key.offset >= extent_end)
2576                             break;
2577 
2578                         if (tp2.item->key.offset >= extent_start) {
2579                             uint64_t csum_start = max(extent_start, tp2.item->key.offset);
2580                             uint64_t csum_end = min(extent_end, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / Vcb->csum_size));
2581 
2582                             RtlSetBits(&context.has_csum, (ULONG)((csum_start - run_start) / Vcb->superblock.sector_size), (ULONG)((csum_end - csum_start) / Vcb->superblock.sector_size));
2583 
2584                             RtlCopyMemory((uint8_t*)context.csum + ((csum_start - run_start) * Vcb->csum_size / Vcb->superblock.sector_size),
2585                                           tp2.item->data + ((csum_start - tp2.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size),
2586                                           (ULONG)((csum_end - csum_start) * Vcb->csum_size / Vcb->superblock.sector_size));
2587                         }
2588 
2589                         b2 = find_next_item(Vcb, &tp2, &next_tp2, false, NULL);
2590 
2591                         if (b2)
2592                             tp2 = next_tp2;
2593                     } while (b2);
2594                 }
2595             }
2596         }
2597 
2598         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2599 
2600         if (b)
2601             tp = next_tp;
2602     } while (b);
2603 
2604     context.stripes = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_context_raid56_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
2605     if (!context.stripes) {
2606         ERR("out of memory\n");
2607         Status = STATUS_INSUFFICIENT_RESOURCES;
2608         goto end;
2609     }
2610 
2611     max_read = (uint32_t)min(1048576 / c->chunk_item->stripe_length, stripe_end - stripe_start + 1); // only process 1 MB of data at a time
2612 
2613     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2614         context.stripes[i].buf = ExAllocatePoolWithTag(PagedPool, (ULONG)(max_read * c->chunk_item->stripe_length), ALLOC_TAG);
2615         if (!context.stripes[i].buf) {
2616             uint64_t j;
2617 
2618             ERR("out of memory\n");
2619 
2620             for (j = 0; j < i; j++) {
2621                 ExFreePool(context.stripes[j].buf);
2622             }
2623             ExFreePool(context.stripes);
2624 
2625             Status = STATUS_INSUFFICIENT_RESOURCES;
2626             goto end;
2627         }
2628 
2629         context.stripes[i].errorarr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(((c->chunk_item->stripe_length / Vcb->superblock.sector_size) / 8) + 1, sizeof(ULONG)), ALLOC_TAG);
2630         if (!context.stripes[i].errorarr) {
2631             uint64_t j;
2632 
2633             ERR("out of memory\n");
2634 
2635             ExFreePool(context.stripes[i].buf);
2636 
2637             for (j = 0; j < i; j++) {
2638                 ExFreePool(context.stripes[j].buf);
2639             }
2640             ExFreePool(context.stripes);
2641 
2642             Status = STATUS_INSUFFICIENT_RESOURCES;
2643             goto end;
2644         }
2645 
2646         RtlInitializeBitMap(&context.stripes[i].error, context.stripes[i].errorarr, (ULONG)(c->chunk_item->stripe_length / Vcb->superblock.sector_size));
2647 
2648         context.stripes[i].context = &context;
2649         context.stripes[i].rewrite = false;
2650     }
2651 
2652     stripe = stripe_start;
2653 
2654     Status = STATUS_SUCCESS;
2655 
2656     chunk_lock_range(Vcb, c, run_start, run_end - run_start);
2657 
2658     do {
2659         ULONG read_stripes;
2660         uint16_t missing_devices = 0;
2661         bool need_wait = false;
2662 
2663         if (max_read < stripe_end + 1 - stripe)
2664             read_stripes = max_read;
2665         else
2666             read_stripes = (ULONG)(stripe_end + 1 - stripe);
2667 
2668         context.stripes_left = c->chunk_item->num_stripes;
2669 
2670         // read megabyte by megabyte
2671         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2672             if (c->devices[i]->devobj) {
2673                 PIO_STACK_LOCATION IrpSp;
2674 
2675                 context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
2676 
2677                 if (!context.stripes[i].Irp) {
2678                     ERR("IoAllocateIrp failed\n");
2679                     Status = STATUS_INSUFFICIENT_RESOURCES;
2680                     goto end3;
2681                 }
2682 
2683                 context.stripes[i].Irp->MdlAddress = NULL;
2684 
2685                 IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2686                 IrpSp->MajorFunction = IRP_MJ_READ;
2687                 IrpSp->FileObject = c->devices[i]->fileobj;
2688 
2689                 if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2690                     context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(read_stripes * c->chunk_item->stripe_length), ALLOC_TAG);
2691                     if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2692                         ERR("out of memory\n");
2693                         Status = STATUS_INSUFFICIENT_RESOURCES;
2694                         goto end3;
2695                     }
2696 
2697                     context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2698 
2699                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2700                 } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
2701                     context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, (ULONG)(read_stripes * c->chunk_item->stripe_length), false, false, NULL);
2702                     if (!context.stripes[i].Irp->MdlAddress) {
2703                         ERR("IoAllocateMdl failed\n");
2704                         Status = STATUS_INSUFFICIENT_RESOURCES;
2705                         goto end3;
2706                     }
2707 
2708                     Status = STATUS_SUCCESS;
2709 
2710                     _SEH2_TRY {
2711                         MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
2712                     } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2713                         Status = _SEH2_GetExceptionCode();
2714                     } _SEH2_END;
2715 
2716                     if (!NT_SUCCESS(Status)) {
2717                         ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2718                         IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2719                         goto end3;
2720                     }
2721                 } else
2722                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2723 
2724                 context.stripes[i].offset = stripe * c->chunk_item->stripe_length;
2725 
2726                 IrpSp->Parameters.Read.Length = (ULONG)(read_stripes * c->chunk_item->stripe_length);
2727                 IrpSp->Parameters.Read.ByteOffset.QuadPart = cis[i].offset + context.stripes[i].offset;
2728 
2729                 context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2730                 context.stripes[i].missing = false;
2731 
2732                 IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion_raid56, &context.stripes[i], true, true, true);
2733 
2734                 Vcb->scrub.data_scrubbed += read_stripes * c->chunk_item->stripe_length;
2735                 need_wait = true;
2736             } else {
2737                 context.stripes[i].Irp = NULL;
2738                 context.stripes[i].missing = true;
2739                 missing_devices++;
2740                 InterlockedDecrement(&context.stripes_left);
2741             }
2742         }
2743 
2744         if (c->chunk_item->type & BLOCK_FLAG_RAID5 && missing_devices > 1) {
2745             ERR("too many missing devices (%u, maximum 1)\n", missing_devices);
2746             Status = STATUS_UNEXPECTED_IO_ERROR;
2747             goto end3;
2748         } else if (c->chunk_item->type & BLOCK_FLAG_RAID6 && missing_devices > 2) {
2749             ERR("too many missing devices (%u, maximum 2)\n", missing_devices);
2750             Status = STATUS_UNEXPECTED_IO_ERROR;
2751             goto end3;
2752         }
2753 
2754         if (need_wait) {
2755             KeInitializeEvent(&context.Event, NotificationEvent, false);
2756 
2757             for (i = 0; i < c->chunk_item->num_stripes; i++) {
2758                 if (c->devices[i]->devobj)
2759                     IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
2760             }
2761 
2762             KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2763         }
2764 
2765         // return an error if any of the stripes returned an error
2766         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2767             if (!context.stripes[i].missing && !NT_SUCCESS(context.stripes[i].iosb.Status)) {
2768                 Status = context.stripes[i].iosb.Status;
2769                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
2770                 goto end3;
2771             }
2772         }
2773 
2774         if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2775             for (i = 0; i < read_stripes; i++) {
2776                 scrub_raid6_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2777             }
2778         } else {
2779             for (i = 0; i < read_stripes; i++) {
2780                 scrub_raid5_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2781             }
2782         }
2783         stripe += read_stripes;
2784 
2785 end3:
2786         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2787             if (context.stripes[i].Irp) {
2788                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
2789                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
2790                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2791                 }
2792                 IoFreeIrp(context.stripes[i].Irp);
2793                 context.stripes[i].Irp = NULL;
2794 
2795                 if (context.stripes[i].rewrite) {
2796                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + context.stripes[i].offset,
2797                                              context.stripes[i].buf, (uint32_t)(read_stripes * c->chunk_item->stripe_length));
2798 
2799                     if (!NT_SUCCESS(Status)) {
2800                         ERR("write_data_phys returned %08lx\n", Status);
2801                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
2802                         goto end2;
2803                     }
2804                 }
2805             }
2806         }
2807 
2808         if (!NT_SUCCESS(Status))
2809             break;
2810     } while (stripe < stripe_end);
2811 
2812 end2:
2813     chunk_unlock_range(Vcb, c, run_start, run_end - run_start);
2814 
2815     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2816         ExFreePool(context.stripes[i].buf);
2817         ExFreePool(context.stripes[i].errorarr);
2818     }
2819     ExFreePool(context.stripes);
2820 
2821 end:
2822     ExFreePool(treearr);
2823     ExFreePool(allocarr);
2824     ExFreePool(context.parity_scratch);
2825 
2826     if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2827         ExFreePool(context.parity_scratch2);
2828 
2829     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2830         ExFreePool(csumarr);
2831         ExFreePool(context.csum);
2832     }
2833 
2834     return Status;
2835 }
2836 
2837 static NTSTATUS scrub_chunk_raid56(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2838     NTSTATUS Status;
2839     KEY searchkey;
2840     traverse_ptr tp;
2841     bool b;
2842     uint64_t full_stripe_len, stripe, stripe_start, stripe_end, total_data = 0;
2843     ULONG num_extents = 0, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2844 
2845     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2846     stripe = (*offset - c->offset) / full_stripe_len;
2847 
2848     *offset = c->offset + (stripe * full_stripe_len);
2849 
2850     searchkey.obj_id = *offset;
2851     searchkey.obj_type = TYPE_METADATA_ITEM;
2852     searchkey.offset = 0xffffffffffffffff;
2853 
2854     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2855     if (!NT_SUCCESS(Status)) {
2856         ERR("find_item returned %08lx\n", Status);
2857         return Status;
2858     }
2859 
2860     *changed = false;
2861 
2862     do {
2863         traverse_ptr next_tp;
2864 
2865         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2866             break;
2867 
2868         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2869             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2870 
2871             TRACE("%I64x\n", tp.item->key.obj_id);
2872 
2873             if (size < Vcb->superblock.sector_size) {
2874                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
2875                 return STATUS_INTERNAL_ERROR;
2876             }
2877 
2878             stripe = (tp.item->key.obj_id - c->offset) / full_stripe_len;
2879 
2880             if (*changed) {
2881                 if (stripe > stripe_end + 1) {
2882                     Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2883                     if (!NT_SUCCESS(Status)) {
2884                         ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2885                         return Status;
2886                     }
2887 
2888                     stripe_start = stripe;
2889                 }
2890             } else
2891                 stripe_start = stripe;
2892 
2893             stripe_end = (tp.item->key.obj_id + size - 1 - c->offset) / full_stripe_len;
2894 
2895             *changed = true;
2896 
2897             total_data += size;
2898             num_extents++;
2899 
2900             // only do so much at a time
2901             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
2902                 break;
2903         }
2904 
2905         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2906 
2907         if (b)
2908             tp = next_tp;
2909     } while (b);
2910 
2911     if (*changed) {
2912         Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2913         if (!NT_SUCCESS(Status)) {
2914             ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2915             return Status;
2916         }
2917 
2918         *offset = c->offset + ((stripe_end + 1) * full_stripe_len);
2919     }
2920 
2921     return STATUS_SUCCESS;
2922 }
2923 
2924 static NTSTATUS scrub_chunk(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2925     NTSTATUS Status;
2926     KEY searchkey;
2927     traverse_ptr tp;
2928     bool b = false, tree_run = false;
2929     ULONG type, num_extents = 0;
2930     uint64_t total_data = 0, tree_run_start, tree_run_end;
2931 
2932     TRACE("chunk %I64x\n", c->offset);
2933 
2934     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2935 
2936     if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2937         type = BLOCK_FLAG_DUPLICATE;
2938     else if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2939         type = BLOCK_FLAG_RAID0;
2940     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2941         type = BLOCK_FLAG_DUPLICATE;
2942     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2943         type = BLOCK_FLAG_RAID10;
2944     else if (c->chunk_item->type & BLOCK_FLAG_RAID5) {
2945         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2946         goto end;
2947     } else if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2948         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2949         goto end;
2950     } else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2951         type = BLOCK_FLAG_DUPLICATE;
2952     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2953         type = BLOCK_FLAG_DUPLICATE;
2954     else // SINGLE
2955         type = BLOCK_FLAG_DUPLICATE;
2956 
2957     searchkey.obj_id = *offset;
2958     searchkey.obj_type = TYPE_METADATA_ITEM;
2959     searchkey.offset = 0xffffffffffffffff;
2960 
2961     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2962     if (!NT_SUCCESS(Status)) {
2963         ERR("error - find_item returned %08lx\n", Status);
2964         goto end;
2965     }
2966 
2967     do {
2968         traverse_ptr next_tp;
2969 
2970         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2971             break;
2972 
2973         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2974             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2975             bool is_tree;
2976             void* csum = NULL;
2977             RTL_BITMAP bmp;
2978             ULONG* bmparr = NULL, bmplen;
2979 
2980             TRACE("%I64x\n", tp.item->key.obj_id);
2981 
2982             is_tree = false;
2983 
2984             if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2985                 is_tree = true;
2986             else {
2987                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2988 
2989                 if (tp.item->size < sizeof(EXTENT_ITEM)) {
2990                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2991                     Status = STATUS_INTERNAL_ERROR;
2992                     goto end;
2993                 }
2994 
2995                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2996                     is_tree = true;
2997             }
2998 
2999             if (size < Vcb->superblock.sector_size) {
3000                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
3001                 Status = STATUS_INTERNAL_ERROR;
3002                 goto end;
3003             }
3004 
3005             // load csum
3006             if (!is_tree) {
3007                 traverse_ptr tp2;
3008 
3009                 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)(Vcb->csum_size * size / Vcb->superblock.sector_size), ALLOC_TAG);
3010                 if (!csum) {
3011                     ERR("out of memory\n");
3012                     Status = STATUS_INSUFFICIENT_RESOURCES;
3013                     goto end;
3014                 }
3015 
3016                 bmplen = (ULONG)(size / Vcb->superblock.sector_size);
3017 
3018                 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)(sector_align((bmplen >> 3) + 1, sizeof(ULONG))), ALLOC_TAG);
3019                 if (!bmparr) {
3020                     ERR("out of memory\n");
3021                     ExFreePool(csum);
3022                     Status = STATUS_INSUFFICIENT_RESOURCES;
3023                     goto end;
3024                 }
3025 
3026                 RtlInitializeBitMap(&bmp, bmparr, bmplen);
3027                 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
3028 
3029                 searchkey.obj_id = EXTENT_CSUM_ID;
3030                 searchkey.obj_type = TYPE_EXTENT_CSUM;
3031                 searchkey.offset = tp.item->key.obj_id;
3032 
3033                 Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
3034                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
3035                     ERR("find_item returned %08lx\n", Status);
3036                     ExFreePool(csum);
3037                     ExFreePool(bmparr);
3038                     goto end;
3039                 }
3040 
3041                 if (Status != STATUS_NOT_FOUND) {
3042                     do {
3043                         traverse_ptr next_tp2;
3044 
3045                         if (tp2.item->key.obj_type == TYPE_EXTENT_CSUM) {
3046                             if (tp2.item->key.offset >= tp.item->key.obj_id + size)
3047                                 break;
3048                             else if (tp2.item->size >= Vcb->csum_size && tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / Vcb->csum_size) >= tp.item->key.obj_id) {
3049                                 uint64_t cs = max(tp.item->key.obj_id, tp2.item->key.offset);
3050                                 uint64_t ce = min(tp.item->key.obj_id + size, tp2.item->key.offset + (tp2.item->size * Vcb->superblock.sector_size / Vcb->csum_size));
3051 
3052                                 RtlCopyMemory((uint8_t*)csum + ((cs - tp.item->key.obj_id) * Vcb->csum_size / Vcb->superblock.sector_size),
3053                                               tp2.item->data + ((cs - tp2.item->key.offset) * Vcb->csum_size / Vcb->superblock.sector_size),
3054                                               (ULONG)((ce - cs) * Vcb->csum_size / Vcb->superblock.sector_size));
3055 
3056                                 RtlClearBits(&bmp, (ULONG)((cs - tp.item->key.obj_id) / Vcb->superblock.sector_size), (ULONG)((ce - cs) / Vcb->superblock.sector_size));
3057 
3058                                 if (ce == tp.item->key.obj_id + size)
3059                                     break;
3060                             }
3061                         }
3062 
3063                         if (find_next_item(Vcb, &tp2, &next_tp2, false, NULL))
3064                             tp2 = next_tp2;
3065                         else
3066                             break;
3067                     } while (true);
3068                 }
3069             }
3070 
3071             if (tree_run) {
3072                 if (!is_tree || tp.item->key.obj_id > tree_run_end) {
3073                     Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3074                     if (!NT_SUCCESS(Status)) {
3075                         ERR("scrub_extent returned %08lx\n", Status);
3076                         goto end;
3077                     }
3078 
3079                     if (!is_tree)
3080                         tree_run = false;
3081                     else {
3082                         tree_run_start = tp.item->key.obj_id;
3083                         tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3084                     }
3085                 } else
3086                     tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3087             } else if (is_tree) {
3088                 tree_run = true;
3089                 tree_run_start = tp.item->key.obj_id;
3090                 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3091             }
3092 
3093             if (!is_tree) {
3094                 Status = scrub_data_extent(Vcb, c, tp.item->key.obj_id, type, csum, &bmp, bmplen);
3095                 if (!NT_SUCCESS(Status)) {
3096                     ERR("scrub_data_extent returned %08lx\n", Status);
3097                     ExFreePool(csum);
3098                     ExFreePool(bmparr);
3099                     goto end;
3100                 }
3101 
3102                 ExFreePool(csum);
3103                 ExFreePool(bmparr);
3104             }
3105 
3106             *offset = tp.item->key.obj_id + size;
3107             *changed = true;
3108 
3109             total_data += size;
3110             num_extents++;
3111 
3112             // only do so much at a time
3113             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
3114                 break;
3115         }
3116 
3117         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
3118 
3119         if (b)
3120             tp = next_tp;
3121     } while (b);
3122 
3123     if (tree_run) {
3124         Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3125         if (!NT_SUCCESS(Status)) {
3126             ERR("scrub_extent returned %08lx\n", Status);
3127             goto end;
3128         }
3129     }
3130 
3131     Status = STATUS_SUCCESS;
3132 
3133 end:
3134     ExReleaseResourceLite(&Vcb->tree_lock);
3135 
3136     return Status;
3137 }
3138 
3139 _Function_class_(KSTART_ROUTINE)
3140 static void __stdcall scrub_thread(void* context) {
3141     device_extension* Vcb = context;
3142     LIST_ENTRY chunks, *le;
3143     NTSTATUS Status;
3144     LARGE_INTEGER time;
3145 
3146     KeInitializeEvent(&Vcb->scrub.finished, NotificationEvent, false);
3147 
3148     InitializeListHead(&chunks);
3149 
3150     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3151 
3152     if (Vcb->need_write && !Vcb->readonly)
3153         Status = do_write(Vcb, NULL);
3154     else
3155         Status = STATUS_SUCCESS;
3156 
3157     free_trees(Vcb);
3158 
3159     if (!NT_SUCCESS(Status)) {
3160         ExReleaseResourceLite(&Vcb->tree_lock);
3161         ERR("do_write returned %08lx\n", Status);
3162         Vcb->scrub.error = Status;
3163         goto end;
3164     }
3165 
3166     ExConvertExclusiveToSharedLite(&Vcb->tree_lock);
3167 
3168     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3169 
3170     KeQuerySystemTime(&Vcb->scrub.start_time);
3171     Vcb->scrub.finish_time.QuadPart = 0;
3172     Vcb->scrub.resume_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3173     Vcb->scrub.duration.QuadPart = 0;
3174     Vcb->scrub.total_chunks = 0;
3175     Vcb->scrub.chunks_left = 0;
3176     Vcb->scrub.data_scrubbed = 0;
3177     Vcb->scrub.num_errors = 0;
3178 
3179     while (!IsListEmpty(&Vcb->scrub.errors)) {
3180         scrub_error* err = CONTAINING_RECORD(RemoveHeadList(&Vcb->scrub.errors), scrub_error, list_entry);
3181         ExFreePool(err);
3182     }
3183 
3184     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3185 
3186     le = Vcb->chunks.Flink;
3187     while (le != &Vcb->chunks) {
3188         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3189 
3190         acquire_chunk_lock(c, Vcb);
3191 
3192         if (!c->readonly) {
3193             InsertTailList(&chunks, &c->list_entry_balance);
3194             Vcb->scrub.total_chunks++;
3195             Vcb->scrub.chunks_left++;
3196         }
3197 
3198         release_chunk_lock(c, Vcb);
3199 
3200         le = le->Flink;
3201     }
3202 
3203     ExReleaseResourceLite(&Vcb->chunk_lock);
3204 
3205     ExReleaseResource(&Vcb->scrub.stats_lock);
3206 
3207     ExReleaseResourceLite(&Vcb->tree_lock);
3208 
3209     while (!IsListEmpty(&chunks)) {
3210         chunk* c = CONTAINING_RECORD(RemoveHeadList(&chunks), chunk, list_entry_balance);
3211         uint64_t offset = c->offset;
3212         bool changed;
3213 
3214         c->reloc = true;
3215 
3216         KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3217 
3218         if (!Vcb->scrub.stopping) {
3219             do {
3220                 changed = false;
3221 
3222                 Status = scrub_chunk(Vcb, c, &offset, &changed);
3223                 if (!NT_SUCCESS(Status)) {
3224                     ERR("scrub_chunk returned %08lx\n", Status);
3225                     Vcb->scrub.stopping = true;
3226                     Vcb->scrub.error = Status;
3227                     break;
3228                 }
3229 
3230                 if (offset == c->offset + c->chunk_item->size || Vcb->scrub.stopping)
3231                     break;
3232 
3233                 KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3234             } while (changed);
3235         }
3236 
3237         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3238 
3239         if (!Vcb->scrub.stopping)
3240             Vcb->scrub.chunks_left--;
3241 
3242         if (IsListEmpty(&chunks))
3243             KeQuerySystemTime(&Vcb->scrub.finish_time);
3244 
3245         ExReleaseResource(&Vcb->scrub.stats_lock);
3246 
3247         c->reloc = false;
3248         c->list_entry_balance.Flink = NULL;
3249     }
3250 
3251     KeQuerySystemTime(&time);
3252     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3253 
3254 end:
3255     ZwClose(Vcb->scrub.thread);
3256     Vcb->scrub.thread = NULL;
3257 
3258     KeSetEvent(&Vcb->scrub.finished, 0, false);
3259 }
3260 
3261 NTSTATUS start_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3262     NTSTATUS Status;
3263     OBJECT_ATTRIBUTES oa;
3264 
3265     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3266         return STATUS_PRIVILEGE_NOT_HELD;
3267 
3268     if (Vcb->locked) {
3269         WARN("cannot start scrub while locked\n");
3270         return STATUS_DEVICE_NOT_READY;
3271     }
3272 
3273     if (Vcb->balance.thread) {
3274         WARN("cannot start scrub while balance running\n");
3275         return STATUS_DEVICE_NOT_READY;
3276     }
3277 
3278     if (Vcb->scrub.thread) {
3279         WARN("scrub already running\n");
3280         return STATUS_DEVICE_NOT_READY;
3281     }
3282 
3283     if (Vcb->readonly)
3284         return STATUS_MEDIA_WRITE_PROTECTED;
3285 
3286     Vcb->scrub.stopping = false;
3287     Vcb->scrub.paused = false;
3288     Vcb->scrub.error = STATUS_SUCCESS;
3289     KeInitializeEvent(&Vcb->scrub.event, NotificationEvent, !Vcb->scrub.paused);
3290 
3291     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3292 
3293     Status = PsCreateSystemThread(&Vcb->scrub.thread, 0, &oa, NULL, NULL, scrub_thread, Vcb);
3294     if (!NT_SUCCESS(Status)) {
3295         ERR("PsCreateSystemThread returned %08lx\n", Status);
3296         return Status;
3297     }
3298 
3299     return STATUS_SUCCESS;
3300 }
3301 
3302 NTSTATUS query_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode, void* data, ULONG length) {
3303     btrfs_query_scrub* bqs = (btrfs_query_scrub*)data;
3304     ULONG len;
3305     NTSTATUS Status;
3306     LIST_ENTRY* le;
3307     btrfs_scrub_error* bse = NULL;
3308 
3309     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3310         return STATUS_PRIVILEGE_NOT_HELD;
3311 
3312     if (length < offsetof(btrfs_query_scrub, errors))
3313         return STATUS_BUFFER_TOO_SMALL;
3314 
3315     ExAcquireResourceSharedLite(&Vcb->scrub.stats_lock, true);
3316 
3317     if (Vcb->scrub.thread && Vcb->scrub.chunks_left > 0)
3318         bqs->status = Vcb->scrub.paused ? BTRFS_SCRUB_PAUSED : BTRFS_SCRUB_RUNNING;
3319     else
3320         bqs->status = BTRFS_SCRUB_STOPPED;
3321 
3322     bqs->start_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3323     bqs->finish_time.QuadPart = Vcb->scrub.finish_time.QuadPart;
3324     bqs->chunks_left = Vcb->scrub.chunks_left;
3325     bqs->total_chunks = Vcb->scrub.total_chunks;
3326     bqs->data_scrubbed = Vcb->scrub.data_scrubbed;
3327 
3328     bqs->duration = Vcb->scrub.duration.QuadPart;
3329 
3330     if (bqs->status == BTRFS_SCRUB_RUNNING) {
3331         LARGE_INTEGER time;
3332 
3333         KeQuerySystemTime(&time);
3334         bqs->duration += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3335     }
3336 
3337     bqs->error = Vcb->scrub.error;
3338 
3339     bqs->num_errors = Vcb->scrub.num_errors;
3340 
3341     len = length - offsetof(btrfs_query_scrub, errors);
3342 
3343     le = Vcb->scrub.errors.Flink;
3344     while (le != &Vcb->scrub.errors) {
3345         scrub_error* err = CONTAINING_RECORD(le, scrub_error, list_entry);
3346         ULONG errlen;
3347 
3348         if (err->is_metadata)
3349             errlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3350         else
3351             errlen = offsetof(btrfs_scrub_error, data.filename) + err->data.filename_length;
3352 
3353         if (len < errlen) {
3354             Status = STATUS_BUFFER_OVERFLOW;
3355             goto end;
3356         }
3357 
3358         if (!bse)
3359             bse = &bqs->errors;
3360         else {
3361             ULONG lastlen;
3362 
3363             if (bse->is_metadata)
3364                 lastlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3365             else
3366                 lastlen = offsetof(btrfs_scrub_error, data.filename) + bse->data.filename_length;
3367 
3368             bse->next_entry = lastlen;
3369             bse = (btrfs_scrub_error*)(((uint8_t*)bse) + lastlen);
3370         }
3371 
3372         bse->next_entry = 0;
3373         bse->address = err->address;
3374         bse->device = err->device;
3375         bse->recovered = err->recovered;
3376         bse->is_metadata = err->is_metadata;
3377         bse->parity = err->parity;
3378 
3379         if (err->is_metadata) {
3380             bse->metadata.root = err->metadata.root;
3381             bse->metadata.level = err->metadata.level;
3382             bse->metadata.firstitem = err->metadata.firstitem;
3383         } else {
3384             bse->data.subvol = err->data.subvol;
3385             bse->data.offset = err->data.offset;
3386             bse->data.filename_length = err->data.filename_length;
3387             RtlCopyMemory(bse->data.filename, err->data.filename, err->data.filename_length);
3388         }
3389 
3390         len -= errlen;
3391         le = le->Flink;
3392     }
3393 
3394     Status = STATUS_SUCCESS;
3395 
3396 end:
3397     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
3398 
3399     return Status;
3400 }
3401 
3402 NTSTATUS pause_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3403     LARGE_INTEGER time;
3404 
3405     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3406         return STATUS_PRIVILEGE_NOT_HELD;
3407 
3408     if (!Vcb->scrub.thread)
3409         return STATUS_DEVICE_NOT_READY;
3410 
3411     if (Vcb->scrub.paused)
3412         return STATUS_DEVICE_NOT_READY;
3413 
3414     Vcb->scrub.paused = true;
3415     KeClearEvent(&Vcb->scrub.event);
3416 
3417     KeQuerySystemTime(&time);
3418     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3419 
3420     return STATUS_SUCCESS;
3421 }
3422 
3423 NTSTATUS resume_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3424     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3425         return STATUS_PRIVILEGE_NOT_HELD;
3426 
3427     if (!Vcb->scrub.thread)
3428         return STATUS_DEVICE_NOT_READY;
3429 
3430     if (!Vcb->scrub.paused)
3431         return STATUS_DEVICE_NOT_READY;
3432 
3433     Vcb->scrub.paused = false;
3434     KeSetEvent(&Vcb->scrub.event, 0, false);
3435 
3436     KeQuerySystemTime(&Vcb->scrub.resume_time);
3437 
3438     return STATUS_SUCCESS;
3439 }
3440 
3441 NTSTATUS stop_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3442     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3443         return STATUS_PRIVILEGE_NOT_HELD;
3444 
3445     if (!Vcb->scrub.thread)
3446         return STATUS_DEVICE_NOT_READY;
3447 
3448     Vcb->scrub.paused = false;
3449     Vcb->scrub.stopping = true;
3450     KeSetEvent(&Vcb->scrub.event, 0, false);
3451 
3452     return STATUS_SUCCESS;
3453 }
3454