xref: /reactos/drivers/filesystems/btrfs/scrub.c (revision d6eebaa4)
1 /* Copyright (c) Mark Harmstone 2017
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 
// Size constant for scrubbing.
// NOTE(review): presumably the amount of data examined per scrub pass - confirm against its users.
#define SCRUB_UNIT 0x100000 // 1 MB

// Forward declaration, so that scrub_context_stripe can point back at its owning context.
struct _scrub_context;
23 
// Per-stripe state for one scrub read. scrub_read_completion receives a pointer
// to one of these as its context argument.
typedef struct {
    struct _scrub_context* context; // owning scrub_context, used by the completion routine
    PIRP Irp;                       // NOTE(review): presumably the IRP issued for this stripe's read - confirm
    uint64_t start;                 // NOTE(review): presumably the start offset of the read - confirm
    uint32_t length;                // number of bytes held in buf
    IO_STATUS_BLOCK iosb;           // copy of Irp->IoStatus, filled in by scrub_read_completion
    uint8_t* buf;                   // data read from this stripe
    bool csum_error;                // set when this stripe's data fails its checksum or comparison
    void* bad_csums;                // checksums calculated from buf, allocated once csum_error is set
} scrub_context_stripe;
34 
// State shared by all the stripe reads of one scrub operation.
typedef struct _scrub_context {
    KEVENT Event;                   // set by scrub_read_completion when stripes_left reaches zero
    scrub_context_stripe* stripes;  // array of per-stripe state, indexed by stripe number
    LONG stripes_left;              // outstanding reads; decremented atomically on each completion
} scrub_context;
40 
// One component of a file path, as gathered by log_file_checksum_error while
// walking from an inode up towards the root.
typedef struct {
    ANSI_STRING name;       // component name; Buffer points into the tree item, it is not a copy
    bool orig_subvol;       // true if this component lies within the subvolume the walk started in
    LIST_ENTRY list_entry;  // link in the list of parts (leaf first, root-most last)
} path_part;
46 
47 static void log_file_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t subvol, uint64_t inode, uint64_t offset) {
48     LIST_ENTRY *le, parts;
49     root* r = NULL;
50     KEY searchkey;
51     traverse_ptr tp;
52     uint64_t dir;
53     bool orig_subvol = true, not_in_tree = false;
54     ANSI_STRING fn;
55     scrub_error* err;
56     NTSTATUS Status;
57     ULONG utf16len;
58 
59     le = Vcb->roots.Flink;
60     while (le != &Vcb->roots) {
61         root* r2 = CONTAINING_RECORD(le, root, list_entry);
62 
63         if (r2->id == subvol) {
64             r = r2;
65             break;
66         }
67 
68         le = le->Flink;
69     }
70 
71     if (!r) {
72         ERR("could not find subvol %I64x\n", subvol);
73         return;
74     }
75 
76     InitializeListHead(&parts);
77 
78     dir = inode;
79 
80     while (true) {
81         if (dir == r->root_item.objid) {
82             if (r == Vcb->root_fileref->fcb->subvol)
83                 break;
84 
85             searchkey.obj_id = r->id;
86             searchkey.obj_type = TYPE_ROOT_BACKREF;
87             searchkey.offset = 0xffffffffffffffff;
88 
89             Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
90             if (!NT_SUCCESS(Status)) {
91                 ERR("find_item returned %08lx\n", Status);
92                 goto end;
93             }
94 
95             if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
96                 ROOT_REF* rr = (ROOT_REF*)tp.item->data;
97                 path_part* pp;
98 
99                 if (tp.item->size < sizeof(ROOT_REF)) {
100                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(ROOT_REF));
101                     goto end;
102                 }
103 
104                 if (tp.item->size < offsetof(ROOT_REF, name[0]) + rr->n) {
105                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
106                         tp.item->size, offsetof(ROOT_REF, name[0]) + rr->n);
107                     goto end;
108                 }
109 
110                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
111                 if (!pp) {
112                     ERR("out of memory\n");
113                     goto end;
114                 }
115 
116                 pp->name.Buffer = rr->name;
117                 pp->name.Length = pp->name.MaximumLength = rr->n;
118                 pp->orig_subvol = false;
119 
120                 InsertTailList(&parts, &pp->list_entry);
121 
122                 r = NULL;
123 
124                 le = Vcb->roots.Flink;
125                 while (le != &Vcb->roots) {
126                     root* r2 = CONTAINING_RECORD(le, root, list_entry);
127 
128                     if (r2->id == tp.item->key.offset) {
129                         r = r2;
130                         break;
131                     }
132 
133                     le = le->Flink;
134                 }
135 
136                 if (!r) {
137                     ERR("could not find subvol %I64x\n", tp.item->key.offset);
138                     goto end;
139                 }
140 
141                 dir = rr->dir;
142                 orig_subvol = false;
143             } else {
144                 not_in_tree = true;
145                 break;
146             }
147         } else {
148             searchkey.obj_id = dir;
149             searchkey.obj_type = TYPE_INODE_EXTREF;
150             searchkey.offset = 0xffffffffffffffff;
151 
152             Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
153             if (!NT_SUCCESS(Status)) {
154                 ERR("find_item returned %08lx\n", Status);
155                 goto end;
156             }
157 
158             if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_REF) {
159                 INODE_REF* ir = (INODE_REF*)tp.item->data;
160                 path_part* pp;
161 
162                 if (tp.item->size < sizeof(INODE_REF)) {
163                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_REF));
164                     goto end;
165                 }
166 
167                 if (tp.item->size < offsetof(INODE_REF, name[0]) + ir->n) {
168                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
169                         tp.item->size, offsetof(INODE_REF, name[0]) + ir->n);
170                     goto end;
171                 }
172 
173                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
174                 if (!pp) {
175                     ERR("out of memory\n");
176                     goto end;
177                 }
178 
179                 pp->name.Buffer = ir->name;
180                 pp->name.Length = pp->name.MaximumLength = ir->n;
181                 pp->orig_subvol = orig_subvol;
182 
183                 InsertTailList(&parts, &pp->list_entry);
184 
185                 if (dir == tp.item->key.offset)
186                     break;
187 
188                 dir = tp.item->key.offset;
189             } else if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_EXTREF) {
190                 INODE_EXTREF* ier = (INODE_EXTREF*)tp.item->data;
191                 path_part* pp;
192 
193                 if (tp.item->size < sizeof(INODE_EXTREF)) {
194                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
195                                                                                   tp.item->size, sizeof(INODE_EXTREF));
196                     goto end;
197                 }
198 
199                 if (tp.item->size < offsetof(INODE_EXTREF, name[0]) + ier->n) {
200                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
201                         tp.item->size, offsetof(INODE_EXTREF, name[0]) + ier->n);
202                     goto end;
203                 }
204 
205                 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
206                 if (!pp) {
207                     ERR("out of memory\n");
208                     goto end;
209                 }
210 
211                 pp->name.Buffer = ier->name;
212                 pp->name.Length = pp->name.MaximumLength = ier->n;
213                 pp->orig_subvol = orig_subvol;
214 
215                 InsertTailList(&parts, &pp->list_entry);
216 
217                 if (dir == ier->dir)
218                     break;
219 
220                 dir = ier->dir;
221             } else {
222                 ERR("could not find INODE_REF for inode %I64x in subvol %I64x\n", dir, r->id);
223                 goto end;
224             }
225         }
226     }
227 
228     fn.MaximumLength = 0;
229 
230     if (not_in_tree) {
231         le = parts.Blink;
232         while (le != &parts) {
233             path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
234             LIST_ENTRY* le2 = le->Blink;
235 
236             if (pp->orig_subvol)
237                 break;
238 
239             RemoveTailList(&parts);
240             ExFreePool(pp);
241 
242             le = le2;
243         }
244     }
245 
246     le = parts.Flink;
247     while (le != &parts) {
248         path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
249 
250         fn.MaximumLength += pp->name.Length + 1;
251 
252         le = le->Flink;
253     }
254 
255     fn.Buffer = ExAllocatePoolWithTag(PagedPool, fn.MaximumLength, ALLOC_TAG);
256     if (!fn.Buffer) {
257         ERR("out of memory\n");
258         goto end;
259     }
260 
261     fn.Length = 0;
262 
263     le = parts.Blink;
264     while (le != &parts) {
265         path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
266 
267         fn.Buffer[fn.Length] = '\\';
268         fn.Length++;
269 
270         RtlCopyMemory(&fn.Buffer[fn.Length], pp->name.Buffer, pp->name.Length);
271         fn.Length += pp->name.Length;
272 
273         le = le->Blink;
274     }
275 
276     if (not_in_tree)
277         ERR("subvol %I64x, %.*s, offset %I64x\n", subvol, fn.Length, fn.Buffer, offset);
278     else
279         ERR("%.*s, offset %I64x\n", fn.Length, fn.Buffer, offset);
280 
281     Status = utf8_to_utf16(NULL, 0, &utf16len, fn.Buffer, fn.Length);
282     if (!NT_SUCCESS(Status)) {
283         ERR("utf8_to_utf16 1 returned %08lx\n", Status);
284         ExFreePool(fn.Buffer);
285         goto end;
286     }
287 
288     err = ExAllocatePoolWithTag(PagedPool, offsetof(scrub_error, data.filename[0]) + utf16len, ALLOC_TAG);
289     if (!err) {
290         ERR("out of memory\n");
291         ExFreePool(fn.Buffer);
292         goto end;
293     }
294 
295     err->address = addr;
296     err->device = devid;
297     err->recovered = false;
298     err->is_metadata = false;
299     err->parity = false;
300 
301     err->data.subvol = not_in_tree ? subvol : 0;
302     err->data.offset = offset;
303     err->data.filename_length = (uint16_t)utf16len;
304 
305     Status = utf8_to_utf16(err->data.filename, utf16len, &utf16len, fn.Buffer, fn.Length);
306     if (!NT_SUCCESS(Status)) {
307         ERR("utf8_to_utf16 2 returned %08lx\n", Status);
308         ExFreePool(fn.Buffer);
309         ExFreePool(err);
310         goto end;
311     }
312 
313     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
314 
315     Vcb->scrub.num_errors++;
316     InsertTailList(&Vcb->scrub.errors, &err->list_entry);
317 
318     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
319 
320     ExFreePool(fn.Buffer);
321 
322 end:
323     while (!IsListEmpty(&parts)) {
324         path_part* pp = CONTAINING_RECORD(RemoveHeadList(&parts), path_part, list_entry);
325 
326         ExFreePool(pp);
327     }
328 }
329 
330 static void log_file_checksum_error_shared(device_extension* Vcb, uint64_t treeaddr, uint64_t addr, uint64_t devid, uint64_t extent) {
331     tree_header* tree;
332     NTSTATUS Status;
333     leaf_node* ln;
334     ULONG i;
335 
336     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
337     if (!tree) {
338         ERR("out of memory\n");
339         return;
340     }
341 
342     Status = read_data(Vcb, treeaddr, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
343     if (!NT_SUCCESS(Status)) {
344         ERR("read_data returned %08lx\n", Status);
345         goto end;
346     }
347 
348     if (tree->level != 0) {
349         ERR("tree level was %x, expected 0\n", tree->level);
350         goto end;
351     }
352 
353     ln = (leaf_node*)&tree[1];
354 
355     for (i = 0; i < tree->num_items; i++) {
356         if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
357             EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)tree + sizeof(tree_header) + ln[i].offset);
358             EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
359 
360             if (ed->type == EXTENT_TYPE_REGULAR && ed2->size != 0 && ed2->address == addr)
361                 log_file_checksum_error(Vcb, addr, devid, tree->tree_id, ln[i].key.obj_id, ln[i].key.offset + addr - extent);
362         }
363     }
364 
365 end:
366     ExFreePool(tree);
367 }
368 
369 static void log_tree_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t root, uint8_t level, KEY* firstitem) {
370     scrub_error* err;
371 
372     err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
373     if (!err) {
374         ERR("out of memory\n");
375         return;
376     }
377 
378     err->address = addr;
379     err->device = devid;
380     err->recovered = false;
381     err->is_metadata = true;
382     err->parity = false;
383 
384     err->metadata.root = root;
385     err->metadata.level = level;
386 
387     if (firstitem) {
388         ERR("root %I64x, level %u, first item (%I64x,%x,%I64x)\n", root, level, firstitem->obj_id,
389                                                                 firstitem->obj_type, firstitem->offset);
390 
391         err->metadata.firstitem = *firstitem;
392     } else {
393         ERR("root %I64x, level %u\n", root, level);
394 
395         RtlZeroMemory(&err->metadata.firstitem, sizeof(KEY));
396     }
397 
398     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
399 
400     Vcb->scrub.num_errors++;
401     InsertTailList(&Vcb->scrub.errors, &err->list_entry);
402 
403     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
404 }
405 
406 static void log_tree_checksum_error_shared(device_extension* Vcb, uint64_t offset, uint64_t address, uint64_t devid) {
407     tree_header* tree;
408     NTSTATUS Status;
409     internal_node* in;
410     ULONG i;
411 
412     tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
413     if (!tree) {
414         ERR("out of memory\n");
415         return;
416     }
417 
418     Status = read_data(Vcb, offset, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
419     if (!NT_SUCCESS(Status)) {
420         ERR("read_data returned %08lx\n", Status);
421         goto end;
422     }
423 
424     if (tree->level == 0) {
425         ERR("tree level was 0\n");
426         goto end;
427     }
428 
429     in = (internal_node*)&tree[1];
430 
431     for (i = 0; i < tree->num_items; i++) {
432         if (in[i].address == address) {
433             log_tree_checksum_error(Vcb, address, devid, tree->tree_id, tree->level - 1, &in[i].key);
434             break;
435         }
436     }
437 
438 end:
439     ExFreePool(tree);
440 }
441 
442 static void log_unrecoverable_error(device_extension* Vcb, uint64_t address, uint64_t devid) {
443     KEY searchkey;
444     traverse_ptr tp;
445     NTSTATUS Status;
446     EXTENT_ITEM* ei;
447     EXTENT_ITEM2* ei2 = NULL;
448     uint8_t* ptr;
449     ULONG len;
450     uint64_t rc;
451 
452     // FIXME - still log even if rest of this function fails
453 
454     searchkey.obj_id = address;
455     searchkey.obj_type = TYPE_METADATA_ITEM;
456     searchkey.offset = 0xffffffffffffffff;
457 
458     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
459     if (!NT_SUCCESS(Status)) {
460         ERR("find_item returned %08lx\n", Status);
461         return;
462     }
463 
464     if ((tp.item->key.obj_type != TYPE_EXTENT_ITEM && tp.item->key.obj_type != TYPE_METADATA_ITEM) ||
465         tp.item->key.obj_id >= address + Vcb->superblock.sector_size ||
466         (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.obj_id + tp.item->key.offset <= address) ||
467         (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->key.obj_id + Vcb->superblock.node_size <= address)
468     )
469         return;
470 
471     if (tp.item->size < sizeof(EXTENT_ITEM)) {
472         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
473         return;
474     }
475 
476     ei = (EXTENT_ITEM*)tp.item->data;
477     ptr = (uint8_t*)&ei[1];
478     len = tp.item->size - sizeof(EXTENT_ITEM);
479 
480     if (tp.item->key.obj_id == TYPE_EXTENT_ITEM && ei->flags & EXTENT_ITEM_TREE_BLOCK) {
481         if (tp.item->size < sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2)) {
482             ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
483                                                                           tp.item->size, sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2));
484             return;
485         }
486 
487         ei2 = (EXTENT_ITEM2*)ptr;
488 
489         ptr += sizeof(EXTENT_ITEM2);
490         len -= sizeof(EXTENT_ITEM2);
491     }
492 
493     rc = 0;
494 
495     while (len > 0) {
496         uint8_t type = *ptr;
497 
498         ptr++;
499         len--;
500 
501         if (type == TYPE_TREE_BLOCK_REF) {
502             TREE_BLOCK_REF* tbr;
503 
504             if (len < sizeof(TREE_BLOCK_REF)) {
505                 ERR("TREE_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(TREE_BLOCK_REF), len);
506                 break;
507             }
508 
509             tbr = (TREE_BLOCK_REF*)ptr;
510 
511             log_tree_checksum_error(Vcb, address, devid, tbr->offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
512 
513             rc++;
514 
515             ptr += sizeof(TREE_BLOCK_REF);
516             len -= sizeof(TREE_BLOCK_REF);
517         } else if (type == TYPE_EXTENT_DATA_REF) {
518             EXTENT_DATA_REF* edr;
519 
520             if (len < sizeof(EXTENT_DATA_REF)) {
521                 ERR("EXTENT_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(EXTENT_DATA_REF), len);
522                 break;
523             }
524 
525             edr = (EXTENT_DATA_REF*)ptr;
526 
527             log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
528 
529             rc += edr->count;
530 
531             ptr += sizeof(EXTENT_DATA_REF);
532             len -= sizeof(EXTENT_DATA_REF);
533         } else if (type == TYPE_SHARED_BLOCK_REF) {
534             SHARED_BLOCK_REF* sbr;
535 
536             if (len < sizeof(SHARED_BLOCK_REF)) {
537                 ERR("SHARED_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_BLOCK_REF), len);
538                 break;
539             }
540 
541             sbr = (SHARED_BLOCK_REF*)ptr;
542 
543             log_tree_checksum_error_shared(Vcb, sbr->offset, address, devid);
544 
545             rc++;
546 
547             ptr += sizeof(SHARED_BLOCK_REF);
548             len -= sizeof(SHARED_BLOCK_REF);
549         } else if (type == TYPE_SHARED_DATA_REF) {
550             SHARED_DATA_REF* sdr;
551 
552             if (len < sizeof(SHARED_DATA_REF)) {
553                 ERR("SHARED_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_DATA_REF), len);
554                 break;
555             }
556 
557             sdr = (SHARED_DATA_REF*)ptr;
558 
559             log_file_checksum_error_shared(Vcb, sdr->offset, address, devid, tp.item->key.obj_id);
560 
561             rc += sdr->count;
562 
563             ptr += sizeof(SHARED_DATA_REF);
564             len -= sizeof(SHARED_DATA_REF);
565         } else {
566             ERR("unknown extent type %x\n", type);
567             break;
568         }
569     }
570 
571     if (rc < ei->refcount) {
572         do {
573             traverse_ptr next_tp;
574 
575             if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
576                 tp = next_tp;
577             else
578                 break;
579 
580             if (tp.item->key.obj_id == address) {
581                 if (tp.item->key.obj_type == TYPE_TREE_BLOCK_REF)
582                     log_tree_checksum_error(Vcb, address, devid, tp.item->key.offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
583                 else if (tp.item->key.obj_type == TYPE_EXTENT_DATA_REF) {
584                     EXTENT_DATA_REF* edr;
585 
586                     if (tp.item->size < sizeof(EXTENT_DATA_REF)) {
587                         ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
588                                                                              tp.item->size, sizeof(EXTENT_DATA_REF));
589                         break;
590                     }
591 
592                     edr = (EXTENT_DATA_REF*)tp.item->data;
593 
594                     log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
595                 } else if (tp.item->key.obj_type == TYPE_SHARED_BLOCK_REF)
596                     log_tree_checksum_error_shared(Vcb, tp.item->key.offset, address, devid);
597                 else if (tp.item->key.obj_type == TYPE_SHARED_DATA_REF)
598                     log_file_checksum_error_shared(Vcb, tp.item->key.offset, address, devid, tp.item->key.obj_id);
599             } else
600                 break;
601         } while (true);
602     }
603 }
604 
605 static void log_error(device_extension* Vcb, uint64_t addr, uint64_t devid, bool metadata, bool recoverable, bool parity) {
606     if (recoverable) {
607         scrub_error* err;
608 
609         if (parity) {
610             ERR("recovering from parity error at %I64x on device %I64x\n", addr, devid);
611         } else {
612             if (metadata)
613                 ERR("recovering from metadata checksum error at %I64x on device %I64x\n", addr, devid);
614             else
615                 ERR("recovering from data checksum error at %I64x on device %I64x\n", addr, devid);
616         }
617 
618         err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
619         if (!err) {
620             ERR("out of memory\n");
621             return;
622         }
623 
624         err->address = addr;
625         err->device = devid;
626         err->recovered = true;
627         err->is_metadata = metadata;
628         err->parity = parity;
629 
630         if (metadata)
631             RtlZeroMemory(&err->metadata, sizeof(err->metadata));
632         else
633             RtlZeroMemory(&err->data, sizeof(err->data));
634 
635         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
636 
637         Vcb->scrub.num_errors++;
638         InsertTailList(&Vcb->scrub.errors, &err->list_entry);
639 
640         ExReleaseResourceLite(&Vcb->scrub.stats_lock);
641     } else {
642         if (metadata)
643             ERR("unrecoverable metadata checksum error at %I64x\n", addr);
644         else
645             ERR("unrecoverable data checksum error at %I64x\n", addr);
646 
647         log_unrecoverable_error(Vcb, addr, devid);
648     }
649 }
650 
651 _Function_class_(IO_COMPLETION_ROUTINE)
652 static NTSTATUS __stdcall scrub_read_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
653     scrub_context_stripe* stripe = conptr;
654     scrub_context* context = (scrub_context*)stripe->context;
655     ULONG left = InterlockedDecrement(&context->stripes_left);
656 
657     UNUSED(DeviceObject);
658 
659     stripe->iosb = Irp->IoStatus;
660 
661     if (left == 0)
662         KeSetEvent(&context->Event, 0, false);
663 
664     return STATUS_MORE_PROCESSING_REQUIRED;
665 }
666 
667 static NTSTATUS scrub_extent_dup(device_extension* Vcb, chunk* c, uint64_t offset, void* csum, scrub_context* context) {
668     NTSTATUS Status;
669     bool csum_error = false;
670     ULONG i;
671     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
672     uint16_t present_devices = 0;
673 
674     if (csum) {
675         ULONG good_stripe = 0xffffffff;
676 
677         for (i = 0; i < c->chunk_item->num_stripes; i++) {
678             if (c->devices[i]->devobj) {
679                 present_devices++;
680 
681                 // if first stripe is okay, we only need to check that the others are identical to it
682                 if (good_stripe != 0xffffffff) {
683                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
684                                         context->stripes[good_stripe].length) != context->stripes[i].length) {
685                         context->stripes[i].csum_error = true;
686                         csum_error = true;
687                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
688                     }
689                 } else {
690                     Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].length >> Vcb->sector_shift, csum);
691                     if (Status == STATUS_CRC_ERROR) {
692                         context->stripes[i].csum_error = true;
693                         csum_error = true;
694                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
695                     } else if (!NT_SUCCESS(Status)) {
696                         ERR("check_csum returned %08lx\n", Status);
697                         return Status;
698                     } else
699                         good_stripe = i;
700                 }
701             }
702         }
703     } else {
704         ULONG good_stripe = 0xffffffff;
705 
706         for (i = 0; i < c->chunk_item->num_stripes; i++) {
707             ULONG j;
708 
709             if (c->devices[i]->devobj) {
710                 // if first stripe is okay, we only need to check that the others are identical to it
711                 if (good_stripe != 0xffffffff) {
712                     if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
713                                          context->stripes[good_stripe].length) != context->stripes[i].length) {
714                         context->stripes[i].csum_error = true;
715                         csum_error = true;
716                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
717                     }
718                 } else {
719                     for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
720                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
721 
722                         if (!check_tree_checksum(Vcb, th) || th->address != offset + UInt32x32To64(j, Vcb->superblock.node_size)) {
723                             context->stripes[i].csum_error = true;
724                             csum_error = true;
725                             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
726                         }
727                     }
728 
729                     if (!context->stripes[i].csum_error)
730                         good_stripe = i;
731                 }
732             }
733         }
734     }
735 
736     if (!csum_error)
737         return STATUS_SUCCESS;
738 
739     // handle checksum error
740 
741     for (i = 0; i < c->chunk_item->num_stripes; i++) {
742         if (context->stripes[i].csum_error) {
743             if (csum) {
744                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[i].length * Vcb->csum_size) >> Vcb->sector_shift, ALLOC_TAG);
745                 if (!context->stripes[i].bad_csums) {
746                     ERR("out of memory\n");
747                     return STATUS_INSUFFICIENT_RESOURCES;
748                 }
749 
750                 do_calc_job(Vcb, context->stripes[i].buf, context->stripes[i].length >> Vcb->sector_shift, context->stripes[i].bad_csums);
751             } else {
752                 ULONG j;
753 
754                 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[i].length * Vcb->csum_size) >> Vcb->sector_shift, ALLOC_TAG);
755                 if (!context->stripes[i].bad_csums) {
756                     ERR("out of memory\n");
757                     return STATUS_INSUFFICIENT_RESOURCES;
758                 }
759 
760                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
761                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
762 
763                     get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[i].bad_csums + (Vcb->csum_size * j));
764                 }
765             }
766         }
767     }
768 
769     if (present_devices > 1) {
770         ULONG good_stripe = 0xffffffff;
771 
772         for (i = 0; i < c->chunk_item->num_stripes; i++) {
773             if (c->devices[i]->devobj && !context->stripes[i].csum_error) {
774                 good_stripe = i;
775                 break;
776             }
777         }
778 
779         if (good_stripe != 0xffffffff) {
780             // log
781 
782             for (i = 0; i < c->chunk_item->num_stripes; i++) {
783                 if (context->stripes[i].csum_error) {
784                     ULONG j;
785 
786                     if (csum) {
787                         for (j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
788                             if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
789                                 uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
790 
791                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
792                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
793                             }
794                         }
795                     } else {
796                         for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
797                             tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
798                             uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
799 
800                             if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
801                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
802                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
803                             }
804                         }
805                     }
806                 }
807             }
808 
809             // write good data over bad
810 
811             for (i = 0; i < c->chunk_item->num_stripes; i++) {
812                 if (context->stripes[i].csum_error && !c->devices[i]->readonly) {
813                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
814                                              context->stripes[good_stripe].buf, context->stripes[i].length);
815 
816                     if (!NT_SUCCESS(Status)) {
817                         ERR("write_data_phys returned %08lx\n", Status);
818                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
819                         return Status;
820                     }
821                 }
822             }
823 
824             return STATUS_SUCCESS;
825         }
826 
827         // if csum errors on all stripes, check sector by sector
828 
829         for (i = 0; i < c->chunk_item->num_stripes; i++) {
830             if (c->devices[i]->devobj) {
831                 if (csum) {
832                     for (ULONG j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
833                         if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
834                             ULONG k;
835                             uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
836                             bool recovered = false;
837 
838                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
839                                 if (i != k && c->devices[k]->devobj &&
840                                     RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size),
841                                                      (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
842                                     log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
843                                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
844 
845                                     RtlCopyMemory(context->stripes[i].buf + (j << Vcb->sector_shift),
846                                                   context->stripes[k].buf + (j << Vcb->sector_shift), Vcb->superblock.sector_size);
847 
848                                     recovered = true;
849                                     break;
850                                 }
851                             }
852 
853                             if (!recovered) {
854                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
855                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
856                             }
857                         }
858                     }
859                 } else {
860                     for (ULONG j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
861                         tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
862                         uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
863 
864                         if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
865                             ULONG k;
866                             bool recovered = false;
867 
868                             for (k = 0; k < c->chunk_item->num_stripes; k++) {
869                                 if (i != k && c->devices[k]->devobj) {
870                                     tree_header* th2 = (tree_header*)&context->stripes[k].buf[j * Vcb->superblock.node_size];
871 
872                                     if (RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
873                                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
874                                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
875 
876                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
877 
878                                         recovered = true;
879                                         break;
880                                     }
881                                 }
882                             }
883 
884                             if (!recovered) {
885                                 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
886                                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
887                             }
888                         }
889                     }
890                 }
891             }
892         }
893 
894         // write good data over bad
895 
896         for (i = 0; i < c->chunk_item->num_stripes; i++) {
897             if (c->devices[i]->devobj && !c->devices[i]->readonly) {
898                 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
899                                          context->stripes[i].buf, context->stripes[i].length);
900                 if (!NT_SUCCESS(Status)) {
901                     ERR("write_data_phys returned %08lx\n", Status);
902                     log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
903                     return Status;
904                 }
905             }
906         }
907 
908         return STATUS_SUCCESS;
909     }
910 
911     for (i = 0; i < c->chunk_item->num_stripes; i++) {
912         if (c->devices[i]->devobj) {
913             ULONG j;
914 
915             if (csum) {
916                 for (j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
917                     if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
918                         uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
919 
920                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
921                     }
922                 }
923             } else {
924                 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
925                     tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
926                     uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
927 
928                     if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr)
929                         log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
930                 }
931             }
932         }
933     }
934 
935     return STATUS_SUCCESS;
936 }
937 
938 static NTSTATUS scrub_extent_raid0(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
939     ULONG j;
940     uint16_t stripe;
941     uint32_t pos, *stripeoff;
942 
943     pos = 0;
944     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes, ALLOC_TAG);
945     if (!stripeoff) {
946         ERR("out of memory\n");
947         return STATUS_INSUFFICIENT_RESOURCES;
948     }
949 
950     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes);
951 
952     stripe = startoffstripe;
953     while (pos < length) {
954         uint32_t readlen;
955 
956         if (pos == 0)
957             readlen = (uint32_t)min(context->stripes[stripe].length, c->chunk_item->stripe_length - (context->stripes[stripe].start % c->chunk_item->stripe_length));
958         else
959             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
960 
961         if (csum) {
962             for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
963                 if (!check_sector_csum(Vcb, context->stripes[stripe].buf + stripeoff[stripe], (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift))) {
964                     uint64_t addr = offset + pos;
965 
966                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
967                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
968                 }
969 
970                 pos += Vcb->superblock.sector_size;
971                 stripeoff[stripe] += Vcb->superblock.sector_size;
972             }
973         } else {
974             for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
975                 tree_header* th = (tree_header*)(context->stripes[stripe].buf + stripeoff[stripe]);
976                 uint64_t addr = offset + pos;
977 
978                 if (!check_tree_checksum(Vcb, th) || th->address != addr) {
979                     log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
980                     log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
981                 }
982 
983                 pos += Vcb->superblock.node_size;
984                 stripeoff[stripe] += Vcb->superblock.node_size;
985             }
986         }
987 
988         stripe = (stripe + 1) % c->chunk_item->num_stripes;
989     }
990 
991     ExFreePool(stripeoff);
992 
993     return STATUS_SUCCESS;
994 }
995 
996 static NTSTATUS scrub_extent_raid10(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
997     ULONG j;
998     uint16_t stripe, sub_stripes = max(c->chunk_item->sub_stripes, 1);
999     uint32_t pos, *stripeoff;
1000     bool csum_error = false;
1001     NTSTATUS Status;
1002 
1003     pos = 0;
1004     stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes, ALLOC_TAG);
1005     if (!stripeoff) {
1006         ERR("out of memory\n");
1007         return STATUS_INSUFFICIENT_RESOURCES;
1008     }
1009 
1010     RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes);
1011 
1012     stripe = startoffstripe;
1013     while (pos < length) {
1014         uint32_t readlen;
1015 
1016         if (pos == 0)
1017             readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1018                                   c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1019         else
1020             readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1021 
1022         if (csum) {
1023             ULONG good_stripe = 0xffffffff;
1024             uint16_t k;
1025 
1026             for (k = 0; k < sub_stripes; k++) {
1027                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1028                     // if first stripe is okay, we only need to check that the others are identical to it
1029                     if (good_stripe != 0xffffffff) {
1030                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1031                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1032                                             readlen) != readlen) {
1033                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1034                             csum_error = true;
1035                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1036                         }
1037                     } else {
1038                         for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
1039                             if (!check_sector_csum(Vcb, context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j,
1040                                                    (uint8_t*)csum + (((pos + j) * Vcb->csum_size) >> Vcb->sector_shift))) {
1041                                 csum_error = true;
1042                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1043                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1044                                 break;
1045                             }
1046                         }
1047 
1048                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1049                             good_stripe = k;
1050                     }
1051                 }
1052             }
1053 
1054             pos += readlen;
1055             stripeoff[stripe] += readlen;
1056         } else {
1057             ULONG good_stripe = 0xffffffff;
1058             uint16_t k;
1059 
1060             for (k = 0; k < sub_stripes; k++) {
1061                 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1062                     // if first stripe is okay, we only need to check that the others are identical to it
1063                     if (good_stripe != 0xffffffff) {
1064                         if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1065                                             context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1066                                             readlen) != readlen) {
1067                             context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1068                             csum_error = true;
1069                             log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1070                         }
1071                     } else {
1072                         for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
1073                             tree_header* th = (tree_header*)(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j);
1074                             uint64_t addr = offset + pos + j;
1075 
1076                             if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1077                                 csum_error = true;
1078                                 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1079                                 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1080                                 break;
1081                             }
1082                         }
1083 
1084                         if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1085                             good_stripe = k;
1086                     }
1087                 }
1088             }
1089 
1090             pos += readlen;
1091             stripeoff[stripe] += readlen;
1092         }
1093 
1094         stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1095     }
1096 
1097     if (!csum_error) {
1098         Status = STATUS_SUCCESS;
1099         goto end;
1100     }
1101 
1102     for (j = 0; j < c->chunk_item->num_stripes; j += sub_stripes) {
1103         ULONG goodstripe = 0xffffffff;
1104         uint16_t k;
1105         bool hasbadstripe = false;
1106 
1107         if (context->stripes[j].length == 0)
1108             continue;
1109 
1110         for (k = 0; k < sub_stripes; k++) {
1111             if (c->devices[j + k]->devobj) {
1112                 if (!context->stripes[j + k].csum_error)
1113                     goodstripe = k;
1114                 else
1115                     hasbadstripe = true;
1116             }
1117         }
1118 
1119         if (hasbadstripe) {
1120             if (goodstripe != 0xffffffff) {
1121                 for (k = 0; k < sub_stripes; k++) {
1122                     if (c->devices[j + k]->devobj && context->stripes[j + k].csum_error) {
1123                         uint32_t so = 0;
1124                         bool recovered = false;
1125 
1126                         pos = 0;
1127 
1128                         stripe = startoffstripe;
1129                         while (pos < length) {
1130                             uint32_t readlen;
1131 
1132                             if (pos == 0)
1133                                 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1134                                               c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1135                             else
1136                                 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1137 
1138                             if (stripe == j / sub_stripes) {
1139                                 if (csum) {
1140                                     ULONG l;
1141 
1142                                     for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1143                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1144                                                              context->stripes[j + goodstripe].buf + so,
1145                                                              Vcb->superblock.sector_size) != Vcb->superblock.sector_size) {
1146                                             uint64_t addr = offset + pos;
1147 
1148                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1149 
1150                                             recovered = true;
1151                                         }
1152 
1153                                         pos += Vcb->superblock.sector_size;
1154                                         so += Vcb->superblock.sector_size;
1155                                     }
1156                                 } else {
1157                                     ULONG l;
1158 
1159                                     for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1160                                         if (RtlCompareMemory(context->stripes[j + k].buf + so,
1161                                                             context->stripes[j + goodstripe].buf + so,
1162                                                             Vcb->superblock.node_size) != Vcb->superblock.node_size) {
1163                                             uint64_t addr = offset + pos;
1164 
1165                                             log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1166 
1167                                             recovered = true;
1168                                         }
1169 
1170                                         pos += Vcb->superblock.node_size;
1171                                         so += Vcb->superblock.node_size;
1172                                     }
1173                                 }
1174                             } else
1175                                 pos += readlen;
1176 
1177                             stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1178                         }
1179 
1180                         if (recovered) {
1181                             // write good data over bad
1182 
1183                             if (!c->devices[j + k]->readonly) {
1184                                 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1185 
1186                                 Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1187                                                          context->stripes[j + goodstripe].buf, context->stripes[j + goodstripe].length);
1188 
1189                                 if (!NT_SUCCESS(Status)) {
1190                                     ERR("write_data_phys returned %08lx\n", Status);
1191                                     log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1192                                     goto end;
1193                                 }
1194                             }
1195                         }
1196                     }
1197                 }
1198             } else {
1199                 uint32_t so = 0;
1200                 bool recovered = false;
1201 
1202                 if (csum) {
1203                     for (k = 0; k < sub_stripes; k++) {
1204                         if (c->devices[j + k]->devobj) {
1205                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[j + k].length * Vcb->csum_size) >> Vcb->sector_shift,
1206                                                                                       ALLOC_TAG);
1207                             if (!context->stripes[j + k].bad_csums) {
1208                                 ERR("out of memory\n");
1209                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1210                                 goto end;
1211                             }
1212 
1213                             do_calc_job(Vcb, context->stripes[j + k].buf, context->stripes[j + k].length >> Vcb->sector_shift, context->stripes[j + k].bad_csums);
1214                         }
1215                     }
1216                 } else {
1217                     for (k = 0; k < sub_stripes; k++) {
1218                         if (c->devices[j + k]->devobj) {
1219                             ULONG l;
1220 
1221                             context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * Vcb->csum_size / Vcb->superblock.node_size,
1222                                                                                       ALLOC_TAG);
1223                             if (!context->stripes[j + k].bad_csums) {
1224                                 ERR("out of memory\n");
1225                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1226                                 goto end;
1227                             }
1228 
1229                             for (l = 0; l < context->stripes[j + k].length / Vcb->superblock.node_size; l++) {
1230                                 tree_header* th = (tree_header*)&context->stripes[j + k].buf[l * Vcb->superblock.node_size];
1231 
1232                                 get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[j + k].bad_csums + (Vcb->csum_size * l));
1233                             }
1234                         }
1235                     }
1236                 }
1237 
1238                 pos = 0;
1239 
1240                 stripe = startoffstripe;
1241                 while (pos < length) {
1242                     uint32_t readlen;
1243 
1244                     if (pos == 0)
1245                         readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1246                                       c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1247                     else
1248                         readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1249 
1250                     if (stripe == j / sub_stripes) {
1251                         ULONG l;
1252 
1253                         if (csum) {
1254                             for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1255                                 bool has_error = false;
1256 
1257                                 goodstripe = 0xffffffff;
1258                                 for (k = 0; k < sub_stripes; k++) {
1259                                     if (c->devices[j + k]->devobj) {
1260                                         if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + ((so * Vcb->csum_size) >> Vcb->sector_shift),
1261                                             (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift),
1262                                             Vcb->csum_size) != Vcb->csum_size) {
1263                                             has_error = true;
1264                                         } else
1265                                             goodstripe = k;
1266                                     }
1267                                 }
1268 
1269                                 if (has_error) {
1270                                     if (goodstripe != 0xffffffff) {
1271                                         for (k = 0; k < sub_stripes; k++) {
1272                                             if (c->devices[j + k]->devobj &&
1273                                                 RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + ((so * Vcb->csum_size) >> Vcb->sector_shift),
1274                                                                  (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift),
1275                                                                  Vcb->csum_size) != Vcb->csum_size) {
1276                                                 uint64_t addr = offset + pos;
1277 
1278                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1279 
1280                                                 recovered = true;
1281 
1282                                                 RtlCopyMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so,
1283                                                               Vcb->superblock.sector_size);
1284                                             }
1285                                         }
1286                                     } else {
1287                                         uint64_t addr = offset + pos;
1288 
1289                                         for (k = 0; k < sub_stripes; k++) {
1290                                             if (c->devices[j + j]->devobj) {
1291                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, false, false);
1292                                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1293                                             }
1294                                         }
1295                                     }
1296                                 }
1297 
1298                                 pos += Vcb->superblock.sector_size;
1299                                 so += Vcb->superblock.sector_size;
1300                             }
1301                         } else {
1302                             for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1303                                 for (k = 0; k < sub_stripes; k++) {
1304                                     if (c->devices[j + k]->devobj) {
1305                                         tree_header* th = (tree_header*)&context->stripes[j + k].buf[so];
1306                                         uint64_t addr = offset + pos;
1307 
1308                                         if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
1309                                             ULONG m;
1310 
1311                                             recovered = false;
1312 
1313                                             for (m = 0; m < sub_stripes; m++) {
1314                                                 if (m != k) {
1315                                                     tree_header* th2 = (tree_header*)&context->stripes[j + m].buf[so];
1316 
1317                                                     if (RtlCompareMemory((uint8_t*)context->stripes[j + m].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
1318                                                         log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1319 
1320                                                         RtlCopyMemory(th, th2, Vcb->superblock.node_size);
1321 
1322                                                         recovered = true;
1323                                                         break;
1324                                                     } else
1325                                                         log_device_error(Vcb, c->devices[j + m], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1326                                                 }
1327                                             }
1328 
1329                                             if (!recovered)
1330                                                 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, false, false);
1331                                         }
1332                                     }
1333                                 }
1334 
1335                                 pos += Vcb->superblock.node_size;
1336                                 so += Vcb->superblock.node_size;
1337                             }
1338                         }
1339                     } else
1340                         pos += readlen;
1341 
1342                     stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1343                 }
1344 
1345                 if (recovered) {
1346                     // write good data over bad
1347 
1348                     for (k = 0; k < sub_stripes; k++) {
1349                         if (c->devices[j + k]->devobj && !c->devices[j + k]->readonly) {
1350                             CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1351 
1352                             Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1353                                                      context->stripes[j + k].buf, context->stripes[j + k].length);
1354 
1355                             if (!NT_SUCCESS(Status)) {
1356                                 ERR("write_data_phys returned %08lx\n", Status);
1357                                 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1358                                 goto end;
1359                             }
1360                         }
1361                     }
1362                 }
1363             }
1364         }
1365     }
1366 
1367     Status = STATUS_SUCCESS;
1368 
1369 end:
1370     ExFreePool(stripeoff);
1371 
1372     return Status;
1373 }
1374 
1375 static NTSTATUS scrub_extent(device_extension* Vcb, chunk* c, ULONG type, uint64_t offset, uint32_t size, void* csum) {
1376     ULONG i;
1377     scrub_context context;
1378     CHUNK_ITEM_STRIPE* cis;
1379     NTSTATUS Status;
1380     uint16_t startoffstripe = 0, num_missing, allowed_missing;
1381 
1382     TRACE("(%p, %p, %lx, %I64x, %x, %p)\n", Vcb, c, type, offset, size, csum);
1383 
1384     context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
1385     if (!context.stripes) {
1386         ERR("out of memory\n");
1387         Status = STATUS_INSUFFICIENT_RESOURCES;
1388         goto end;
1389     }
1390 
1391     RtlZeroMemory(context.stripes, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes);
1392 
1393     context.stripes_left = 0;
1394 
1395     cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1396 
1397     if (type == BLOCK_FLAG_RAID0) {
1398         uint64_t startoff, endoff;
1399         uint16_t endoffstripe;
1400 
1401         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &startoff, &startoffstripe);
1402         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &endoff, &endoffstripe);
1403 
1404         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1405             if (startoffstripe > i)
1406                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1407             else if (startoffstripe == i)
1408                 context.stripes[i].start = startoff;
1409             else
1410                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1411 
1412             if (endoffstripe > i)
1413                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1414             else if (endoffstripe == i)
1415                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1416             else
1417                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1418         }
1419 
1420         allowed_missing = 0;
1421     } else if (type == BLOCK_FLAG_RAID10) {
1422         uint64_t startoff, endoff;
1423         uint16_t endoffstripe, j, sub_stripes = max(c->chunk_item->sub_stripes, 1);
1424 
1425         get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &startoff, &startoffstripe);
1426         get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &endoff, &endoffstripe);
1427 
1428         if ((c->chunk_item->num_stripes % sub_stripes) != 0) {
1429             ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", c->offset, c->chunk_item->num_stripes, sub_stripes);
1430             Status = STATUS_INTERNAL_ERROR;
1431             goto end;
1432         }
1433 
1434         startoffstripe *= sub_stripes;
1435         endoffstripe *= sub_stripes;
1436 
1437         for (i = 0; i < c->chunk_item->num_stripes; i += sub_stripes) {
1438             if (startoffstripe > i)
1439                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
1440             else if (startoffstripe == i)
1441                 context.stripes[i].start = startoff;
1442             else
1443                 context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);
1444 
1445             if (endoffstripe > i)
1446                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
1447             else if (endoffstripe == i)
1448                 context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
1449             else
1450                 context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
1451 
1452             for (j = 1; j < sub_stripes; j++) {
1453                 context.stripes[i+j].start = context.stripes[i].start;
1454                 context.stripes[i+j].length = context.stripes[i].length;
1455             }
1456         }
1457 
1458         startoffstripe /= sub_stripes;
1459         allowed_missing = 1;
1460     } else
1461         allowed_missing = c->chunk_item->num_stripes - 1;
1462 
1463     num_missing = 0;
1464 
1465     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1466         PIO_STACK_LOCATION IrpSp;
1467 
1468         context.stripes[i].context = (struct _scrub_context*)&context;
1469 
1470         if (type == BLOCK_FLAG_DUPLICATE) {
1471             context.stripes[i].start = offset - c->offset;
1472             context.stripes[i].length = size;
1473         } else if (type != BLOCK_FLAG_RAID0 && type != BLOCK_FLAG_RAID10) {
1474             ERR("unexpected chunk type %lx\n", type);
1475             Status = STATUS_INTERNAL_ERROR;
1476             goto end;
1477         }
1478 
1479         if (!c->devices[i]->devobj) {
1480             num_missing++;
1481 
1482             if (num_missing > allowed_missing) {
1483                 ERR("too many missing devices (at least %u, maximum allowed %u)\n", num_missing, allowed_missing);
1484                 Status = STATUS_INTERNAL_ERROR;
1485                 goto end;
1486             }
1487         } else if (context.stripes[i].length > 0) {
1488             context.stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1489 
1490             if (!context.stripes[i].buf) {
1491                 ERR("out of memory\n");
1492                 Status = STATUS_INSUFFICIENT_RESOURCES;
1493                 goto end;
1494             }
1495 
1496             context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
1497 
1498             if (!context.stripes[i].Irp) {
1499                 ERR("IoAllocateIrp failed\n");
1500                 Status = STATUS_INSUFFICIENT_RESOURCES;
1501                 goto end;
1502             }
1503 
1504             IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
1505             IrpSp->MajorFunction = IRP_MJ_READ;
1506             IrpSp->FileObject = c->devices[i]->fileobj;
1507 
1508             if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
1509                 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
1510                 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
1511                     ERR("out of memory\n");
1512                     Status = STATUS_INSUFFICIENT_RESOURCES;
1513                     goto end;
1514                 }
1515 
1516                 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
1517 
1518                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1519             } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
1520                 context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, context.stripes[i].length, false, false, NULL);
1521                 if (!context.stripes[i].Irp->MdlAddress) {
1522                     ERR("IoAllocateMdl failed\n");
1523                     Status = STATUS_INSUFFICIENT_RESOURCES;
1524                     goto end;
1525                 }
1526 
1527                 Status = STATUS_SUCCESS;
1528 
1529                 _SEH2_TRY {
1530                     MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
1531                 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1532                     Status = _SEH2_GetExceptionCode();
1533                 } _SEH2_END;
1534 
1535                 if (!NT_SUCCESS(Status)) {
1536                     ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1537                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1538                     context.stripes[i].Irp->MdlAddress = NULL;
1539                     goto end;
1540                 }
1541             } else
1542                 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
1543 
1544             IrpSp->Parameters.Read.Length = context.stripes[i].length;
1545             IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].start + cis[i].offset;
1546 
1547             context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
1548 
1549             IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion, &context.stripes[i], true, true, true);
1550 
1551             context.stripes_left++;
1552 
1553             Vcb->scrub.data_scrubbed += context.stripes[i].length;
1554         }
1555     }
1556 
1557     if (context.stripes_left == 0) {
1558         ERR("error - not reading any stripes\n");
1559         Status = STATUS_INTERNAL_ERROR;
1560         goto end;
1561     }
1562 
1563     KeInitializeEvent(&context.Event, NotificationEvent, false);
1564 
1565     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1566         if (c->devices[i]->devobj && context.stripes[i].length > 0)
1567             IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
1568     }
1569 
1570     KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
1571 
1572     // return an error if any of the stripes returned an error
1573     for (i = 0; i < c->chunk_item->num_stripes; i++) {
1574         if (!NT_SUCCESS(context.stripes[i].iosb.Status)) {
1575             Status = context.stripes[i].iosb.Status;
1576             log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
1577             goto end;
1578         }
1579     }
1580 
1581     if (type == BLOCK_FLAG_DUPLICATE) {
1582         Status = scrub_extent_dup(Vcb, c, offset, csum, &context);
1583         if (!NT_SUCCESS(Status)) {
1584             ERR("scrub_extent_dup returned %08lx\n", Status);
1585             goto end;
1586         }
1587     } else if (type == BLOCK_FLAG_RAID0) {
1588         Status = scrub_extent_raid0(Vcb, c, offset, size, startoffstripe, csum, &context);
1589         if (!NT_SUCCESS(Status)) {
1590             ERR("scrub_extent_raid0 returned %08lx\n", Status);
1591             goto end;
1592         }
1593     } else if (type == BLOCK_FLAG_RAID10) {
1594         Status = scrub_extent_raid10(Vcb, c, offset, size, startoffstripe, csum, &context);
1595         if (!NT_SUCCESS(Status)) {
1596             ERR("scrub_extent_raid10 returned %08lx\n", Status);
1597             goto end;
1598         }
1599     }
1600 
1601 end:
1602     if (context.stripes) {
1603         for (i = 0; i < c->chunk_item->num_stripes; i++) {
1604             if (context.stripes[i].Irp) {
1605                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
1606                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
1607                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
1608                 }
1609                 IoFreeIrp(context.stripes[i].Irp);
1610             }
1611 
1612             if (context.stripes[i].buf)
1613                 ExFreePool(context.stripes[i].buf);
1614 
1615             if (context.stripes[i].bad_csums)
1616                 ExFreePool(context.stripes[i].bad_csums);
1617         }
1618 
1619         ExFreePool(context.stripes);
1620     }
1621 
1622     return Status;
1623 }
1624 
1625 static NTSTATUS scrub_data_extent(device_extension* Vcb, chunk* c, uint64_t offset, ULONG type, void* csum, RTL_BITMAP* bmp, ULONG bmplen) {
1626     NTSTATUS Status;
1627     ULONG runlength, index;
1628 
1629     runlength = RtlFindFirstRunClear(bmp, &index);
1630 
1631     while (runlength != 0) {
1632         if (index >= bmplen)
1633             break;
1634 
1635         if (index + runlength >= bmplen) {
1636             runlength = bmplen - index;
1637 
1638             if (runlength == 0)
1639                 break;
1640         }
1641 
1642         do {
1643             ULONG rl;
1644 
1645             if (runlength << Vcb->sector_shift > SCRUB_UNIT)
1646                 rl = SCRUB_UNIT >> Vcb->sector_shift;
1647             else
1648                 rl = runlength;
1649 
1650             Status = scrub_extent(Vcb, c, type, offset + ((uint64_t)index << Vcb->sector_shift),
1651                                   rl << Vcb->sector_shift, (uint8_t*)csum + (index * Vcb->csum_size));
1652             if (!NT_SUCCESS(Status)) {
1653                 ERR("scrub_data_extent_dup returned %08lx\n", Status);
1654                 return Status;
1655             }
1656 
1657             runlength -= rl;
1658             index += rl;
1659         } while (runlength > 0);
1660 
1661         runlength = RtlFindNextForwardRunClear(bmp, index, &index);
1662     }
1663 
1664     return STATUS_SUCCESS;
1665 }
1666 
1667 typedef struct {
1668     uint8_t* buf;
1669     PIRP Irp;
1670     void* context;
1671     IO_STATUS_BLOCK iosb;
1672     uint64_t offset;
1673     bool rewrite, missing;
1674     RTL_BITMAP error;
1675     ULONG* errorarr;
1676 } scrub_context_raid56_stripe;
1677 
1678 typedef struct {
1679     scrub_context_raid56_stripe* stripes;
1680     LONG stripes_left;
1681     KEVENT Event;
1682     RTL_BITMAP alloc;
1683     RTL_BITMAP has_csum;
1684     RTL_BITMAP is_tree;
1685     void* csum;
1686     uint8_t* parity_scratch;
1687     uint8_t* parity_scratch2;
1688 } scrub_context_raid56;
1689 
1690 _Function_class_(IO_COMPLETION_ROUTINE)
1691 static NTSTATUS __stdcall scrub_read_completion_raid56(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
1692     scrub_context_raid56_stripe* stripe = conptr;
1693     scrub_context_raid56* context = (scrub_context_raid56*)stripe->context;
1694     LONG left = InterlockedDecrement(&context->stripes_left);
1695 
1696     UNUSED(DeviceObject);
1697 
1698     stripe->iosb = Irp->IoStatus;
1699 
1700     if (left == 0)
1701         KeSetEvent(&context->Event, 0, false);
1702 
1703     return STATUS_MORE_PROCESSING_REQUIRED;
1704 }
1705 
1706 static void scrub_raid5_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1707                                uint64_t num, uint16_t missing_devices) {
1708     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift), off;
1709     uint16_t stripe, parity = (bit_start + num + c->chunk_item->num_stripes - 1) % c->chunk_item->num_stripes;
1710     uint64_t stripeoff;
1711 
1712     stripe = (parity + 1) % c->chunk_item->num_stripes;
1713     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1);
1714     stripeoff = num * sectors_per_stripe;
1715 
1716     if (missing_devices == 0)
1717         RtlCopyMemory(context->parity_scratch, &context->stripes[parity].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1718 
1719     while (stripe != parity) {
1720         RtlClearAllBits(&context->stripes[stripe].error);
1721 
1722         for (ULONG i = 0; i < sectors_per_stripe; i++) {
1723             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1724                 if (RtlCheckBit(&context->is_tree, off)) {
1725                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff << Vcb->sector_shift];
1726                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
1727 
1728                     if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1729                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size >> Vcb->sector_shift);
1730                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1731 
1732                         if (missing_devices > 0)
1733                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1734                     }
1735 
1736                     off += Vcb->superblock.node_size >> Vcb->sector_shift;
1737                     stripeoff += Vcb->superblock.node_size >> Vcb->sector_shift;
1738                     i += (Vcb->superblock.node_size >> Vcb->sector_shift) - 1;
1739 
1740                     continue;
1741                 } else if (RtlCheckBit(&context->has_csum, off)) {
1742                     if (!check_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff << Vcb->sector_shift), (uint8_t*)context->csum + (Vcb->csum_size * off))) {
1743                         RtlSetBit(&context->stripes[stripe].error, i);
1744                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1745 
1746                         if (missing_devices > 0) {
1747                             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
1748 
1749                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1750                         }
1751                     }
1752                 }
1753             }
1754 
1755             off++;
1756             stripeoff++;
1757         }
1758 
1759         if (missing_devices == 0)
1760             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1761 
1762         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1763         stripeoff = num * sectors_per_stripe;
1764     }
1765 
1766     // check parity
1767 
1768     if (missing_devices == 0) {
1769         RtlClearAllBits(&context->stripes[parity].error);
1770 
1771         for (ULONG i = 0; i < sectors_per_stripe; i++) {
1772             ULONG o, j;
1773 
1774             o = i << Vcb->sector_shift;
1775             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1776                 if (context->parity_scratch[o] != 0) {
1777                     RtlSetBit(&context->stripes[parity].error, i);
1778                     break;
1779                 }
1780                 o++;
1781             }
1782         }
1783     }
1784 
1785     // log and fix errors
1786 
1787     if (missing_devices > 0)
1788         return;
1789 
1790     for (ULONG i = 0; i < sectors_per_stripe; i++) {
1791         ULONG num_errors = 0, bad_off = 0;
1792         uint64_t bad_stripe = 0;
1793         bool alloc = false;
1794 
1795         stripe = (parity + 1) % c->chunk_item->num_stripes;
1796         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1797 
1798         while (stripe != parity) {
1799             if (RtlCheckBit(&context->alloc, off)) {
1800                 alloc = true;
1801 
1802                 if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1803                     bad_stripe = stripe;
1804                     bad_off = off;
1805                     num_errors++;
1806                 }
1807             }
1808 
1809             off += sectors_per_stripe;
1810             stripe = (stripe + 1) % c->chunk_item->num_stripes;
1811         }
1812 
1813         if (!alloc)
1814             continue;
1815 
1816         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity].error, i)) // everything fine
1817             continue;
1818 
1819         if (num_errors == 0 && RtlCheckBit(&context->stripes[parity].error, i)) { // parity error
1820             uint64_t addr;
1821 
1822             do_xor(&context->stripes[parity].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1823                    &context->parity_scratch[i << Vcb->sector_shift],
1824                    Vcb->superblock.sector_size);
1825 
1826             bad_off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1827             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off << Vcb->sector_shift);
1828 
1829             context->stripes[parity].rewrite = true;
1830 
1831             log_error(Vcb, addr, c->devices[parity]->devitem.dev_id, false, true, true);
1832             log_device_error(Vcb, c->devices[parity], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1833         } else if (num_errors == 1) {
1834             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off << Vcb->sector_shift);
1835 
1836             if (RtlCheckBit(&context->is_tree, bad_off)) {
1837                 tree_header* th;
1838 
1839                 do_xor(&context->parity_scratch[i << Vcb->sector_shift],
1840                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1841                        Vcb->superblock.node_size);
1842 
1843                 th = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];
1844 
1845                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
1846                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1847                                   &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);
1848 
1849                     context->stripes[bad_stripe].rewrite = true;
1850 
1851                     RtlClearBits(&context->stripes[bad_stripe].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);
1852 
1853                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, true, false);
1854                 } else
1855                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, false, false);
1856             } else {
1857                 uint8_t hash[MAX_HASH_SIZE];
1858 
1859                 do_xor(&context->parity_scratch[i << Vcb->sector_shift],
1860                        &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1861                        Vcb->superblock.sector_size);
1862 
1863                 get_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], hash);
1864 
1865                 if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * bad_off), Vcb->csum_size) == Vcb->csum_size) {
1866                     RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1867                                   &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);
1868 
1869                     context->stripes[bad_stripe].rewrite = true;
1870 
1871                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, true, false);
1872                 } else
1873                     log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, false, false);
1874             }
1875         } else {
1876             stripe = (parity + 1) % c->chunk_item->num_stripes;
1877             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
1878 
1879             while (stripe != parity) {
1880                 if (RtlCheckBit(&context->alloc, off)) {
1881                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
1882                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
1883 
1884                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
1885                     }
1886                 }
1887 
1888                 off += sectors_per_stripe;
1889                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
1890             }
1891         }
1892     }
1893 }
1894 
1895 static void scrub_raid6_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
1896                                uint64_t num, uint16_t missing_devices) {
1897     ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift), off;
1898     uint16_t stripe, parity1 = (bit_start + num + c->chunk_item->num_stripes - 2) % c->chunk_item->num_stripes;
1899     uint16_t parity2 = (parity1 + 1) % c->chunk_item->num_stripes;
1900     uint64_t stripeoff;
1901 
1902     stripe = (parity1 + 2) % c->chunk_item->num_stripes;
1903     off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2);
1904     stripeoff = num * sectors_per_stripe;
1905 
1906     if (c->devices[parity1]->devobj)
1907         RtlCopyMemory(context->parity_scratch, &context->stripes[parity1].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);
1908 
1909     if (c->devices[parity2]->devobj)
1910         RtlZeroMemory(context->parity_scratch2, (ULONG)c->chunk_item->stripe_length);
1911 
1912     while (stripe != parity1) {
1913         RtlClearAllBits(&context->stripes[stripe].error);
1914 
1915         for (ULONG i = 0; i < sectors_per_stripe; i++) {
1916             if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
1917                 if (RtlCheckBit(&context->is_tree, off)) {
1918                     tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff << Vcb->sector_shift];
1919                     uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
1920 
1921                     if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1922                         RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size >> Vcb->sector_shift);
1923                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1924 
1925                         if (missing_devices == 2)
1926                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
1927                     }
1928 
1929                     off += Vcb->superblock.node_size >> Vcb->sector_shift;
1930                     stripeoff += Vcb->superblock.node_size >> Vcb->sector_shift;
1931                     i += (Vcb->superblock.node_size >> Vcb->sector_shift) - 1;
1932 
1933                     continue;
1934                 } else if (RtlCheckBit(&context->has_csum, off)) {
1935                     uint8_t hash[MAX_HASH_SIZE];
1936 
1937                     get_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff << Vcb->sector_shift), hash);
1938 
1939                     if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * off), Vcb->csum_size) != Vcb->csum_size) {
1940                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
1941 
1942                         RtlSetBit(&context->stripes[stripe].error, i);
1943                         log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1944 
1945                         if (missing_devices == 2)
1946                             log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
1947                     }
1948                 }
1949             }
1950 
1951             off++;
1952             stripeoff++;
1953         }
1954 
1955         if (c->devices[parity1]->devobj)
1956             do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
1957 
1958         stripe = (stripe + 1) % c->chunk_item->num_stripes;
1959         stripeoff = num * sectors_per_stripe;
1960     }
1961 
1962     RtlClearAllBits(&context->stripes[parity1].error);
1963 
1964     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity2]->devobj)) {
1965         // check parity 1
1966 
1967         for (ULONG i = 0; i < sectors_per_stripe; i++) {
1968             ULONG o, j;
1969 
1970             o = i << Vcb->sector_shift;
1971             for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
1972                 if (context->parity_scratch[o] != 0) {
1973                     RtlSetBit(&context->stripes[parity1].error, i);
1974                     break;
1975                 }
1976                 o++;
1977             }
1978         }
1979     }
1980 
1981     RtlClearAllBits(&context->stripes[parity2].error);
1982 
1983     if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity1]->devobj)) {
1984         // check parity 2
1985 
1986         stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
1987 
1988         while (stripe != parity2) {
1989             galois_double(context->parity_scratch2, (uint32_t)c->chunk_item->stripe_length);
1990             do_xor(context->parity_scratch2, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);
1991 
1992             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
1993         }
1994 
1995         for (ULONG i = 0; i < sectors_per_stripe; i++) {
1996             if (RtlCompareMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
1997                                  &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.sector_size) != Vcb->superblock.sector_size)
1998                 RtlSetBit(&context->stripes[parity2].error, i);
1999         }
2000     }
2001 
2002     if (missing_devices == 2)
2003         return;
2004 
2005     // log and fix errors
2006 
2007     for (ULONG i = 0; i < sectors_per_stripe; i++) {
2008         ULONG num_errors = 0;
2009         uint64_t bad_stripe1 = 0, bad_stripe2 = 0;
2010         ULONG bad_off1 = 0, bad_off2 = 0;
2011         bool alloc = false;
2012 
2013         stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2014         off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2015 
2016         while (stripe != parity1) {
2017             if (RtlCheckBit(&context->alloc, off)) {
2018                 alloc = true;
2019 
2020                 if (!c->devices[stripe]->devobj || RtlCheckBit(&context->stripes[stripe].error, i)) {
2021                     if (num_errors == 0) {
2022                         bad_stripe1 = stripe;
2023                         bad_off1 = off;
2024                     } else if (num_errors == 1) {
2025                         bad_stripe2 = stripe;
2026                         bad_off2 = off;
2027                     }
2028                     num_errors++;
2029                 }
2030             }
2031 
2032             off += sectors_per_stripe;
2033             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2034         }
2035 
2036         if (!alloc)
2037             continue;
2038 
2039         if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity1].error, i) && !RtlCheckBit(&context->stripes[parity2].error, i)) // everything fine
2040             continue;
2041 
2042         if (num_errors == 0) { // parity error
2043             uint64_t addr;
2044 
2045             if (RtlCheckBit(&context->stripes[parity1].error, i)) {
2046                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2047                        &context->parity_scratch[i << Vcb->sector_shift],
2048                        Vcb->superblock.sector_size);
2049 
2050                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2051                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);
2052 
2053                 context->stripes[parity1].rewrite = true;
2054 
2055                 log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2056                 log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2057             }
2058 
2059             if (RtlCheckBit(&context->stripes[parity2].error, i)) {
2060                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2061                               &context->parity_scratch2[i << Vcb->sector_shift],
2062                               Vcb->superblock.sector_size);
2063 
2064                 bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2065                 addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);
2066 
2067                 context->stripes[parity2].rewrite = true;
2068 
2069                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2070                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2071             }
2072         } else if (num_errors == 1) {
2073             uint32_t len;
2074             uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);
2075             uint8_t* scratch;
2076 
2077             len = RtlCheckBit(&context->is_tree, bad_off1) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2078 
2079             scratch = ExAllocatePoolWithTag(PagedPool, len, ALLOC_TAG);
2080             if (!scratch) {
2081                 ERR("out of memory\n");
2082                 return;
2083             }
2084 
2085             RtlZeroMemory(scratch, len);
2086 
2087             do_xor(&context->parity_scratch[i << Vcb->sector_shift],
2088                    &context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2089 
2090             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2091 
2092             if (c->devices[parity2]->devobj) {
2093                 uint16_t stripe_num, bad_stripe_num = 0;
2094 
2095                 stripe_num = c->chunk_item->num_stripes - 3;
2096                 while (stripe != parity2) {
2097                     galois_double(scratch, len);
2098 
2099                     if (stripe != bad_stripe1)
2100                         do_xor(scratch, &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2101                     else
2102                         bad_stripe_num = stripe_num;
2103 
2104                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2105                     stripe_num--;
2106                 }
2107 
2108                 do_xor(scratch, &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2109 
2110                 if (bad_stripe_num != 0)
2111                     galois_divpower(scratch, (uint8_t)bad_stripe_num, len);
2112             }
2113 
2114             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2115                 uint8_t hash1[MAX_HASH_SIZE];
2116                 uint8_t hash2[MAX_HASH_SIZE];
2117                 tree_header *th1 = NULL, *th2 = NULL;
2118 
2119                 if (c->devices[parity1]->devobj) {
2120                     th1 = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];
2121                     get_tree_checksum(Vcb, th1, hash1);
2122                 }
2123 
2124                 if (c->devices[parity2]->devobj) {
2125                     th2 = (tree_header*)scratch;
2126                     get_tree_checksum(Vcb, th2, hash2);
2127                 }
2128 
2129                 if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, th1, Vcb->csum_size) == Vcb->csum_size && th1->address == addr) ||
2130                     (c->devices[parity2]->devobj && RtlCompareMemory(hash2, th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr)) {
2131                     if (!c->devices[parity1]->devobj || RtlCompareMemory(hash1, th1, Vcb->csum_size) != Vcb->csum_size || th1->address != addr) {
2132                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2133                                       scratch, Vcb->superblock.node_size);
2134 
2135                         if (c->devices[parity1]->devobj) {
2136                             // fix parity 1
2137 
2138                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2139 
2140                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2141                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2142                                           Vcb->superblock.node_size);
2143 
2144                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2145 
2146                             while (stripe != parity1) {
2147                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2148                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2149                                        Vcb->superblock.node_size);
2150 
2151                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2152                             }
2153 
2154                             context->stripes[parity1].rewrite = true;
2155 
2156                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2157                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2158                         }
2159                     } else {
2160                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2161                                       &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);
2162 
2163                         if (!c->devices[parity2]->devobj || RtlCompareMemory(hash2, th2, Vcb->csum_size) != Vcb->csum_size || th2->address != addr) {
2164                             // fix parity 2
2165                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2166 
2167                             if (c->devices[parity2]->devobj) {
2168                                 RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2169                                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2170                                               Vcb->superblock.node_size);
2171 
2172                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2173 
2174                                 while (stripe != parity2) {
2175                                     galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], Vcb->superblock.node_size);
2176 
2177                                     do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2178                                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2179                                            Vcb->superblock.node_size);
2180 
2181                                     stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2182                                 }
2183 
2184                                 context->stripes[parity2].rewrite = true;
2185 
2186                                 log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2187                                 log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2188                             }
2189                         }
2190                     }
2191 
2192                     context->stripes[bad_stripe1].rewrite = true;
2193 
2194                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);
2195 
2196                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2197                 } else
2198                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2199             } else {
2200                 uint8_t hash1[MAX_HASH_SIZE];
2201                 uint8_t hash2[MAX_HASH_SIZE];
2202 
2203                 if (c->devices[parity1]->devobj)
2204                     get_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], hash1);
2205 
2206                 if (c->devices[parity2]->devobj)
2207                     get_sector_csum(Vcb, scratch, hash2);
2208 
2209                 if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) ||
2210                     (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size)) {
2211                     if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
2212                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2213                                       scratch, Vcb->superblock.sector_size);
2214 
2215                         if (c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
2216                             // fix parity 1
2217 
2218                             stripe = (parity1 + 2) % c->chunk_item->num_stripes;
2219 
2220                             RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2221                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2222                                         Vcb->superblock.sector_size);
2223 
2224                             stripe = (stripe + 1) % c->chunk_item->num_stripes;
2225 
2226                             while (stripe != parity1) {
2227                                 do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2228                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2229                                        Vcb->superblock.sector_size);
2230 
2231                                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2232                             }
2233 
2234                             context->stripes[parity1].rewrite = true;
2235 
2236                             log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
2237                             log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2238                         }
2239                     } else {
2240                         RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2241                                       &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);
2242 
2243                         if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
2244                             // fix parity 2
2245                             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2246 
2247                             RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2248                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2249                                           Vcb->superblock.sector_size);
2250 
2251                             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2252 
2253                             while (stripe != parity2) {
2254                                 galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], Vcb->superblock.sector_size);
2255 
2256                                 do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2257                                        &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2258                                        Vcb->superblock.sector_size);
2259 
2260                                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2261                             }
2262 
2263                             context->stripes[parity2].rewrite = true;
2264 
2265                             log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
2266                             log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
2267                         }
2268                     }
2269 
2270                     context->stripes[bad_stripe1].rewrite = true;
2271 
2272                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2273                 } else
2274                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2275             }
2276 
2277             ExFreePool(scratch);
2278         } else if (num_errors == 2 && missing_devices == 0) {
2279             uint16_t x = 0, y = 0, k;
2280             uint64_t addr;
2281             uint32_t len = (RtlCheckBit(&context->is_tree, bad_off1) || RtlCheckBit(&context->is_tree, bad_off2)) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;
2282             uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
2283             uint32_t j;
2284 
2285             stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);
2286 
2287             // put qxy in parity_scratch
2288             // put pxy in parity_scratch2
2289 
2290             k = c->chunk_item->num_stripes - 3;
2291             if (stripe == bad_stripe1 || stripe == bad_stripe2) {
2292                 RtlZeroMemory(&context->parity_scratch[i << Vcb->sector_shift], len);
2293                 RtlZeroMemory(&context->parity_scratch2[i << Vcb->sector_shift], len);
2294 
2295                 if (stripe == bad_stripe1)
2296                     x = k;
2297                 else
2298                     y = k;
2299             } else {
2300                 RtlCopyMemory(&context->parity_scratch[i << Vcb->sector_shift],
2301                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2302                 RtlCopyMemory(&context->parity_scratch2[i << Vcb->sector_shift],
2303                               &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2304             }
2305 
2306             stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2307 
2308             k--;
2309             do {
2310                 galois_double(&context->parity_scratch[i << Vcb->sector_shift], len);
2311 
2312                 if (stripe != bad_stripe1 && stripe != bad_stripe2) {
2313                     do_xor(&context->parity_scratch[i << Vcb->sector_shift],
2314                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2315                     do_xor(&context->parity_scratch2[i << Vcb->sector_shift],
2316                            &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2317                 } else if (stripe == bad_stripe1)
2318                     x = k;
2319                 else if (stripe == bad_stripe2)
2320                     y = k;
2321 
2322                 stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
2323                 k--;
2324             } while (stripe != parity2);
2325 
2326             gyx = gpow2(y > x ? (y-x) : (255-x+y));
2327             gx = gpow2(255-x);
2328 
2329             denom = gdiv(1, gyx ^ 1);
2330             a = gmul(gyx, denom);
2331             b = gmul(gx, denom);
2332 
2333             p = &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)];
2334             q = &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)];
2335             pxy = &context->parity_scratch2[i << Vcb->sector_shift];
2336             qxy = &context->parity_scratch[i << Vcb->sector_shift];
2337 
2338             for (j = 0; j < len; j++) {
2339                 *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);
2340 
2341                 p++;
2342                 q++;
2343                 pxy++;
2344                 qxy++;
2345             }
2346 
2347             do_xor(&context->parity_scratch2[i << Vcb->sector_shift], &context->parity_scratch[i << Vcb->sector_shift], len);
2348             do_xor(&context->parity_scratch2[i << Vcb->sector_shift], &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
2349 
2350             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);
2351 
2352             if (RtlCheckBit(&context->is_tree, bad_off1)) {
2353                 tree_header* th = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];
2354 
2355                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
2356                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2357                                   &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);
2358 
2359                     context->stripes[bad_stripe1].rewrite = true;
2360 
2361                     RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);
2362 
2363                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
2364                 } else
2365                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
2366             } else {
2367                 if (check_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], (uint8_t*)context->csum + (Vcb->csum_size * bad_off1))) {
2368                     RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2369                                   &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);
2370 
2371                     context->stripes[bad_stripe1].rewrite = true;
2372 
2373                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
2374                 } else
2375                     log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
2376             }
2377 
2378             addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off2 << Vcb->sector_shift);
2379 
2380             if (RtlCheckBit(&context->is_tree, bad_off2)) {
2381                 tree_header* th = (tree_header*)&context->parity_scratch2[i << Vcb->sector_shift];
2382 
2383                 if (check_tree_checksum(Vcb, th) && th->address == addr) {
2384                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2385                                   &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.node_size);
2386 
2387                     context->stripes[bad_stripe2].rewrite = true;
2388 
2389                     RtlClearBits(&context->stripes[bad_stripe2].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);
2390 
2391                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, true, false);
2392                 } else
2393                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, false, false);
2394             } else {
2395                 if (check_sector_csum(Vcb, &context->parity_scratch2[i << Vcb->sector_shift], (uint8_t*)context->csum + (Vcb->csum_size * bad_off2))) {
2396                     RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
2397                                   &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.sector_size);
2398 
2399                     context->stripes[bad_stripe2].rewrite = true;
2400 
2401                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, true, false);
2402                 } else
2403                     log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, false, false);
2404             }
2405         } else {
2406             stripe = (parity2 + 1) % c->chunk_item->num_stripes;
2407             off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
2408 
2409             while (stripe != parity1) {
2410                 if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
2411                     if (RtlCheckBit(&context->stripes[stripe].error, i)) {
2412                         uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);
2413 
2414                         log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
2415                     }
2416                 }
2417 
2418                 off += sectors_per_stripe;
2419                 stripe = (stripe + 1) % c->chunk_item->num_stripes;
2420             }
2421         }
2422     }
2423 }
2424 
2425 static NTSTATUS scrub_chunk_raid56_stripe_run(device_extension* Vcb, chunk* c, uint64_t stripe_start, uint64_t stripe_end) {
2426     NTSTATUS Status;
2427     KEY searchkey;
2428     traverse_ptr tp;
2429     bool b;
2430     uint64_t run_start, run_end, full_stripe_len, stripe;
2431     uint32_t max_read, num_sectors;
2432     ULONG arrlen, *allocarr, *csumarr = NULL, *treearr, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2433     scrub_context_raid56 context;
2434     uint16_t i;
2435     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2436 
2437     TRACE("(%p, %p, %I64x, %I64x)\n", Vcb, c, stripe_start, stripe_end);
2438 
2439     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2440     run_start = c->offset + (stripe_start * full_stripe_len);
2441     run_end = c->offset + ((stripe_end + 1) * full_stripe_len);
2442 
2443     searchkey.obj_id = run_start;
2444     searchkey.obj_type = TYPE_METADATA_ITEM;
2445     searchkey.offset = 0xffffffffffffffff;
2446 
2447     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2448     if (!NT_SUCCESS(Status)) {
2449         ERR("find_item returned %08lx\n", Status);
2450         return Status;
2451     }
2452 
2453     num_sectors = (uint32_t)(((stripe_end - stripe_start + 1) * full_stripe_len) >> Vcb->sector_shift);
2454     arrlen = (ULONG)sector_align((num_sectors / 8) + 1, sizeof(ULONG));
2455 
2456     allocarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2457     if (!allocarr) {
2458         ERR("out of memory\n");
2459         return STATUS_INSUFFICIENT_RESOURCES;
2460     }
2461 
2462     treearr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2463     if (!treearr) {
2464         ERR("out of memory\n");
2465         ExFreePool(allocarr);
2466         return STATUS_INSUFFICIENT_RESOURCES;
2467     }
2468 
2469     RtlInitializeBitMap(&context.alloc, allocarr, num_sectors);
2470     RtlClearAllBits(&context.alloc);
2471 
2472     RtlInitializeBitMap(&context.is_tree, treearr, num_sectors);
2473     RtlClearAllBits(&context.is_tree);
2474 
2475     context.parity_scratch = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2476     if (!context.parity_scratch) {
2477         ERR("out of memory\n");
2478         ExFreePool(allocarr);
2479         ExFreePool(treearr);
2480         return STATUS_INSUFFICIENT_RESOURCES;
2481     }
2482 
2483     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2484         csumarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2485         if (!csumarr) {
2486             ERR("out of memory\n");
2487             ExFreePool(allocarr);
2488             ExFreePool(treearr);
2489             ExFreePool(context.parity_scratch);
2490             return STATUS_INSUFFICIENT_RESOURCES;
2491         }
2492 
2493         RtlInitializeBitMap(&context.has_csum, csumarr, num_sectors);
2494         RtlClearAllBits(&context.has_csum);
2495 
2496         context.csum = ExAllocatePoolWithTag(PagedPool, num_sectors * Vcb->csum_size, ALLOC_TAG);
2497         if (!context.csum) {
2498             ERR("out of memory\n");
2499             ExFreePool(allocarr);
2500             ExFreePool(treearr);
2501             ExFreePool(context.parity_scratch);
2502             ExFreePool(csumarr);
2503             return STATUS_INSUFFICIENT_RESOURCES;
2504         }
2505     }
2506 
2507     if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2508         context.parity_scratch2 = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2509         if (!context.parity_scratch2) {
2510             ERR("out of memory\n");
2511             ExFreePool(allocarr);
2512             ExFreePool(treearr);
2513             ExFreePool(context.parity_scratch);
2514 
2515             if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2516                 ExFreePool(csumarr);
2517                 ExFreePool(context.csum);
2518             }
2519 
2520             return STATUS_INSUFFICIENT_RESOURCES;
2521         }
2522     }
2523 
2524     do {
2525         traverse_ptr next_tp;
2526 
2527         if (tp.item->key.obj_id >= run_end)
2528             break;
2529 
2530         if (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM) {
2531             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2532 
2533             if (tp.item->key.obj_id + size > run_start) {
2534                 uint64_t extent_start = max(run_start, tp.item->key.obj_id);
2535                 uint64_t extent_end = min(tp.item->key.obj_id + size, run_end);
2536                 bool extent_is_tree = false;
2537 
2538                 RtlSetBits(&context.alloc, (ULONG)((extent_start - run_start) >> Vcb->sector_shift), (ULONG)((extent_end - extent_start) >> Vcb->sector_shift));
2539 
2540                 if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2541                     extent_is_tree = true;
2542                 else {
2543                     EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2544 
2545                     if (tp.item->size < sizeof(EXTENT_ITEM)) {
2546                         ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2547                         Status = STATUS_INTERNAL_ERROR;
2548                         goto end;
2549                     }
2550 
2551                     if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2552                         extent_is_tree = true;
2553                 }
2554 
2555                 if (extent_is_tree)
2556                     RtlSetBits(&context.is_tree, (ULONG)((extent_start - run_start) >> Vcb->sector_shift), (ULONG)((extent_end - extent_start) >> Vcb->sector_shift));
2557                 else if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2558                     traverse_ptr tp2;
2559                     bool b2;
2560 
2561                     searchkey.obj_id = EXTENT_CSUM_ID;
2562                     searchkey.obj_type = TYPE_EXTENT_CSUM;
2563                     searchkey.offset = extent_start;
2564 
2565                     Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
2566                     if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
2567                         ERR("find_item returned %08lx\n", Status);
2568                         goto end;
2569                     }
2570 
2571                     do {
2572                         traverse_ptr next_tp2;
2573 
2574                         if (tp2.item->key.offset >= extent_end)
2575                             break;
2576 
2577                         if (tp2.item->key.offset >= extent_start) {
2578                             uint64_t csum_start = max(extent_start, tp2.item->key.offset);
2579                             uint64_t csum_end = min(extent_end, tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size));
2580 
2581                             RtlSetBits(&context.has_csum, (ULONG)((csum_start - run_start) >> Vcb->sector_shift), (ULONG)((csum_end - csum_start) >> Vcb->sector_shift));
2582 
2583                             RtlCopyMemory((uint8_t*)context.csum + (((csum_start - run_start) * Vcb->csum_size) >> Vcb->sector_shift),
2584                                           tp2.item->data + (((csum_start - tp2.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
2585                                           (ULONG)(((csum_end - csum_start) * Vcb->csum_size) >> Vcb->sector_shift));
2586                         }
2587 
2588                         b2 = find_next_item(Vcb, &tp2, &next_tp2, false, NULL);
2589 
2590                         if (b2)
2591                             tp2 = next_tp2;
2592                     } while (b2);
2593                 }
2594             }
2595         }
2596 
2597         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2598 
2599         if (b)
2600             tp = next_tp;
2601     } while (b);
2602 
2603     context.stripes = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_context_raid56_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
2604     if (!context.stripes) {
2605         ERR("out of memory\n");
2606         Status = STATUS_INSUFFICIENT_RESOURCES;
2607         goto end;
2608     }
2609 
2610     max_read = (uint32_t)min(1048576 / c->chunk_item->stripe_length, stripe_end - stripe_start + 1); // only process 1 MB of data at a time
2611 
2612     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2613         context.stripes[i].buf = ExAllocatePoolWithTag(PagedPool, (ULONG)(max_read * c->chunk_item->stripe_length), ALLOC_TAG);
2614         if (!context.stripes[i].buf) {
2615             uint64_t j;
2616 
2617             ERR("out of memory\n");
2618 
2619             for (j = 0; j < i; j++) {
2620                 ExFreePool(context.stripes[j].buf);
2621             }
2622             ExFreePool(context.stripes);
2623 
2624             Status = STATUS_INSUFFICIENT_RESOURCES;
2625             goto end;
2626         }
2627 
2628         context.stripes[i].errorarr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(((c->chunk_item->stripe_length >> Vcb->sector_shift) / 8) + 1, sizeof(ULONG)), ALLOC_TAG);
2629         if (!context.stripes[i].errorarr) {
2630             uint64_t j;
2631 
2632             ERR("out of memory\n");
2633 
2634             ExFreePool(context.stripes[i].buf);
2635 
2636             for (j = 0; j < i; j++) {
2637                 ExFreePool(context.stripes[j].buf);
2638             }
2639             ExFreePool(context.stripes);
2640 
2641             Status = STATUS_INSUFFICIENT_RESOURCES;
2642             goto end;
2643         }
2644 
2645         RtlInitializeBitMap(&context.stripes[i].error, context.stripes[i].errorarr, (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift));
2646 
2647         context.stripes[i].context = &context;
2648         context.stripes[i].rewrite = false;
2649     }
2650 
2651     stripe = stripe_start;
2652 
2653     Status = STATUS_SUCCESS;
2654 
2655     chunk_lock_range(Vcb, c, run_start, run_end - run_start);
2656 
2657     do {
2658         ULONG read_stripes;
2659         uint16_t missing_devices = 0;
2660         bool need_wait = false;
2661 
2662         if (max_read < stripe_end + 1 - stripe)
2663             read_stripes = max_read;
2664         else
2665             read_stripes = (ULONG)(stripe_end + 1 - stripe);
2666 
2667         context.stripes_left = c->chunk_item->num_stripes;
2668 
2669         // read megabyte by megabyte
2670         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2671             if (c->devices[i]->devobj) {
2672                 PIO_STACK_LOCATION IrpSp;
2673 
2674                 context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
2675 
2676                 if (!context.stripes[i].Irp) {
2677                     ERR("IoAllocateIrp failed\n");
2678                     Status = STATUS_INSUFFICIENT_RESOURCES;
2679                     goto end3;
2680                 }
2681 
2682                 context.stripes[i].Irp->MdlAddress = NULL;
2683 
2684                 IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2685                 IrpSp->MajorFunction = IRP_MJ_READ;
2686                 IrpSp->FileObject = c->devices[i]->fileobj;
2687 
2688                 if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2689                     context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(read_stripes * c->chunk_item->stripe_length), ALLOC_TAG);
2690                     if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2691                         ERR("out of memory\n");
2692                         Status = STATUS_INSUFFICIENT_RESOURCES;
2693                         goto end3;
2694                     }
2695 
2696                     context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2697 
2698                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2699                 } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
2700                     context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, (ULONG)(read_stripes * c->chunk_item->stripe_length), false, false, NULL);
2701                     if (!context.stripes[i].Irp->MdlAddress) {
2702                         ERR("IoAllocateMdl failed\n");
2703                         Status = STATUS_INSUFFICIENT_RESOURCES;
2704                         goto end3;
2705                     }
2706 
2707                     Status = STATUS_SUCCESS;
2708 
2709                     _SEH2_TRY {
2710                         MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
2711                     } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2712                         Status = _SEH2_GetExceptionCode();
2713                     } _SEH2_END;
2714 
2715                     if (!NT_SUCCESS(Status)) {
2716                         ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2717                         IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2718                         goto end3;
2719                     }
2720                 } else
2721                     context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2722 
2723                 context.stripes[i].offset = stripe * c->chunk_item->stripe_length;
2724 
2725                 IrpSp->Parameters.Read.Length = (ULONG)(read_stripes * c->chunk_item->stripe_length);
2726                 IrpSp->Parameters.Read.ByteOffset.QuadPart = cis[i].offset + context.stripes[i].offset;
2727 
2728                 context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2729                 context.stripes[i].missing = false;
2730 
2731                 IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion_raid56, &context.stripes[i], true, true, true);
2732 
2733                 Vcb->scrub.data_scrubbed += read_stripes * c->chunk_item->stripe_length;
2734                 need_wait = true;
2735             } else {
2736                 context.stripes[i].Irp = NULL;
2737                 context.stripes[i].missing = true;
2738                 missing_devices++;
2739                 InterlockedDecrement(&context.stripes_left);
2740             }
2741         }
2742 
2743         if (c->chunk_item->type & BLOCK_FLAG_RAID5 && missing_devices > 1) {
2744             ERR("too many missing devices (%u, maximum 1)\n", missing_devices);
2745             Status = STATUS_UNEXPECTED_IO_ERROR;
2746             goto end3;
2747         } else if (c->chunk_item->type & BLOCK_FLAG_RAID6 && missing_devices > 2) {
2748             ERR("too many missing devices (%u, maximum 2)\n", missing_devices);
2749             Status = STATUS_UNEXPECTED_IO_ERROR;
2750             goto end3;
2751         }
2752 
2753         if (need_wait) {
2754             KeInitializeEvent(&context.Event, NotificationEvent, false);
2755 
2756             for (i = 0; i < c->chunk_item->num_stripes; i++) {
2757                 if (c->devices[i]->devobj)
2758                     IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
2759             }
2760 
2761             KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2762         }
2763 
2764         // return an error if any of the stripes returned an error
2765         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2766             if (!context.stripes[i].missing && !NT_SUCCESS(context.stripes[i].iosb.Status)) {
2767                 Status = context.stripes[i].iosb.Status;
2768                 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
2769                 goto end3;
2770             }
2771         }
2772 
2773         if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2774             for (i = 0; i < read_stripes; i++) {
2775                 scrub_raid6_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2776             }
2777         } else {
2778             for (i = 0; i < read_stripes; i++) {
2779                 scrub_raid5_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2780             }
2781         }
2782         stripe += read_stripes;
2783 
2784 end3:
2785         for (i = 0; i < c->chunk_item->num_stripes; i++) {
2786             if (context.stripes[i].Irp) {
2787                 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
2788                     MmUnlockPages(context.stripes[i].Irp->MdlAddress);
2789                     IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2790                 }
2791                 IoFreeIrp(context.stripes[i].Irp);
2792                 context.stripes[i].Irp = NULL;
2793 
2794                 if (context.stripes[i].rewrite) {
2795                     Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + context.stripes[i].offset,
2796                                              context.stripes[i].buf, (uint32_t)(read_stripes * c->chunk_item->stripe_length));
2797 
2798                     if (!NT_SUCCESS(Status)) {
2799                         ERR("write_data_phys returned %08lx\n", Status);
2800                         log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
2801                         goto end2;
2802                     }
2803                 }
2804             }
2805         }
2806 
2807         if (!NT_SUCCESS(Status))
2808             break;
2809     } while (stripe < stripe_end);
2810 
2811 end2:
2812     chunk_unlock_range(Vcb, c, run_start, run_end - run_start);
2813 
2814     for (i = 0; i < c->chunk_item->num_stripes; i++) {
2815         ExFreePool(context.stripes[i].buf);
2816         ExFreePool(context.stripes[i].errorarr);
2817     }
2818     ExFreePool(context.stripes);
2819 
2820 end:
2821     ExFreePool(treearr);
2822     ExFreePool(allocarr);
2823     ExFreePool(context.parity_scratch);
2824 
2825     if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2826         ExFreePool(context.parity_scratch2);
2827 
2828     if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2829         ExFreePool(csumarr);
2830         ExFreePool(context.csum);
2831     }
2832 
2833     return Status;
2834 }
2835 
2836 static NTSTATUS scrub_chunk_raid56(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2837     NTSTATUS Status;
2838     KEY searchkey;
2839     traverse_ptr tp;
2840     bool b;
2841     uint64_t full_stripe_len, stripe, stripe_start = 0, stripe_end = 0, total_data = 0;
2842     ULONG num_extents = 0, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2843 
2844     full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2845     stripe = (*offset - c->offset) / full_stripe_len;
2846 
2847     *offset = c->offset + (stripe * full_stripe_len);
2848 
2849     searchkey.obj_id = *offset;
2850     searchkey.obj_type = TYPE_METADATA_ITEM;
2851     searchkey.offset = 0xffffffffffffffff;
2852 
2853     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2854     if (!NT_SUCCESS(Status)) {
2855         ERR("find_item returned %08lx\n", Status);
2856         return Status;
2857     }
2858 
2859     *changed = false;
2860 
2861     do {
2862         traverse_ptr next_tp;
2863 
2864         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2865             break;
2866 
2867         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2868             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2869 
2870             TRACE("%I64x\n", tp.item->key.obj_id);
2871 
2872             if (size < Vcb->superblock.sector_size) {
2873                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
2874                 return STATUS_INTERNAL_ERROR;
2875             }
2876 
2877             stripe = (tp.item->key.obj_id - c->offset) / full_stripe_len;
2878 
2879             if (*changed) {
2880                 if (stripe > stripe_end + 1) {
2881                     Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2882                     if (!NT_SUCCESS(Status)) {
2883                         ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2884                         return Status;
2885                     }
2886 
2887                     stripe_start = stripe;
2888                 }
2889             } else
2890                 stripe_start = stripe;
2891 
2892             stripe_end = (tp.item->key.obj_id + size - 1 - c->offset) / full_stripe_len;
2893 
2894             *changed = true;
2895 
2896             total_data += size;
2897             num_extents++;
2898 
2899             // only do so much at a time
2900             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
2901                 break;
2902         }
2903 
2904         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2905 
2906         if (b)
2907             tp = next_tp;
2908     } while (b);
2909 
2910     if (*changed) {
2911         Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2912         if (!NT_SUCCESS(Status)) {
2913             ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2914             return Status;
2915         }
2916 
2917         *offset = c->offset + ((stripe_end + 1) * full_stripe_len);
2918     }
2919 
2920     return STATUS_SUCCESS;
2921 }
2922 
2923 static NTSTATUS scrub_chunk(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2924     NTSTATUS Status;
2925     KEY searchkey;
2926     traverse_ptr tp;
2927     bool b = false, tree_run = false;
2928     ULONG type, num_extents = 0;
2929     uint64_t total_data = 0, tree_run_start = 0, tree_run_end = 0;
2930 
2931     TRACE("chunk %I64x\n", c->offset);
2932 
2933     ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2934 
2935     if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2936         type = BLOCK_FLAG_DUPLICATE;
2937     else if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2938         type = BLOCK_FLAG_RAID0;
2939     else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2940         type = BLOCK_FLAG_DUPLICATE;
2941     else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2942         type = BLOCK_FLAG_RAID10;
2943     else if (c->chunk_item->type & BLOCK_FLAG_RAID5) {
2944         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2945         goto end;
2946     } else if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2947         Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2948         goto end;
2949     } else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2950         type = BLOCK_FLAG_DUPLICATE;
2951     else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2952         type = BLOCK_FLAG_DUPLICATE;
2953     else // SINGLE
2954         type = BLOCK_FLAG_DUPLICATE;
2955 
2956     searchkey.obj_id = *offset;
2957     searchkey.obj_type = TYPE_METADATA_ITEM;
2958     searchkey.offset = 0xffffffffffffffff;
2959 
2960     Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2961     if (!NT_SUCCESS(Status)) {
2962         ERR("error - find_item returned %08lx\n", Status);
2963         goto end;
2964     }
2965 
2966     do {
2967         traverse_ptr next_tp;
2968 
2969         if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2970             break;
2971 
2972         if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2973             uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2974             bool is_tree;
2975             void* csum = NULL;
2976             RTL_BITMAP bmp;
2977             ULONG* bmparr = NULL, bmplen;
2978 
2979             TRACE("%I64x\n", tp.item->key.obj_id);
2980 
2981             is_tree = false;
2982 
2983             if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2984                 is_tree = true;
2985             else {
2986                 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2987 
2988                 if (tp.item->size < sizeof(EXTENT_ITEM)) {
2989                     ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2990                     Status = STATUS_INTERNAL_ERROR;
2991                     goto end;
2992                 }
2993 
2994                 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2995                     is_tree = true;
2996             }
2997 
2998             if (size < Vcb->superblock.sector_size) {
2999                 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
3000                 Status = STATUS_INTERNAL_ERROR;
3001                 goto end;
3002             }
3003 
3004             // load csum
3005             if (!is_tree) {
3006                 traverse_ptr tp2;
3007 
3008                 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((Vcb->csum_size * size) >> Vcb->sector_shift), ALLOC_TAG);
3009                 if (!csum) {
3010                     ERR("out of memory\n");
3011                     Status = STATUS_INSUFFICIENT_RESOURCES;
3012                     goto end;
3013                 }
3014 
3015                 bmplen = (ULONG)(size >> Vcb->sector_shift);
3016 
3017                 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)(sector_align((bmplen >> 3) + 1, sizeof(ULONG))), ALLOC_TAG);
3018                 if (!bmparr) {
3019                     ERR("out of memory\n");
3020                     ExFreePool(csum);
3021                     Status = STATUS_INSUFFICIENT_RESOURCES;
3022                     goto end;
3023                 }
3024 
3025                 RtlInitializeBitMap(&bmp, bmparr, bmplen);
3026                 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
3027 
3028                 searchkey.obj_id = EXTENT_CSUM_ID;
3029                 searchkey.obj_type = TYPE_EXTENT_CSUM;
3030                 searchkey.offset = tp.item->key.obj_id;
3031 
3032                 Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
3033                 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
3034                     ERR("find_item returned %08lx\n", Status);
3035                     ExFreePool(csum);
3036                     ExFreePool(bmparr);
3037                     goto end;
3038                 }
3039 
3040                 if (Status != STATUS_NOT_FOUND) {
3041                     do {
3042                         traverse_ptr next_tp2;
3043 
3044                         if (tp2.item->key.obj_type == TYPE_EXTENT_CSUM) {
3045                             if (tp2.item->key.offset >= tp.item->key.obj_id + size)
3046                                 break;
3047                             else if (tp2.item->size >= Vcb->csum_size && tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size) >= tp.item->key.obj_id) {
3048                                 uint64_t cs = max(tp.item->key.obj_id, tp2.item->key.offset);
3049                                 uint64_t ce = min(tp.item->key.obj_id + size, tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size));
3050 
3051                                 RtlCopyMemory((uint8_t*)csum + (((cs - tp.item->key.obj_id) * Vcb->csum_size) >> Vcb->sector_shift),
3052                                               tp2.item->data + (((cs - tp2.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
3053                                               (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift));
3054 
3055                                 RtlClearBits(&bmp, (ULONG)((cs - tp.item->key.obj_id) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift));
3056 
3057                                 if (ce == tp.item->key.obj_id + size)
3058                                     break;
3059                             }
3060                         }
3061 
3062                         if (find_next_item(Vcb, &tp2, &next_tp2, false, NULL))
3063                             tp2 = next_tp2;
3064                         else
3065                             break;
3066                     } while (true);
3067                 }
3068             }
3069 
3070             if (tree_run) {
3071                 if (!is_tree || tp.item->key.obj_id > tree_run_end) {
3072                     Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3073                     if (!NT_SUCCESS(Status)) {
3074                         ERR("scrub_extent returned %08lx\n", Status);
3075                         goto end;
3076                     }
3077 
3078                     if (!is_tree)
3079                         tree_run = false;
3080                     else {
3081                         tree_run_start = tp.item->key.obj_id;
3082                         tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3083                     }
3084                 } else
3085                     tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3086             } else if (is_tree) {
3087                 tree_run = true;
3088                 tree_run_start = tp.item->key.obj_id;
3089                 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3090             }
3091 
3092             if (!is_tree) {
3093                 Status = scrub_data_extent(Vcb, c, tp.item->key.obj_id, type, csum, &bmp, bmplen);
3094                 if (!NT_SUCCESS(Status)) {
3095                     ERR("scrub_data_extent returned %08lx\n", Status);
3096                     ExFreePool(csum);
3097                     ExFreePool(bmparr);
3098                     goto end;
3099                 }
3100 
3101                 ExFreePool(csum);
3102                 ExFreePool(bmparr);
3103             }
3104 
3105             *offset = tp.item->key.obj_id + size;
3106             *changed = true;
3107 
3108             total_data += size;
3109             num_extents++;
3110 
3111             // only do so much at a time
3112             if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
3113                 break;
3114         }
3115 
3116         b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
3117 
3118         if (b)
3119             tp = next_tp;
3120     } while (b);
3121 
3122     if (tree_run) {
3123         Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3124         if (!NT_SUCCESS(Status)) {
3125             ERR("scrub_extent returned %08lx\n", Status);
3126             goto end;
3127         }
3128     }
3129 
3130     Status = STATUS_SUCCESS;
3131 
3132 end:
3133     ExReleaseResourceLite(&Vcb->tree_lock);
3134 
3135     return Status;
3136 }
3137 
3138 _Function_class_(KSTART_ROUTINE)
3139 static void __stdcall scrub_thread(void* context) {
3140     device_extension* Vcb = context;
3141     LIST_ENTRY chunks, *le;
3142     NTSTATUS Status;
3143     LARGE_INTEGER time;
3144 
3145     KeInitializeEvent(&Vcb->scrub.finished, NotificationEvent, false);
3146 
3147     InitializeListHead(&chunks);
3148 
3149     ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3150 
3151     if (Vcb->need_write && !Vcb->readonly)
3152         Status = do_write(Vcb, NULL);
3153     else
3154         Status = STATUS_SUCCESS;
3155 
3156     free_trees(Vcb);
3157 
3158     if (!NT_SUCCESS(Status)) {
3159         ExReleaseResourceLite(&Vcb->tree_lock);
3160         ERR("do_write returned %08lx\n", Status);
3161         Vcb->scrub.error = Status;
3162         goto end;
3163     }
3164 
3165     ExConvertExclusiveToSharedLite(&Vcb->tree_lock);
3166 
3167     ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3168 
3169     KeQuerySystemTime(&Vcb->scrub.start_time);
3170     Vcb->scrub.finish_time.QuadPart = 0;
3171     Vcb->scrub.resume_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3172     Vcb->scrub.duration.QuadPart = 0;
3173     Vcb->scrub.total_chunks = 0;
3174     Vcb->scrub.chunks_left = 0;
3175     Vcb->scrub.data_scrubbed = 0;
3176     Vcb->scrub.num_errors = 0;
3177 
3178     while (!IsListEmpty(&Vcb->scrub.errors)) {
3179         scrub_error* err = CONTAINING_RECORD(RemoveHeadList(&Vcb->scrub.errors), scrub_error, list_entry);
3180         ExFreePool(err);
3181     }
3182 
3183     ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3184 
3185     le = Vcb->chunks.Flink;
3186     while (le != &Vcb->chunks) {
3187         chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3188 
3189         acquire_chunk_lock(c, Vcb);
3190 
3191         if (!c->readonly) {
3192             InsertTailList(&chunks, &c->list_entry_balance);
3193             Vcb->scrub.total_chunks++;
3194             Vcb->scrub.chunks_left++;
3195         }
3196 
3197         release_chunk_lock(c, Vcb);
3198 
3199         le = le->Flink;
3200     }
3201 
3202     ExReleaseResourceLite(&Vcb->chunk_lock);
3203 
3204     ExReleaseResource(&Vcb->scrub.stats_lock);
3205 
3206     ExReleaseResourceLite(&Vcb->tree_lock);
3207 
3208     while (!IsListEmpty(&chunks)) {
3209         chunk* c = CONTAINING_RECORD(RemoveHeadList(&chunks), chunk, list_entry_balance);
3210         uint64_t offset = c->offset;
3211         bool changed;
3212 
3213         c->reloc = true;
3214 
3215         KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3216 
3217         if (!Vcb->scrub.stopping) {
3218             do {
3219                 changed = false;
3220 
3221                 Status = scrub_chunk(Vcb, c, &offset, &changed);
3222                 if (!NT_SUCCESS(Status)) {
3223                     ERR("scrub_chunk returned %08lx\n", Status);
3224                     Vcb->scrub.stopping = true;
3225                     Vcb->scrub.error = Status;
3226                     break;
3227                 }
3228 
3229                 if (offset == c->offset + c->chunk_item->size || Vcb->scrub.stopping)
3230                     break;
3231 
3232                 KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3233             } while (changed);
3234         }
3235 
3236         ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3237 
3238         if (!Vcb->scrub.stopping)
3239             Vcb->scrub.chunks_left--;
3240 
3241         if (IsListEmpty(&chunks))
3242             KeQuerySystemTime(&Vcb->scrub.finish_time);
3243 
3244         ExReleaseResource(&Vcb->scrub.stats_lock);
3245 
3246         c->reloc = false;
3247         c->list_entry_balance.Flink = NULL;
3248     }
3249 
3250     KeQuerySystemTime(&time);
3251     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3252 
3253 end:
3254     ZwClose(Vcb->scrub.thread);
3255     Vcb->scrub.thread = NULL;
3256 
3257     KeSetEvent(&Vcb->scrub.finished, 0, false);
3258 }
3259 
3260 NTSTATUS start_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3261     NTSTATUS Status;
3262     OBJECT_ATTRIBUTES oa;
3263 
3264     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3265         return STATUS_PRIVILEGE_NOT_HELD;
3266 
3267     if (Vcb->locked) {
3268         WARN("cannot start scrub while locked\n");
3269         return STATUS_DEVICE_NOT_READY;
3270     }
3271 
3272     if (Vcb->balance.thread) {
3273         WARN("cannot start scrub while balance running\n");
3274         return STATUS_DEVICE_NOT_READY;
3275     }
3276 
3277     if (Vcb->scrub.thread) {
3278         WARN("scrub already running\n");
3279         return STATUS_DEVICE_NOT_READY;
3280     }
3281 
3282     if (Vcb->readonly)
3283         return STATUS_MEDIA_WRITE_PROTECTED;
3284 
3285     Vcb->scrub.stopping = false;
3286     Vcb->scrub.paused = false;
3287     Vcb->scrub.error = STATUS_SUCCESS;
3288     KeInitializeEvent(&Vcb->scrub.event, NotificationEvent, !Vcb->scrub.paused);
3289 
3290     InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3291 
3292     Status = PsCreateSystemThread(&Vcb->scrub.thread, 0, &oa, NULL, NULL, scrub_thread, Vcb);
3293     if (!NT_SUCCESS(Status)) {
3294         ERR("PsCreateSystemThread returned %08lx\n", Status);
3295         return Status;
3296     }
3297 
3298     return STATUS_SUCCESS;
3299 }
3300 
3301 NTSTATUS query_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode, void* data, ULONG length) {
3302     btrfs_query_scrub* bqs = (btrfs_query_scrub*)data;
3303     ULONG len;
3304     NTSTATUS Status;
3305     LIST_ENTRY* le;
3306     btrfs_scrub_error* bse = NULL;
3307 
3308     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3309         return STATUS_PRIVILEGE_NOT_HELD;
3310 
3311     if (length < offsetof(btrfs_query_scrub, errors))
3312         return STATUS_BUFFER_TOO_SMALL;
3313 
3314     ExAcquireResourceSharedLite(&Vcb->scrub.stats_lock, true);
3315 
3316     if (Vcb->scrub.thread && Vcb->scrub.chunks_left > 0)
3317         bqs->status = Vcb->scrub.paused ? BTRFS_SCRUB_PAUSED : BTRFS_SCRUB_RUNNING;
3318     else
3319         bqs->status = BTRFS_SCRUB_STOPPED;
3320 
3321     bqs->start_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3322     bqs->finish_time.QuadPart = Vcb->scrub.finish_time.QuadPart;
3323     bqs->chunks_left = Vcb->scrub.chunks_left;
3324     bqs->total_chunks = Vcb->scrub.total_chunks;
3325     bqs->data_scrubbed = Vcb->scrub.data_scrubbed;
3326 
3327     bqs->duration = Vcb->scrub.duration.QuadPart;
3328 
3329     if (bqs->status == BTRFS_SCRUB_RUNNING) {
3330         LARGE_INTEGER time;
3331 
3332         KeQuerySystemTime(&time);
3333         bqs->duration += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3334     }
3335 
3336     bqs->error = Vcb->scrub.error;
3337 
3338     bqs->num_errors = Vcb->scrub.num_errors;
3339 
3340     len = length - offsetof(btrfs_query_scrub, errors);
3341 
3342     le = Vcb->scrub.errors.Flink;
3343     while (le != &Vcb->scrub.errors) {
3344         scrub_error* err = CONTAINING_RECORD(le, scrub_error, list_entry);
3345         ULONG errlen;
3346 
3347         if (err->is_metadata)
3348             errlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3349         else
3350             errlen = offsetof(btrfs_scrub_error, data.filename) + err->data.filename_length;
3351 
3352         if (len < errlen) {
3353             Status = STATUS_BUFFER_OVERFLOW;
3354             goto end;
3355         }
3356 
3357         if (!bse)
3358             bse = &bqs->errors;
3359         else {
3360             ULONG lastlen;
3361 
3362             if (bse->is_metadata)
3363                 lastlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3364             else
3365                 lastlen = offsetof(btrfs_scrub_error, data.filename) + bse->data.filename_length;
3366 
3367             bse->next_entry = lastlen;
3368             bse = (btrfs_scrub_error*)(((uint8_t*)bse) + lastlen);
3369         }
3370 
3371         bse->next_entry = 0;
3372         bse->address = err->address;
3373         bse->device = err->device;
3374         bse->recovered = err->recovered;
3375         bse->is_metadata = err->is_metadata;
3376         bse->parity = err->parity;
3377 
3378         if (err->is_metadata) {
3379             bse->metadata.root = err->metadata.root;
3380             bse->metadata.level = err->metadata.level;
3381             bse->metadata.firstitem = err->metadata.firstitem;
3382         } else {
3383             bse->data.subvol = err->data.subvol;
3384             bse->data.offset = err->data.offset;
3385             bse->data.filename_length = err->data.filename_length;
3386             RtlCopyMemory(bse->data.filename, err->data.filename, err->data.filename_length);
3387         }
3388 
3389         len -= errlen;
3390         le = le->Flink;
3391     }
3392 
3393     Status = STATUS_SUCCESS;
3394 
3395 end:
3396     ExReleaseResourceLite(&Vcb->scrub.stats_lock);
3397 
3398     return Status;
3399 }
3400 
3401 NTSTATUS pause_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3402     LARGE_INTEGER time;
3403 
3404     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3405         return STATUS_PRIVILEGE_NOT_HELD;
3406 
3407     if (!Vcb->scrub.thread)
3408         return STATUS_DEVICE_NOT_READY;
3409 
3410     if (Vcb->scrub.paused)
3411         return STATUS_DEVICE_NOT_READY;
3412 
3413     Vcb->scrub.paused = true;
3414     KeClearEvent(&Vcb->scrub.event);
3415 
3416     KeQuerySystemTime(&time);
3417     Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3418 
3419     return STATUS_SUCCESS;
3420 }
3421 
3422 NTSTATUS resume_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3423     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3424         return STATUS_PRIVILEGE_NOT_HELD;
3425 
3426     if (!Vcb->scrub.thread)
3427         return STATUS_DEVICE_NOT_READY;
3428 
3429     if (!Vcb->scrub.paused)
3430         return STATUS_DEVICE_NOT_READY;
3431 
3432     Vcb->scrub.paused = false;
3433     KeSetEvent(&Vcb->scrub.event, 0, false);
3434 
3435     KeQuerySystemTime(&Vcb->scrub.resume_time);
3436 
3437     return STATUS_SUCCESS;
3438 }
3439 
3440 NTSTATUS stop_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3441     if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3442         return STATUS_PRIVILEGE_NOT_HELD;
3443 
3444     if (!Vcb->scrub.thread)
3445         return STATUS_DEVICE_NOT_READY;
3446 
3447     Vcb->scrub.paused = false;
3448     Vcb->scrub.stopping = true;
3449     KeSetEvent(&Vcb->scrub.event, 0, false);
3450 
3451     return STATUS_SUCCESS;
3452 }
3453