1 /* Copyright (c) Mark Harmstone 2017
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19
// Size constant for the scrub code: 0x100000 bytes.
// NOTE(review): presumably the granularity at which a chunk is walked during a scrub -
// confirm against the users of this macro, which are not in this part of the file.
#define SCRUB_UNIT 0x100000 // 1 MB

// Forward declaration, so that scrub_context_stripe can hold a pointer back to its owning context.
struct _scrub_context;
23
// State for the read of a single stripe during a scrub.
typedef struct {
    struct _scrub_context* context; // owning scrub_context; scrub_read_completion follows this pointer
    PIRP Irp;                       // IRP for this stripe's read (NOTE(review): set up outside this part of the file)
    uint64_t start;                 // start position of the read; scrub_extent_raid0 takes it modulo the chunk's stripe length
    uint32_t length;                // number of bytes of buf that are checked
    IO_STATUS_BLOCK iosb;           // copy of Irp->IoStatus, stored by scrub_read_completion
    uint8_t* buf;                   // the data that was read for this stripe
    bool csum_error;                // set when this stripe's data failed verification
    void* bad_csums;                // checksums recomputed from buf; allocated by scrub_extent_dup for stripes with csum_error set
} scrub_context_stripe;
34
// Shared state for one batch of stripe reads.
typedef struct _scrub_context {
    KEVENT Event;                  // set by scrub_read_completion when stripes_left drops to zero
    scrub_context_stripe* stripes; // per-stripe state, indexed by stripe number
    LONG stripes_left;             // reads still outstanding; decremented with InterlockedDecrement on completion
} scrub_context;
40
// One component of the filename that log_file_checksum_error reconstructs.
typedef struct {
    ANSI_STRING name;      // component name; Buffer points into tree item data and is not owned by this struct
    bool orig_subvol;      // true if the component was found in the subvolume the lookup started in
    LIST_ENTRY list_entry; // link in the list of components (added leaf first, so the list is walked backwards to build the path)
} path_part;
46
log_file_checksum_error(device_extension * Vcb,uint64_t addr,uint64_t devid,uint64_t subvol,uint64_t inode,uint64_t offset)47 static void log_file_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t subvol, uint64_t inode, uint64_t offset) {
48 LIST_ENTRY *le, parts;
49 root* r = NULL;
50 KEY searchkey;
51 traverse_ptr tp;
52 uint64_t dir;
53 bool orig_subvol = true, not_in_tree = false;
54 ANSI_STRING fn;
55 scrub_error* err;
56 NTSTATUS Status;
57 ULONG utf16len;
58
59 le = Vcb->roots.Flink;
60 while (le != &Vcb->roots) {
61 root* r2 = CONTAINING_RECORD(le, root, list_entry);
62
63 if (r2->id == subvol) {
64 r = r2;
65 break;
66 }
67
68 le = le->Flink;
69 }
70
71 if (!r) {
72 ERR("could not find subvol %I64x\n", subvol);
73 return;
74 }
75
76 InitializeListHead(&parts);
77
78 dir = inode;
79
80 while (true) {
81 if (dir == r->root_item.objid) {
82 if (r == Vcb->root_fileref->fcb->subvol)
83 break;
84
85 searchkey.obj_id = r->id;
86 searchkey.obj_type = TYPE_ROOT_BACKREF;
87 searchkey.offset = 0xffffffffffffffff;
88
89 Status = find_item(Vcb, Vcb->root_root, &tp, &searchkey, false, NULL);
90 if (!NT_SUCCESS(Status)) {
91 ERR("find_item returned %08lx\n", Status);
92 goto end;
93 }
94
95 if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == searchkey.obj_type) {
96 ROOT_REF* rr = (ROOT_REF*)tp.item->data;
97 path_part* pp;
98
99 if (tp.item->size < sizeof(ROOT_REF)) {
100 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(ROOT_REF));
101 goto end;
102 }
103
104 if (tp.item->size < offsetof(ROOT_REF, name[0]) + rr->n) {
105 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
106 tp.item->size, offsetof(ROOT_REF, name[0]) + rr->n);
107 goto end;
108 }
109
110 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
111 if (!pp) {
112 ERR("out of memory\n");
113 goto end;
114 }
115
116 pp->name.Buffer = rr->name;
117 pp->name.Length = pp->name.MaximumLength = rr->n;
118 pp->orig_subvol = false;
119
120 InsertTailList(&parts, &pp->list_entry);
121
122 r = NULL;
123
124 le = Vcb->roots.Flink;
125 while (le != &Vcb->roots) {
126 root* r2 = CONTAINING_RECORD(le, root, list_entry);
127
128 if (r2->id == tp.item->key.offset) {
129 r = r2;
130 break;
131 }
132
133 le = le->Flink;
134 }
135
136 if (!r) {
137 ERR("could not find subvol %I64x\n", tp.item->key.offset);
138 goto end;
139 }
140
141 dir = rr->dir;
142 orig_subvol = false;
143 } else {
144 not_in_tree = true;
145 break;
146 }
147 } else {
148 searchkey.obj_id = dir;
149 searchkey.obj_type = TYPE_INODE_EXTREF;
150 searchkey.offset = 0xffffffffffffffff;
151
152 Status = find_item(Vcb, r, &tp, &searchkey, false, NULL);
153 if (!NT_SUCCESS(Status)) {
154 ERR("find_item returned %08lx\n", Status);
155 goto end;
156 }
157
158 if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_REF) {
159 INODE_REF* ir = (INODE_REF*)tp.item->data;
160 path_part* pp;
161
162 if (tp.item->size < sizeof(INODE_REF)) {
163 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(INODE_REF));
164 goto end;
165 }
166
167 if (tp.item->size < offsetof(INODE_REF, name[0]) + ir->n) {
168 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
169 tp.item->size, offsetof(INODE_REF, name[0]) + ir->n);
170 goto end;
171 }
172
173 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
174 if (!pp) {
175 ERR("out of memory\n");
176 goto end;
177 }
178
179 pp->name.Buffer = ir->name;
180 pp->name.Length = pp->name.MaximumLength = ir->n;
181 pp->orig_subvol = orig_subvol;
182
183 InsertTailList(&parts, &pp->list_entry);
184
185 if (dir == tp.item->key.offset)
186 break;
187
188 dir = tp.item->key.offset;
189 } else if (tp.item->key.obj_id == searchkey.obj_id && tp.item->key.obj_type == TYPE_INODE_EXTREF) {
190 INODE_EXTREF* ier = (INODE_EXTREF*)tp.item->data;
191 path_part* pp;
192
193 if (tp.item->size < sizeof(INODE_EXTREF)) {
194 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
195 tp.item->size, sizeof(INODE_EXTREF));
196 goto end;
197 }
198
199 if (tp.item->size < offsetof(INODE_EXTREF, name[0]) + ier->n) {
200 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
201 tp.item->size, offsetof(INODE_EXTREF, name[0]) + ier->n);
202 goto end;
203 }
204
205 pp = ExAllocatePoolWithTag(PagedPool, sizeof(path_part), ALLOC_TAG);
206 if (!pp) {
207 ERR("out of memory\n");
208 goto end;
209 }
210
211 pp->name.Buffer = ier->name;
212 pp->name.Length = pp->name.MaximumLength = ier->n;
213 pp->orig_subvol = orig_subvol;
214
215 InsertTailList(&parts, &pp->list_entry);
216
217 if (dir == ier->dir)
218 break;
219
220 dir = ier->dir;
221 } else {
222 ERR("could not find INODE_REF for inode %I64x in subvol %I64x\n", dir, r->id);
223 goto end;
224 }
225 }
226 }
227
228 fn.MaximumLength = 0;
229
230 if (not_in_tree) {
231 le = parts.Blink;
232 while (le != &parts) {
233 path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
234 LIST_ENTRY* le2 = le->Blink;
235
236 if (pp->orig_subvol)
237 break;
238
239 RemoveTailList(&parts);
240 ExFreePool(pp);
241
242 le = le2;
243 }
244 }
245
246 le = parts.Flink;
247 while (le != &parts) {
248 path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
249
250 fn.MaximumLength += pp->name.Length + 1;
251
252 le = le->Flink;
253 }
254
255 fn.Buffer = ExAllocatePoolWithTag(PagedPool, fn.MaximumLength, ALLOC_TAG);
256 if (!fn.Buffer) {
257 ERR("out of memory\n");
258 goto end;
259 }
260
261 fn.Length = 0;
262
263 le = parts.Blink;
264 while (le != &parts) {
265 path_part* pp = CONTAINING_RECORD(le, path_part, list_entry);
266
267 fn.Buffer[fn.Length] = '\\';
268 fn.Length++;
269
270 RtlCopyMemory(&fn.Buffer[fn.Length], pp->name.Buffer, pp->name.Length);
271 fn.Length += pp->name.Length;
272
273 le = le->Blink;
274 }
275
276 if (not_in_tree)
277 ERR("subvol %I64x, %.*s, offset %I64x\n", subvol, fn.Length, fn.Buffer, offset);
278 else
279 ERR("%.*s, offset %I64x\n", fn.Length, fn.Buffer, offset);
280
281 Status = utf8_to_utf16(NULL, 0, &utf16len, fn.Buffer, fn.Length);
282 if (!NT_SUCCESS(Status)) {
283 ERR("utf8_to_utf16 1 returned %08lx\n", Status);
284 ExFreePool(fn.Buffer);
285 goto end;
286 }
287
288 err = ExAllocatePoolWithTag(PagedPool, offsetof(scrub_error, data.filename[0]) + utf16len, ALLOC_TAG);
289 if (!err) {
290 ERR("out of memory\n");
291 ExFreePool(fn.Buffer);
292 goto end;
293 }
294
295 err->address = addr;
296 err->device = devid;
297 err->recovered = false;
298 err->is_metadata = false;
299 err->parity = false;
300
301 err->data.subvol = not_in_tree ? subvol : 0;
302 err->data.offset = offset;
303 err->data.filename_length = (uint16_t)utf16len;
304
305 Status = utf8_to_utf16(err->data.filename, utf16len, &utf16len, fn.Buffer, fn.Length);
306 if (!NT_SUCCESS(Status)) {
307 ERR("utf8_to_utf16 2 returned %08lx\n", Status);
308 ExFreePool(fn.Buffer);
309 ExFreePool(err);
310 goto end;
311 }
312
313 ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
314
315 Vcb->scrub.num_errors++;
316 InsertTailList(&Vcb->scrub.errors, &err->list_entry);
317
318 ExReleaseResourceLite(&Vcb->scrub.stats_lock);
319
320 ExFreePool(fn.Buffer);
321
322 end:
323 while (!IsListEmpty(&parts)) {
324 path_part* pp = CONTAINING_RECORD(RemoveHeadList(&parts), path_part, list_entry);
325
326 ExFreePool(pp);
327 }
328 }
329
log_file_checksum_error_shared(device_extension * Vcb,uint64_t treeaddr,uint64_t addr,uint64_t devid,uint64_t extent)330 static void log_file_checksum_error_shared(device_extension* Vcb, uint64_t treeaddr, uint64_t addr, uint64_t devid, uint64_t extent) {
331 tree_header* tree;
332 NTSTATUS Status;
333 leaf_node* ln;
334 ULONG i;
335
336 tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
337 if (!tree) {
338 ERR("out of memory\n");
339 return;
340 }
341
342 Status = read_data(Vcb, treeaddr, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
343 if (!NT_SUCCESS(Status)) {
344 ERR("read_data returned %08lx\n", Status);
345 goto end;
346 }
347
348 if (tree->level != 0) {
349 ERR("tree level was %x, expected 0\n", tree->level);
350 goto end;
351 }
352
353 ln = (leaf_node*)&tree[1];
354
355 for (i = 0; i < tree->num_items; i++) {
356 if (ln[i].key.obj_type == TYPE_EXTENT_DATA && ln[i].size >= sizeof(EXTENT_DATA) - 1 + sizeof(EXTENT_DATA2)) {
357 EXTENT_DATA* ed = (EXTENT_DATA*)((uint8_t*)tree + sizeof(tree_header) + ln[i].offset);
358 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
359
360 if (ed->type == EXTENT_TYPE_REGULAR && ed2->size != 0 && ed2->address == addr)
361 log_file_checksum_error(Vcb, addr, devid, tree->tree_id, ln[i].key.obj_id, ln[i].key.offset + addr - extent);
362 }
363 }
364
365 end:
366 ExFreePool(tree);
367 }
368
log_tree_checksum_error(device_extension * Vcb,uint64_t addr,uint64_t devid,uint64_t root,uint8_t level,KEY * firstitem)369 static void log_tree_checksum_error(device_extension* Vcb, uint64_t addr, uint64_t devid, uint64_t root, uint8_t level, KEY* firstitem) {
370 scrub_error* err;
371
372 err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
373 if (!err) {
374 ERR("out of memory\n");
375 return;
376 }
377
378 err->address = addr;
379 err->device = devid;
380 err->recovered = false;
381 err->is_metadata = true;
382 err->parity = false;
383
384 err->metadata.root = root;
385 err->metadata.level = level;
386
387 if (firstitem) {
388 ERR("root %I64x, level %u, first item (%I64x,%x,%I64x)\n", root, level, firstitem->obj_id,
389 firstitem->obj_type, firstitem->offset);
390
391 err->metadata.firstitem = *firstitem;
392 } else {
393 ERR("root %I64x, level %u\n", root, level);
394
395 RtlZeroMemory(&err->metadata.firstitem, sizeof(KEY));
396 }
397
398 ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
399
400 Vcb->scrub.num_errors++;
401 InsertTailList(&Vcb->scrub.errors, &err->list_entry);
402
403 ExReleaseResourceLite(&Vcb->scrub.stats_lock);
404 }
405
log_tree_checksum_error_shared(device_extension * Vcb,uint64_t offset,uint64_t address,uint64_t devid)406 static void log_tree_checksum_error_shared(device_extension* Vcb, uint64_t offset, uint64_t address, uint64_t devid) {
407 tree_header* tree;
408 NTSTATUS Status;
409 internal_node* in;
410 ULONG i;
411
412 tree = ExAllocatePoolWithTag(PagedPool, Vcb->superblock.node_size, ALLOC_TAG);
413 if (!tree) {
414 ERR("out of memory\n");
415 return;
416 }
417
418 Status = read_data(Vcb, offset, Vcb->superblock.node_size, NULL, true, (uint8_t*)tree, NULL, NULL, NULL, 0, false, NormalPagePriority);
419 if (!NT_SUCCESS(Status)) {
420 ERR("read_data returned %08lx\n", Status);
421 goto end;
422 }
423
424 if (tree->level == 0) {
425 ERR("tree level was 0\n");
426 goto end;
427 }
428
429 in = (internal_node*)&tree[1];
430
431 for (i = 0; i < tree->num_items; i++) {
432 if (in[i].address == address) {
433 log_tree_checksum_error(Vcb, address, devid, tree->tree_id, tree->level - 1, &in[i].key);
434 break;
435 }
436 }
437
438 end:
439 ExFreePool(tree);
440 }
441
log_unrecoverable_error(device_extension * Vcb,uint64_t address,uint64_t devid)442 static void log_unrecoverable_error(device_extension* Vcb, uint64_t address, uint64_t devid) {
443 KEY searchkey;
444 traverse_ptr tp;
445 NTSTATUS Status;
446 EXTENT_ITEM* ei;
447 EXTENT_ITEM2* ei2 = NULL;
448 uint8_t* ptr;
449 ULONG len;
450 uint64_t rc;
451
452 // FIXME - still log even if rest of this function fails
453
454 searchkey.obj_id = address;
455 searchkey.obj_type = TYPE_METADATA_ITEM;
456 searchkey.offset = 0xffffffffffffffff;
457
458 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
459 if (!NT_SUCCESS(Status)) {
460 ERR("find_item returned %08lx\n", Status);
461 return;
462 }
463
464 if ((tp.item->key.obj_type != TYPE_EXTENT_ITEM && tp.item->key.obj_type != TYPE_METADATA_ITEM) ||
465 tp.item->key.obj_id >= address + Vcb->superblock.sector_size ||
466 (tp.item->key.obj_type == TYPE_EXTENT_ITEM && tp.item->key.obj_id + tp.item->key.offset <= address) ||
467 (tp.item->key.obj_type == TYPE_METADATA_ITEM && tp.item->key.obj_id + Vcb->superblock.node_size <= address)
468 )
469 return;
470
471 if (tp.item->size < sizeof(EXTENT_ITEM)) {
472 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
473 return;
474 }
475
476 ei = (EXTENT_ITEM*)tp.item->data;
477 ptr = (uint8_t*)&ei[1];
478 len = tp.item->size - sizeof(EXTENT_ITEM);
479
480 if (tp.item->key.obj_id == TYPE_EXTENT_ITEM && ei->flags & EXTENT_ITEM_TREE_BLOCK) {
481 if (tp.item->size < sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2)) {
482 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
483 tp.item->size, sizeof(EXTENT_ITEM) + sizeof(EXTENT_ITEM2));
484 return;
485 }
486
487 ei2 = (EXTENT_ITEM2*)ptr;
488
489 ptr += sizeof(EXTENT_ITEM2);
490 len -= sizeof(EXTENT_ITEM2);
491 }
492
493 rc = 0;
494
495 while (len > 0) {
496 uint8_t type = *ptr;
497
498 ptr++;
499 len--;
500
501 if (type == TYPE_TREE_BLOCK_REF) {
502 TREE_BLOCK_REF* tbr;
503
504 if (len < sizeof(TREE_BLOCK_REF)) {
505 ERR("TREE_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(TREE_BLOCK_REF), len);
506 break;
507 }
508
509 tbr = (TREE_BLOCK_REF*)ptr;
510
511 log_tree_checksum_error(Vcb, address, devid, tbr->offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
512
513 rc++;
514
515 ptr += sizeof(TREE_BLOCK_REF);
516 len -= sizeof(TREE_BLOCK_REF);
517 } else if (type == TYPE_EXTENT_DATA_REF) {
518 EXTENT_DATA_REF* edr;
519
520 if (len < sizeof(EXTENT_DATA_REF)) {
521 ERR("EXTENT_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(EXTENT_DATA_REF), len);
522 break;
523 }
524
525 edr = (EXTENT_DATA_REF*)ptr;
526
527 log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
528
529 rc += edr->count;
530
531 ptr += sizeof(EXTENT_DATA_REF);
532 len -= sizeof(EXTENT_DATA_REF);
533 } else if (type == TYPE_SHARED_BLOCK_REF) {
534 SHARED_BLOCK_REF* sbr;
535
536 if (len < sizeof(SHARED_BLOCK_REF)) {
537 ERR("SHARED_BLOCK_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_BLOCK_REF), len);
538 break;
539 }
540
541 sbr = (SHARED_BLOCK_REF*)ptr;
542
543 log_tree_checksum_error_shared(Vcb, sbr->offset, address, devid);
544
545 rc++;
546
547 ptr += sizeof(SHARED_BLOCK_REF);
548 len -= sizeof(SHARED_BLOCK_REF);
549 } else if (type == TYPE_SHARED_DATA_REF) {
550 SHARED_DATA_REF* sdr;
551
552 if (len < sizeof(SHARED_DATA_REF)) {
553 ERR("SHARED_DATA_REF takes up %Iu bytes, but only %lu remaining\n", sizeof(SHARED_DATA_REF), len);
554 break;
555 }
556
557 sdr = (SHARED_DATA_REF*)ptr;
558
559 log_file_checksum_error_shared(Vcb, sdr->offset, address, devid, tp.item->key.obj_id);
560
561 rc += sdr->count;
562
563 ptr += sizeof(SHARED_DATA_REF);
564 len -= sizeof(SHARED_DATA_REF);
565 } else {
566 ERR("unknown extent type %x\n", type);
567 break;
568 }
569 }
570
571 if (rc < ei->refcount) {
572 do {
573 traverse_ptr next_tp;
574
575 if (find_next_item(Vcb, &tp, &next_tp, false, NULL))
576 tp = next_tp;
577 else
578 break;
579
580 if (tp.item->key.obj_id == address) {
581 if (tp.item->key.obj_type == TYPE_TREE_BLOCK_REF)
582 log_tree_checksum_error(Vcb, address, devid, tp.item->key.offset, ei2 ? ei2->level : (uint8_t)tp.item->key.offset, ei2 ? &ei2->firstitem : NULL);
583 else if (tp.item->key.obj_type == TYPE_EXTENT_DATA_REF) {
584 EXTENT_DATA_REF* edr;
585
586 if (tp.item->size < sizeof(EXTENT_DATA_REF)) {
587 ERR("(%I64x,%x,%I64x) was %u bytes, expected %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset,
588 tp.item->size, sizeof(EXTENT_DATA_REF));
589 break;
590 }
591
592 edr = (EXTENT_DATA_REF*)tp.item->data;
593
594 log_file_checksum_error(Vcb, address, devid, edr->root, edr->objid, edr->offset + address - tp.item->key.obj_id);
595 } else if (tp.item->key.obj_type == TYPE_SHARED_BLOCK_REF)
596 log_tree_checksum_error_shared(Vcb, tp.item->key.offset, address, devid);
597 else if (tp.item->key.obj_type == TYPE_SHARED_DATA_REF)
598 log_file_checksum_error_shared(Vcb, tp.item->key.offset, address, devid, tp.item->key.obj_id);
599 } else
600 break;
601 } while (true);
602 }
603 }
604
log_error(device_extension * Vcb,uint64_t addr,uint64_t devid,bool metadata,bool recoverable,bool parity)605 static void log_error(device_extension* Vcb, uint64_t addr, uint64_t devid, bool metadata, bool recoverable, bool parity) {
606 if (recoverable) {
607 scrub_error* err;
608
609 if (parity) {
610 ERR("recovering from parity error at %I64x on device %I64x\n", addr, devid);
611 } else {
612 if (metadata)
613 ERR("recovering from metadata checksum error at %I64x on device %I64x\n", addr, devid);
614 else
615 ERR("recovering from data checksum error at %I64x on device %I64x\n", addr, devid);
616 }
617
618 err = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_error), ALLOC_TAG);
619 if (!err) {
620 ERR("out of memory\n");
621 return;
622 }
623
624 err->address = addr;
625 err->device = devid;
626 err->recovered = true;
627 err->is_metadata = metadata;
628 err->parity = parity;
629
630 if (metadata)
631 RtlZeroMemory(&err->metadata, sizeof(err->metadata));
632 else
633 RtlZeroMemory(&err->data, sizeof(err->data));
634
635 ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
636
637 Vcb->scrub.num_errors++;
638 InsertTailList(&Vcb->scrub.errors, &err->list_entry);
639
640 ExReleaseResourceLite(&Vcb->scrub.stats_lock);
641 } else {
642 if (metadata)
643 ERR("unrecoverable metadata checksum error at %I64x\n", addr);
644 else
645 ERR("unrecoverable data checksum error at %I64x\n", addr);
646
647 log_unrecoverable_error(Vcb, addr, devid);
648 }
649 }
650
_Function_class_(IO_COMPLETION_ROUTINE)651 _Function_class_(IO_COMPLETION_ROUTINE)
652 static NTSTATUS __stdcall scrub_read_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
653 scrub_context_stripe* stripe = conptr;
654 scrub_context* context = (scrub_context*)stripe->context;
655 ULONG left = InterlockedDecrement(&context->stripes_left);
656
657 UNUSED(DeviceObject);
658
659 stripe->iosb = Irp->IoStatus;
660
661 if (left == 0)
662 KeSetEvent(&context->Event, 0, false);
663
664 return STATUS_MORE_PROCESSING_REQUIRED;
665 }
666
scrub_extent_dup(device_extension * Vcb,chunk * c,uint64_t offset,void * csum,scrub_context * context)667 static NTSTATUS scrub_extent_dup(device_extension* Vcb, chunk* c, uint64_t offset, void* csum, scrub_context* context) {
668 NTSTATUS Status;
669 bool csum_error = false;
670 ULONG i;
671 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
672 uint16_t present_devices = 0;
673
674 if (csum) {
675 ULONG good_stripe = 0xffffffff;
676
677 for (i = 0; i < c->chunk_item->num_stripes; i++) {
678 if (c->devices[i]->devobj) {
679 present_devices++;
680
681 // if first stripe is okay, we only need to check that the others are identical to it
682 if (good_stripe != 0xffffffff) {
683 if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
684 context->stripes[good_stripe].length) != context->stripes[i].length) {
685 context->stripes[i].csum_error = true;
686 csum_error = true;
687 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
688 }
689 } else {
690 Status = check_csum(Vcb, context->stripes[i].buf, context->stripes[i].length >> Vcb->sector_shift, csum);
691 if (Status == STATUS_CRC_ERROR) {
692 context->stripes[i].csum_error = true;
693 csum_error = true;
694 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
695 } else if (!NT_SUCCESS(Status)) {
696 ERR("check_csum returned %08lx\n", Status);
697 return Status;
698 } else
699 good_stripe = i;
700 }
701 }
702 }
703 } else {
704 ULONG good_stripe = 0xffffffff;
705
706 for (i = 0; i < c->chunk_item->num_stripes; i++) {
707 ULONG j;
708
709 if (c->devices[i]->devobj) {
710 // if first stripe is okay, we only need to check that the others are identical to it
711 if (good_stripe != 0xffffffff) {
712 if (RtlCompareMemory(context->stripes[i].buf, context->stripes[good_stripe].buf,
713 context->stripes[good_stripe].length) != context->stripes[i].length) {
714 context->stripes[i].csum_error = true;
715 csum_error = true;
716 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
717 }
718 } else {
719 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
720 tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
721
722 if (!check_tree_checksum(Vcb, th) || th->address != offset + UInt32x32To64(j, Vcb->superblock.node_size)) {
723 context->stripes[i].csum_error = true;
724 csum_error = true;
725 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
726 }
727 }
728
729 if (!context->stripes[i].csum_error)
730 good_stripe = i;
731 }
732 }
733 }
734 }
735
736 if (!csum_error)
737 return STATUS_SUCCESS;
738
739 // handle checksum error
740
741 for (i = 0; i < c->chunk_item->num_stripes; i++) {
742 if (context->stripes[i].csum_error) {
743 if (csum) {
744 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[i].length * Vcb->csum_size) >> Vcb->sector_shift, ALLOC_TAG);
745 if (!context->stripes[i].bad_csums) {
746 ERR("out of memory\n");
747 return STATUS_INSUFFICIENT_RESOURCES;
748 }
749
750 do_calc_job(Vcb, context->stripes[i].buf, context->stripes[i].length >> Vcb->sector_shift, context->stripes[i].bad_csums);
751 } else {
752 ULONG j;
753
754 context->stripes[i].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[i].length * Vcb->csum_size) >> Vcb->sector_shift, ALLOC_TAG);
755 if (!context->stripes[i].bad_csums) {
756 ERR("out of memory\n");
757 return STATUS_INSUFFICIENT_RESOURCES;
758 }
759
760 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
761 tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
762
763 get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[i].bad_csums + (Vcb->csum_size * j));
764 }
765 }
766 }
767 }
768
769 if (present_devices > 1) {
770 ULONG good_stripe = 0xffffffff;
771
772 for (i = 0; i < c->chunk_item->num_stripes; i++) {
773 if (c->devices[i]->devobj && !context->stripes[i].csum_error) {
774 good_stripe = i;
775 break;
776 }
777 }
778
779 if (good_stripe != 0xffffffff) {
780 // log
781
782 for (i = 0; i < c->chunk_item->num_stripes; i++) {
783 if (context->stripes[i].csum_error) {
784 ULONG j;
785
786 if (csum) {
787 for (j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
788 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
789 uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
790
791 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
792 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
793 }
794 }
795 } else {
796 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
797 tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
798 uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
799
800 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
801 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
802 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
803 }
804 }
805 }
806 }
807 }
808
809 // write good data over bad
810
811 for (i = 0; i < c->chunk_item->num_stripes; i++) {
812 if (context->stripes[i].csum_error && !c->devices[i]->readonly) {
813 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
814 context->stripes[good_stripe].buf, context->stripes[i].length);
815
816 if (!NT_SUCCESS(Status)) {
817 ERR("write_data_phys returned %08lx\n", Status);
818 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
819 return Status;
820 }
821 }
822 }
823
824 return STATUS_SUCCESS;
825 }
826
827 // if csum errors on all stripes, check sector by sector
828
829 for (i = 0; i < c->chunk_item->num_stripes; i++) {
830 if (c->devices[i]->devobj) {
831 if (csum) {
832 for (ULONG j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
833 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
834 ULONG k;
835 uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
836 bool recovered = false;
837
838 for (k = 0; k < c->chunk_item->num_stripes; k++) {
839 if (i != k && c->devices[k]->devobj &&
840 RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size),
841 (uint8_t*)csum + (j * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
842 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, true, false);
843 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
844
845 RtlCopyMemory(context->stripes[i].buf + (j << Vcb->sector_shift),
846 context->stripes[k].buf + (j << Vcb->sector_shift), Vcb->superblock.sector_size);
847
848 recovered = true;
849 break;
850 }
851 }
852
853 if (!recovered) {
854 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
855 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
856 }
857 }
858 }
859 } else {
860 for (ULONG j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
861 tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
862 uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
863
864 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
865 ULONG k;
866 bool recovered = false;
867
868 for (k = 0; k < c->chunk_item->num_stripes; k++) {
869 if (i != k && c->devices[k]->devobj) {
870 tree_header* th2 = (tree_header*)&context->stripes[k].buf[j * Vcb->superblock.node_size];
871
872 if (RtlCompareMemory((uint8_t*)context->stripes[k].bad_csums + (j * Vcb->csum_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
873 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, true, false);
874 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
875
876 RtlCopyMemory(th, th2, Vcb->superblock.node_size);
877
878 recovered = true;
879 break;
880 }
881 }
882 }
883
884 if (!recovered) {
885 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
886 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
887 }
888 }
889 }
890 }
891 }
892 }
893
894 // write good data over bad
895
896 for (i = 0; i < c->chunk_item->num_stripes; i++) {
897 if (c->devices[i]->devobj && !c->devices[i]->readonly) {
898 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + offset - c->offset,
899 context->stripes[i].buf, context->stripes[i].length);
900 if (!NT_SUCCESS(Status)) {
901 ERR("write_data_phys returned %08lx\n", Status);
902 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
903 return Status;
904 }
905 }
906 }
907
908 return STATUS_SUCCESS;
909 }
910
911 for (i = 0; i < c->chunk_item->num_stripes; i++) {
912 if (c->devices[i]->devobj) {
913 ULONG j;
914
915 if (csum) {
916 for (j = 0; j < context->stripes[i].length >> Vcb->sector_shift; j++) {
917 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), (uint8_t*)csum + (j + Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
918 uint64_t addr = offset + ((uint64_t)j << Vcb->sector_shift);
919
920 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, false, false, false);
921 }
922 }
923 } else {
924 for (j = 0; j < context->stripes[i].length / Vcb->superblock.node_size; j++) {
925 tree_header* th = (tree_header*)&context->stripes[i].buf[j * Vcb->superblock.node_size];
926 uint64_t addr = offset + UInt32x32To64(j, Vcb->superblock.node_size);
927
928 if (RtlCompareMemory((uint8_t*)context->stripes[i].bad_csums + (j * Vcb->csum_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr)
929 log_error(Vcb, addr, c->devices[i]->devitem.dev_id, true, false, false);
930 }
931 }
932 }
933 }
934
935 return STATUS_SUCCESS;
936 }
937
scrub_extent_raid0(device_extension * Vcb,chunk * c,uint64_t offset,uint32_t length,uint16_t startoffstripe,void * csum,scrub_context * context)938 static NTSTATUS scrub_extent_raid0(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
939 ULONG j;
940 uint16_t stripe;
941 uint32_t pos, *stripeoff;
942
943 pos = 0;
944 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes, ALLOC_TAG);
945 if (!stripeoff) {
946 ERR("out of memory\n");
947 return STATUS_INSUFFICIENT_RESOURCES;
948 }
949
950 RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes);
951
952 stripe = startoffstripe;
953 while (pos < length) {
954 uint32_t readlen;
955
956 if (pos == 0)
957 readlen = (uint32_t)min(context->stripes[stripe].length, c->chunk_item->stripe_length - (context->stripes[stripe].start % c->chunk_item->stripe_length));
958 else
959 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
960
961 if (csum) {
962 for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
963 if (!check_sector_csum(Vcb, context->stripes[stripe].buf + stripeoff[stripe], (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift))) {
964 uint64_t addr = offset + pos;
965
966 log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
967 log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
968 }
969
970 pos += Vcb->superblock.sector_size;
971 stripeoff[stripe] += Vcb->superblock.sector_size;
972 }
973 } else {
974 for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
975 tree_header* th = (tree_header*)(context->stripes[stripe].buf + stripeoff[stripe]);
976 uint64_t addr = offset + pos;
977
978 if (!check_tree_checksum(Vcb, th) || th->address != addr) {
979 log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
980 log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
981 }
982
983 pos += Vcb->superblock.node_size;
984 stripeoff[stripe] += Vcb->superblock.node_size;
985 }
986 }
987
988 stripe = (stripe + 1) % c->chunk_item->num_stripes;
989 }
990
991 ExFreePool(stripeoff);
992
993 return STATUS_SUCCESS;
994 }
995
scrub_extent_raid10(device_extension * Vcb,chunk * c,uint64_t offset,uint32_t length,uint16_t startoffstripe,void * csum,scrub_context * context)996 static NTSTATUS scrub_extent_raid10(device_extension* Vcb, chunk* c, uint64_t offset, uint32_t length, uint16_t startoffstripe, void* csum, scrub_context* context) {
997 ULONG j;
998 uint16_t stripe, sub_stripes = max(c->chunk_item->sub_stripes, 1);
999 uint32_t pos, *stripeoff;
1000 bool csum_error = false;
1001 NTSTATUS Status;
1002
1003 pos = 0;
1004 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes, ALLOC_TAG);
1005 if (!stripeoff) {
1006 ERR("out of memory\n");
1007 return STATUS_INSUFFICIENT_RESOURCES;
1008 }
1009
1010 RtlZeroMemory(stripeoff, sizeof(uint32_t) * c->chunk_item->num_stripes / sub_stripes);
1011
1012 stripe = startoffstripe;
1013 while (pos < length) {
1014 uint32_t readlen;
1015
1016 if (pos == 0)
1017 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1018 c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1019 else
1020 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1021
1022 if (csum) {
1023 ULONG good_stripe = 0xffffffff;
1024 uint16_t k;
1025
1026 for (k = 0; k < sub_stripes; k++) {
1027 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1028 // if first stripe is okay, we only need to check that the others are identical to it
1029 if (good_stripe != 0xffffffff) {
1030 if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1031 context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1032 readlen) != readlen) {
1033 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1034 csum_error = true;
1035 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1036 }
1037 } else {
1038 for (j = 0; j < readlen; j += Vcb->superblock.sector_size) {
1039 if (!check_sector_csum(Vcb, context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j,
1040 (uint8_t*)csum + (((pos + j) * Vcb->csum_size) >> Vcb->sector_shift))) {
1041 csum_error = true;
1042 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1043 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1044 break;
1045 }
1046 }
1047
1048 if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1049 good_stripe = k;
1050 }
1051 }
1052 }
1053
1054 pos += readlen;
1055 stripeoff[stripe] += readlen;
1056 } else {
1057 ULONG good_stripe = 0xffffffff;
1058 uint16_t k;
1059
1060 for (k = 0; k < sub_stripes; k++) {
1061 if (c->devices[(stripe * sub_stripes) + k]->devobj) {
1062 // if first stripe is okay, we only need to check that the others are identical to it
1063 if (good_stripe != 0xffffffff) {
1064 if (RtlCompareMemory(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe],
1065 context->stripes[(stripe * sub_stripes) + good_stripe].buf + stripeoff[stripe],
1066 readlen) != readlen) {
1067 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1068 csum_error = true;
1069 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1070 }
1071 } else {
1072 for (j = 0; j < readlen; j += Vcb->superblock.node_size) {
1073 tree_header* th = (tree_header*)(context->stripes[(stripe * sub_stripes) + k].buf + stripeoff[stripe] + j);
1074 uint64_t addr = offset + pos + j;
1075
1076 if (!check_tree_checksum(Vcb, th) || th->address != addr) {
1077 csum_error = true;
1078 context->stripes[(stripe * sub_stripes) + k].csum_error = true;
1079 log_device_error(Vcb, c->devices[(stripe * sub_stripes) + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1080 break;
1081 }
1082 }
1083
1084 if (!context->stripes[(stripe * sub_stripes) + k].csum_error)
1085 good_stripe = k;
1086 }
1087 }
1088 }
1089
1090 pos += readlen;
1091 stripeoff[stripe] += readlen;
1092 }
1093
1094 stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1095 }
1096
1097 if (!csum_error) {
1098 Status = STATUS_SUCCESS;
1099 goto end;
1100 }
1101
1102 for (j = 0; j < c->chunk_item->num_stripes; j += sub_stripes) {
1103 ULONG goodstripe = 0xffffffff;
1104 uint16_t k;
1105 bool hasbadstripe = false;
1106
1107 if (context->stripes[j].length == 0)
1108 continue;
1109
1110 for (k = 0; k < sub_stripes; k++) {
1111 if (c->devices[j + k]->devobj) {
1112 if (!context->stripes[j + k].csum_error)
1113 goodstripe = k;
1114 else
1115 hasbadstripe = true;
1116 }
1117 }
1118
1119 if (hasbadstripe) {
1120 if (goodstripe != 0xffffffff) {
1121 for (k = 0; k < sub_stripes; k++) {
1122 if (c->devices[j + k]->devobj && context->stripes[j + k].csum_error) {
1123 uint32_t so = 0;
1124 bool recovered = false;
1125
1126 pos = 0;
1127
1128 stripe = startoffstripe;
1129 while (pos < length) {
1130 uint32_t readlen;
1131
1132 if (pos == 0)
1133 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1134 c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1135 else
1136 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1137
1138 if (stripe == j / sub_stripes) {
1139 if (csum) {
1140 ULONG l;
1141
1142 for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1143 if (RtlCompareMemory(context->stripes[j + k].buf + so,
1144 context->stripes[j + goodstripe].buf + so,
1145 Vcb->superblock.sector_size) != Vcb->superblock.sector_size) {
1146 uint64_t addr = offset + pos;
1147
1148 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1149
1150 recovered = true;
1151 }
1152
1153 pos += Vcb->superblock.sector_size;
1154 so += Vcb->superblock.sector_size;
1155 }
1156 } else {
1157 ULONG l;
1158
1159 for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1160 if (RtlCompareMemory(context->stripes[j + k].buf + so,
1161 context->stripes[j + goodstripe].buf + so,
1162 Vcb->superblock.node_size) != Vcb->superblock.node_size) {
1163 uint64_t addr = offset + pos;
1164
1165 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1166
1167 recovered = true;
1168 }
1169
1170 pos += Vcb->superblock.node_size;
1171 so += Vcb->superblock.node_size;
1172 }
1173 }
1174 } else
1175 pos += readlen;
1176
1177 stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1178 }
1179
1180 if (recovered) {
1181 // write good data over bad
1182
1183 if (!c->devices[j + k]->readonly) {
1184 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1185
1186 Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1187 context->stripes[j + goodstripe].buf, context->stripes[j + goodstripe].length);
1188
1189 if (!NT_SUCCESS(Status)) {
1190 ERR("write_data_phys returned %08lx\n", Status);
1191 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1192 goto end;
1193 }
1194 }
1195 }
1196 }
1197 }
1198 } else {
1199 uint32_t so = 0;
1200 bool recovered = false;
1201
1202 if (csum) {
1203 for (k = 0; k < sub_stripes; k++) {
1204 if (c->devices[j + k]->devobj) {
1205 context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, (context->stripes[j + k].length * Vcb->csum_size) >> Vcb->sector_shift,
1206 ALLOC_TAG);
1207 if (!context->stripes[j + k].bad_csums) {
1208 ERR("out of memory\n");
1209 Status = STATUS_INSUFFICIENT_RESOURCES;
1210 goto end;
1211 }
1212
1213 do_calc_job(Vcb, context->stripes[j + k].buf, context->stripes[j + k].length >> Vcb->sector_shift, context->stripes[j + k].bad_csums);
1214 }
1215 }
1216 } else {
1217 for (k = 0; k < sub_stripes; k++) {
1218 if (c->devices[j + k]->devobj) {
1219 ULONG l;
1220
1221 context->stripes[j + k].bad_csums = ExAllocatePoolWithTag(PagedPool, context->stripes[j + k].length * Vcb->csum_size / Vcb->superblock.node_size,
1222 ALLOC_TAG);
1223 if (!context->stripes[j + k].bad_csums) {
1224 ERR("out of memory\n");
1225 Status = STATUS_INSUFFICIENT_RESOURCES;
1226 goto end;
1227 }
1228
1229 for (l = 0; l < context->stripes[j + k].length / Vcb->superblock.node_size; l++) {
1230 tree_header* th = (tree_header*)&context->stripes[j + k].buf[l * Vcb->superblock.node_size];
1231
1232 get_tree_checksum(Vcb, th, (uint8_t*)context->stripes[j + k].bad_csums + (Vcb->csum_size * l));
1233 }
1234 }
1235 }
1236 }
1237
1238 pos = 0;
1239
1240 stripe = startoffstripe;
1241 while (pos < length) {
1242 uint32_t readlen;
1243
1244 if (pos == 0)
1245 readlen = (uint32_t)min(context->stripes[stripe * sub_stripes].length,
1246 c->chunk_item->stripe_length - (context->stripes[stripe * sub_stripes].start % c->chunk_item->stripe_length));
1247 else
1248 readlen = min(length - pos, (uint32_t)c->chunk_item->stripe_length);
1249
1250 if (stripe == j / sub_stripes) {
1251 ULONG l;
1252
1253 if (csum) {
1254 for (l = 0; l < readlen; l += Vcb->superblock.sector_size) {
1255 bool has_error = false;
1256
1257 goodstripe = 0xffffffff;
1258 for (k = 0; k < sub_stripes; k++) {
1259 if (c->devices[j + k]->devobj) {
1260 if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + ((so * Vcb->csum_size) >> Vcb->sector_shift),
1261 (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift),
1262 Vcb->csum_size) != Vcb->csum_size) {
1263 has_error = true;
1264 } else
1265 goodstripe = k;
1266 }
1267 }
1268
1269 if (has_error) {
1270 if (goodstripe != 0xffffffff) {
1271 for (k = 0; k < sub_stripes; k++) {
1272 if (c->devices[j + k]->devobj &&
1273 RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + ((so * Vcb->csum_size) >> Vcb->sector_shift),
1274 (uint8_t*)csum + ((pos * Vcb->csum_size) >> Vcb->sector_shift),
1275 Vcb->csum_size) != Vcb->csum_size) {
1276 uint64_t addr = offset + pos;
1277
1278 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, true, false);
1279
1280 recovered = true;
1281
1282 RtlCopyMemory(context->stripes[j + k].buf + so, context->stripes[j + goodstripe].buf + so,
1283 Vcb->superblock.sector_size);
1284 }
1285 }
1286 } else {
1287 uint64_t addr = offset + pos;
1288
1289 for (k = 0; k < sub_stripes; k++) {
1290 if (c->devices[j + j]->devobj) {
1291 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, false, false, false);
1292 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1293 }
1294 }
1295 }
1296 }
1297
1298 pos += Vcb->superblock.sector_size;
1299 so += Vcb->superblock.sector_size;
1300 }
1301 } else {
1302 for (l = 0; l < readlen; l += Vcb->superblock.node_size) {
1303 for (k = 0; k < sub_stripes; k++) {
1304 if (c->devices[j + k]->devobj) {
1305 tree_header* th = (tree_header*)&context->stripes[j + k].buf[so];
1306 uint64_t addr = offset + pos;
1307
1308 if (RtlCompareMemory((uint8_t*)context->stripes[j + k].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th, Vcb->csum_size) != Vcb->csum_size || th->address != addr) {
1309 ULONG m;
1310
1311 recovered = false;
1312
1313 for (m = 0; m < sub_stripes; m++) {
1314 if (m != k) {
1315 tree_header* th2 = (tree_header*)&context->stripes[j + m].buf[so];
1316
1317 if (RtlCompareMemory((uint8_t*)context->stripes[j + m].bad_csums + (so * Vcb->csum_size / Vcb->superblock.node_size), th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr) {
1318 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, true, false);
1319
1320 RtlCopyMemory(th, th2, Vcb->superblock.node_size);
1321
1322 recovered = true;
1323 break;
1324 } else
1325 log_device_error(Vcb, c->devices[j + m], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1326 }
1327 }
1328
1329 if (!recovered)
1330 log_error(Vcb, addr, c->devices[j + k]->devitem.dev_id, true, false, false);
1331 }
1332 }
1333 }
1334
1335 pos += Vcb->superblock.node_size;
1336 so += Vcb->superblock.node_size;
1337 }
1338 }
1339 } else
1340 pos += readlen;
1341
1342 stripe = (stripe + 1) % (c->chunk_item->num_stripes / sub_stripes);
1343 }
1344
1345 if (recovered) {
1346 // write good data over bad
1347
1348 for (k = 0; k < sub_stripes; k++) {
1349 if (c->devices[j + k]->devobj && !c->devices[j + k]->readonly) {
1350 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
1351
1352 Status = write_data_phys(c->devices[j + k]->devobj, c->devices[j + k]->fileobj, cis[j + k].offset + offset - c->offset,
1353 context->stripes[j + k].buf, context->stripes[j + k].length);
1354
1355 if (!NT_SUCCESS(Status)) {
1356 ERR("write_data_phys returned %08lx\n", Status);
1357 log_device_error(Vcb, c->devices[j + k], BTRFS_DEV_STAT_WRITE_ERRORS);
1358 goto end;
1359 }
1360 }
1361 }
1362 }
1363 }
1364 }
1365 }
1366
1367 Status = STATUS_SUCCESS;
1368
1369 end:
1370 ExFreePool(stripeoff);
1371
1372 return Status;
1373 }
1374
// Reads one extent of chunk c from every device that holds a copy or stripe
// of it, waits for all reads to finish, then hands the buffers to the checker
// for the chunk's profile (scrub_extent_dup / _raid0 / _raid10).
//
//   type   - BLOCK_FLAG_DUPLICATE, BLOCK_FLAG_RAID0 or BLOCK_FLAG_RAID10; any
//            other value fails with STATUS_INTERNAL_ERROR
//   offset - logical address of the extent; size - its length in bytes
//   csum   - passed through to the checker unchanged
//
// Returns the checker's status, the first failing read status, or an
// allocation / internal error. Everything allocated here is released at 'end'.
static NTSTATUS scrub_extent(device_extension* Vcb, chunk* c, ULONG type, uint64_t offset, uint32_t size, void* csum) {
    ULONG i;
    scrub_context context;
    CHUNK_ITEM_STRIPE* cis;
    NTSTATUS Status;
    uint16_t startoffstripe = 0, num_missing, allowed_missing;

    TRACE("(%p, %p, %lx, %I64x, %x, %p)\n", Vcb, c, type, offset, size, csum);

    context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
    if (!context.stripes) {
        ERR("out of memory\n");
        Status = STATUS_INSUFFICIENT_RESOURCES;
        goto end;
    }

    // zeroing leaves Irp / buf / bad_csums NULL so the cleanup at 'end' can test them
    RtlZeroMemory(context.stripes, sizeof(scrub_context_stripe) * c->chunk_item->num_stripes);

    context.stripes_left = 0;

    // the per-stripe entries follow the fixed part of the chunk item
    cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];

    if (type == BLOCK_FLAG_RAID0) {
        uint64_t startoff, endoff;
        uint16_t endoffstripe;

        // locate the first and the last byte of the extent
        get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &startoff, &startoffstripe);
        get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes, &endoff, &endoffstripe);

        // work out which part of each stripe the extent touches
        for (i = 0; i < c->chunk_item->num_stripes; i++) {
            if (startoffstripe > i)
                context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
            else if (startoffstripe == i)
                context.stripes[i].start = startoff;
            else
                context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);

            if (endoffstripe > i)
                context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
            else if (endoffstripe == i)
                context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
            else
                context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);
        }

        allowed_missing = 0;
    } else if (type == BLOCK_FLAG_RAID10) {
        uint64_t startoff, endoff;
        uint16_t endoffstripe, j, sub_stripes = max(c->chunk_item->sub_stripes, 1);

        // same as RAID0, but striping is across num_stripes / sub_stripes mirror sets
        get_raid0_offset(offset - c->offset, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &startoff, &startoffstripe);
        get_raid0_offset(offset + size - c->offset - 1, c->chunk_item->stripe_length, c->chunk_item->num_stripes / sub_stripes, &endoff, &endoffstripe);

        if ((c->chunk_item->num_stripes % sub_stripes) != 0) {
            ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", c->offset, c->chunk_item->num_stripes, sub_stripes);
            Status = STATUS_INTERNAL_ERROR;
            goto end;
        }

        // convert set numbers to device indices for the comparisons against i below
        startoffstripe *= sub_stripes;
        endoffstripe *= sub_stripes;

        for (i = 0; i < c->chunk_item->num_stripes; i += sub_stripes) {
            if (startoffstripe > i)
                context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length;
            else if (startoffstripe == i)
                context.stripes[i].start = startoff;
            else
                context.stripes[i].start = startoff - (startoff % c->chunk_item->stripe_length);

            if (endoffstripe > i)
                context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) + c->chunk_item->stripe_length - context.stripes[i].start);
            else if (endoffstripe == i)
                context.stripes[i].length = (uint32_t)(endoff + 1 - context.stripes[i].start);
            else
                context.stripes[i].length = (uint32_t)(endoff - (endoff % c->chunk_item->stripe_length) - context.stripes[i].start);

            // every mirror in the set covers the same range
            for (j = 1; j < sub_stripes; j++) {
                context.stripes[i+j].start = context.stripes[i].start;
                context.stripes[i+j].length = context.stripes[i].length;
            }
        }

        // back to a set number, which is what scrub_extent_raid10 expects
        startoffstripe /= sub_stripes;
        allowed_missing = 1;
    } else
        allowed_missing = c->chunk_item->num_stripes - 1;

    num_missing = 0;

    // build one read IRP per present device that has something to read
    for (i = 0; i < c->chunk_item->num_stripes; i++) {
        PIO_STACK_LOCATION IrpSp;

        context.stripes[i].context = (struct _scrub_context*)&context;

        if (type == BLOCK_FLAG_DUPLICATE) {
            // every copy holds the whole extent at the same chunk-relative offset
            context.stripes[i].start = offset - c->offset;
            context.stripes[i].length = size;
        } else if (type != BLOCK_FLAG_RAID0 && type != BLOCK_FLAG_RAID10) {
            ERR("unexpected chunk type %lx\n", type);
            Status = STATUS_INTERNAL_ERROR;
            goto end;
        }

        if (!c->devices[i]->devobj) {
            num_missing++;

            if (num_missing > allowed_missing) {
                ERR("too many missing devices (at least %u, maximum allowed %u)\n", num_missing, allowed_missing);
                Status = STATUS_INTERNAL_ERROR;
                goto end;
            }
        } else if (context.stripes[i].length > 0) {
            context.stripes[i].buf = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);

            if (!context.stripes[i].buf) {
                ERR("out of memory\n");
                Status = STATUS_INSUFFICIENT_RESOURCES;
                goto end;
            }

            context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);

            if (!context.stripes[i].Irp) {
                ERR("IoAllocateIrp failed\n");
                Status = STATUS_INSUFFICIENT_RESOURCES;
                goto end;
            }

            IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
            IrpSp->MajorFunction = IRP_MJ_READ;
            IrpSp->FileObject = c->devices[i]->fileobj;

            // describe the destination buffer in whichever way the target device expects
            if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
                context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, context.stripes[i].length, ALLOC_TAG);
                if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
                    ERR("out of memory\n");
                    Status = STATUS_INSUFFICIENT_RESOURCES;
                    goto end;
                }

                context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;

                context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
            } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
                context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, context.stripes[i].length, false, false, NULL);
                if (!context.stripes[i].Irp->MdlAddress) {
                    ERR("IoAllocateMdl failed\n");
                    Status = STATUS_INSUFFICIENT_RESOURCES;
                    goto end;
                }

                Status = STATUS_SUCCESS;

                _SEH2_TRY {
                    MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
                } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
                    Status = _SEH2_GetExceptionCode();
                } _SEH2_END;

                if (!NT_SUCCESS(Status)) {
                    ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
                    // free and clear the MDL here so the cleanup at 'end' does not unlock pages that were never locked
                    IoFreeMdl(context.stripes[i].Irp->MdlAddress);
                    context.stripes[i].Irp->MdlAddress = NULL;
                    goto end;
                }
            } else
                context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;

            IrpSp->Parameters.Read.Length = context.stripes[i].length;
            // device offset = start within this device's part of the chunk + where that part begins on the device
            IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].start + cis[i].offset;

            context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;

            IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion, &context.stripes[i], true, true, true);

            context.stripes_left++;

            Vcb->scrub.data_scrubbed += context.stripes[i].length;
        }
    }

    if (context.stripes_left == 0) {
        ERR("error - not reading any stripes\n");
        Status = STATUS_INTERNAL_ERROR;
        goto end;
    }

    KeInitializeEvent(&context.Event, NotificationEvent, false);

    // send every prepared IRP; the condition matches the one under which an IRP was built above
    for (i = 0; i < c->chunk_item->num_stripes; i++) {
        if (c->devices[i]->devobj && context.stripes[i].length > 0)
            IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
    }

    // NOTE(review): scrub_read_completion is defined outside this view; presumably it
    // signals context.Event once stripes_left reaches zero - confirm
    KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);

    // return an error if any of the stripes returned an error
    for (i = 0; i < c->chunk_item->num_stripes; i++) {
        if (!NT_SUCCESS(context.stripes[i].iosb.Status)) {
            Status = context.stripes[i].iosb.Status;
            log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
            goto end;
        }
    }

    if (type == BLOCK_FLAG_DUPLICATE) {
        Status = scrub_extent_dup(Vcb, c, offset, csum, &context);
        if (!NT_SUCCESS(Status)) {
            ERR("scrub_extent_dup returned %08lx\n", Status);
            goto end;
        }
    } else if (type == BLOCK_FLAG_RAID0) {
        Status = scrub_extent_raid0(Vcb, c, offset, size, startoffstripe, csum, &context);
        if (!NT_SUCCESS(Status)) {
            ERR("scrub_extent_raid0 returned %08lx\n", Status);
            goto end;
        }
    } else if (type == BLOCK_FLAG_RAID10) {
        Status = scrub_extent_raid10(Vcb, c, offset, size, startoffstripe, csum, &context);
        if (!NT_SUCCESS(Status)) {
            ERR("scrub_extent_raid10 returned %08lx\n", Status);
            goto end;
        }
    }

end:
    // release per-stripe resources; entries that were never set up are still zeroed
    if (context.stripes) {
        for (i = 0; i < c->chunk_item->num_stripes; i++) {
            if (context.stripes[i].Irp) {
                if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
                    MmUnlockPages(context.stripes[i].Irp->MdlAddress);
                    IoFreeMdl(context.stripes[i].Irp->MdlAddress);
                }
                IoFreeIrp(context.stripes[i].Irp);
            }

            if (context.stripes[i].buf)
                ExFreePool(context.stripes[i].buf);

            if (context.stripes[i].bad_csums)
                ExFreePool(context.stripes[i].bad_csums);
        }

        ExFreePool(context.stripes);
    }

    return Status;
}
1624
scrub_data_extent(device_extension * Vcb,chunk * c,uint64_t offset,ULONG type,void * csum,RTL_BITMAP * bmp,ULONG bmplen)1625 static NTSTATUS scrub_data_extent(device_extension* Vcb, chunk* c, uint64_t offset, ULONG type, void* csum, RTL_BITMAP* bmp, ULONG bmplen) {
1626 NTSTATUS Status;
1627 ULONG runlength, index;
1628
1629 runlength = RtlFindFirstRunClear(bmp, &index);
1630
1631 while (runlength != 0) {
1632 if (index >= bmplen)
1633 break;
1634
1635 if (index + runlength >= bmplen) {
1636 runlength = bmplen - index;
1637
1638 if (runlength == 0)
1639 break;
1640 }
1641
1642 do {
1643 ULONG rl;
1644
1645 if (runlength << Vcb->sector_shift > SCRUB_UNIT)
1646 rl = SCRUB_UNIT >> Vcb->sector_shift;
1647 else
1648 rl = runlength;
1649
1650 Status = scrub_extent(Vcb, c, type, offset + ((uint64_t)index << Vcb->sector_shift),
1651 rl << Vcb->sector_shift, (uint8_t*)csum + (index * Vcb->csum_size));
1652 if (!NT_SUCCESS(Status)) {
1653 ERR("scrub_data_extent_dup returned %08lx\n", Status);
1654 return Status;
1655 }
1656
1657 runlength -= rl;
1658 index += rl;
1659 } while (runlength > 0);
1660
1661 runlength = RtlFindNextForwardRunClear(bmp, index, &index);
1662 }
1663
1664 return STATUS_SUCCESS;
1665 }
1666
// Per-device state used while scrubbing a RAID5/6 chunk.
typedef struct {
    uint8_t* buf; // data read from this device
    PIRP Irp;
    void* context; // owning scrub_context_raid56; cast back in scrub_read_completion_raid56
    IO_STATUS_BLOCK iosb; // copy of Irp->IoStatus, stored by the completion routine
    uint64_t offset;
    bool rewrite, missing; // rewrite: set once buf has been corrected in memory; missing: presumably device absent - TODO confirm
    RTL_BITMAP error; // one bit per sector of the stripe unit being checked; set = failed verification
    ULONG* errorarr; // presumably the backing storage for 'error' - TODO confirm against the allocation site
} scrub_context_raid56_stripe;
1677
// State shared by all devices of one RAID5/6 scrub read.
typedef struct {
    scrub_context_raid56_stripe* stripes; // indexed by stripe (device) number
    LONG stripes_left; // decremented by each read completion
    KEVENT Event; // set by the completion that brings stripes_left to zero
    RTL_BITMAP alloc; // per sector: set if the sector is to be checked
    RTL_BITMAP has_csum; // per sector: set if an entry in csum applies to it
    RTL_BITMAP is_tree; // per sector: set if a tree node starts here
    void* csum; // csum_size bytes per sector, indexed like the bitmaps above
    uint8_t* parity_scratch; // XOR accumulator, one stripe_length long as used by scrub_raid5_stripe
    uint8_t* parity_scratch2; // presumably the second accumulator for RAID6 - TODO confirm
} scrub_context_raid56;
1689
_Function_class_(IO_COMPLETION_ROUTINE)1690 _Function_class_(IO_COMPLETION_ROUTINE)
1691 static NTSTATUS __stdcall scrub_read_completion_raid56(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
1692 scrub_context_raid56_stripe* stripe = conptr;
1693 scrub_context_raid56* context = (scrub_context_raid56*)stripe->context;
1694 LONG left = InterlockedDecrement(&context->stripes_left);
1695
1696 UNUSED(DeviceObject);
1697
1698 stripe->iosb = Irp->IoStatus;
1699
1700 if (left == 0)
1701 KeSetEvent(&context->Event, 0, false);
1702
1703 return STATUS_MORE_PROCESSING_REQUIRED;
1704 }
1705
// Checks, and where possible repairs in memory, one full RAID5 stripe (one
// stripe_length unit on every device) that has already been read into
// context->stripes[].buf.
//
//   stripe_start, bit_start, num - locate the stripe: num selects the unit
//       within each device buffer, and (bit_start + num - stripe_start) selects
//       its position in the alloc / has_csum / is_tree bitmaps and csum array
//   missing_devices - if non-zero, data errors are only logged; the parity
//       check and all repair are skipped
//
// Repairs are made to the in-memory buffers only and flagged through
// stripes[].rewrite; nothing is written to disk here.
static void scrub_raid5_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
                               uint64_t num, uint16_t missing_devices) {
    ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift), off;
    // the device holding parity rotates from one stripe to the next
    uint16_t stripe, parity = (bit_start + num + c->chunk_item->num_stripes - 1) % c->chunk_item->num_stripes;
    uint64_t stripeoff;

    // data stripes follow the parity device, wrapping round
    stripe = (parity + 1) % c->chunk_item->num_stripes;
    // off: sector index into the bitmaps / csum array; stripeoff: sector index into a device buffer
    off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1);
    stripeoff = num * sectors_per_stripe;

    // seed the accumulator with the parity; XORing in every data stripe should then leave all zeroes
    if (missing_devices == 0)
        RtlCopyMemory(context->parity_scratch, &context->stripes[parity].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);

    while (stripe != parity) {
        RtlClearAllBits(&context->stripes[stripe].error);

        for (ULONG i = 0; i < sectors_per_stripe; i++) {
            if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
                if (RtlCheckBit(&context->is_tree, off)) {
                    tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff << Vcb->sector_shift];
                    uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                    if (!check_tree_checksum(Vcb, th) || th->address != addr) {
                        // mark every sector of the node as bad
                        RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size >> Vcb->sector_shift);
                        log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);

                        // with a device missing there is no repair pass, so report now
                        if (missing_devices > 0)
                            log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
                    }

                    // step over the whole node; the loop's own i++ supplies the final increment
                    off += Vcb->superblock.node_size >> Vcb->sector_shift;
                    stripeoff += Vcb->superblock.node_size >> Vcb->sector_shift;
                    i += (Vcb->superblock.node_size >> Vcb->sector_shift) - 1;

                    continue;
                } else if (RtlCheckBit(&context->has_csum, off)) {
                    if (!check_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff << Vcb->sector_shift), (uint8_t*)context->csum + (Vcb->csum_size * off))) {
                        RtlSetBit(&context->stripes[stripe].error, i);
                        log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);

                        if (missing_devices > 0) {
                            uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                            log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
                        }
                    }
                }
            }

            off++;
            stripeoff++;
        }

        // fold this data stripe into the accumulator (first argument is the one later tested for zero)
        if (missing_devices == 0)
            do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);

        stripe = (stripe + 1) % c->chunk_item->num_stripes;
        stripeoff = num * sectors_per_stripe;
    }

    // check parity

    if (missing_devices == 0) {
        RtlClearAllBits(&context->stripes[parity].error);

        // any non-zero byte left in a sector of the accumulator means parity and data disagree there
        for (ULONG i = 0; i < sectors_per_stripe; i++) {
            ULONG o, j;

            o = i << Vcb->sector_shift;
            for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
                if (context->parity_scratch[o] != 0) {
                    RtlSetBit(&context->stripes[parity].error, i);
                    break;
                }
                o++;
            }
        }
    }

    // log and fix errors

    if (missing_devices > 0)
        return;

    // handle each sector position (row) across the stripe independently
    for (ULONG i = 0; i < sectors_per_stripe; i++) {
        ULONG num_errors = 0, bad_off = 0;
        uint64_t bad_stripe = 0;
        bool alloc = false;

        stripe = (parity + 1) % c->chunk_item->num_stripes;
        off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;

        // count the data stripes that failed at this row, remembering the last one
        while (stripe != parity) {
            if (RtlCheckBit(&context->alloc, off)) {
                alloc = true;

                if (RtlCheckBit(&context->stripes[stripe].error, i)) {
                    bad_stripe = stripe;
                    bad_off = off;
                    num_errors++;
                }
            }

            off += sectors_per_stripe;
            stripe = (stripe + 1) % c->chunk_item->num_stripes;
        }

        // nothing allocated in this row on any data stripe
        if (!alloc)
            continue;

        if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity].error, i)) // everything fine
            continue;

        if (num_errors == 0 && RtlCheckBit(&context->stripes[parity].error, i)) { // parity error
            uint64_t addr;

            // the accumulator holds old parity XOR data, so XORing it into the parity buffer leaves XOR of the data
            do_xor(&context->stripes[parity].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                   &context->parity_scratch[i << Vcb->sector_shift],
                   Vcb->superblock.sector_size);

            bad_off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;
            addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off << Vcb->sector_shift);

            context->stripes[parity].rewrite = true;

            log_error(Vcb, addr, c->devices[parity]->devitem.dev_id, false, true, true);
            log_device_error(Vcb, c->devices[parity], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
        } else if (num_errors == 1) {
            uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (bad_off << Vcb->sector_shift);

            if (RtlCheckBit(&context->is_tree, bad_off)) {
                tree_header* th;

                // XORing the bad data back out of the accumulator leaves parity XOR the other data stripes,
                // i.e. what the bad block should contain
                do_xor(&context->parity_scratch[i << Vcb->sector_shift],
                       &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                       Vcb->superblock.node_size);

                th = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];

                // only accept the reconstruction if it verifies
                if (check_tree_checksum(Vcb, th) && th->address == addr) {
                    RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);

                    context->stripes[bad_stripe].rewrite = true;

                    // the remaining sectors of this node are now good; stop later rows treating them as errors
                    RtlClearBits(&context->stripes[bad_stripe].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);

                    log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, true, false, false);
            } else {
                uint8_t hash[MAX_HASH_SIZE];

                // same reconstruction as above, for a single sector
                do_xor(&context->parity_scratch[i << Vcb->sector_shift],
                       &context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                       Vcb->superblock.sector_size);

                get_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], hash);

                if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * bad_off), Vcb->csum_size) == Vcb->csum_size) {
                    RtlCopyMemory(&context->stripes[bad_stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);

                    context->stripes[bad_stripe].rewrite = true;

                    log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe]->devitem.dev_id, false, false, false);
            }
        } else {
            // more than one data stripe is bad in this row: log each one as unrecoverable
            stripe = (parity + 1) % c->chunk_item->num_stripes;
            off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 1)) + i;

            while (stripe != parity) {
                if (RtlCheckBit(&context->alloc, off)) {
                    if (RtlCheckBit(&context->stripes[stripe].error, i)) {
                        uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 1) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                        log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
                    }
                }

                off += sectors_per_stripe;
                stripe = (stripe + 1) % c->chunk_item->num_stripes;
            }
        }
    }
}
1894
/* Scrubs one RAID6 stripe row that has already been read into context->stripes[].buf.
 *
 * The work is done in three phases:
 *   1. Walk every data stripe, verify tree-block and data-sector checksums, and
 *      record failures in that stripe's error bitmap.
 *   2. Verify the two parity stripes (parity1 by XOR, parity2 via galois_double)
 *      and record mismatches in their error bitmaps.
 *   3. For each sector position, count the bad data stripes and try to repair
 *      using parity1 and/or parity2, marking repaired stripes with rewrite = true
 *      and reporting the outcome through log_error / log_device_error.
 *
 * Vcb              - volume; supplies sector_shift, sector/node sizes and csum_size.
 * c                - chunk being scrubbed; supplies stripe geometry and devices.
 * context          - scrub state: alloc / is_tree / has_csum bitmaps, csum array,
 *                    the per-stripe buffers and the two parity scratch buffers.
 * stripe_start     - first stripe row covered by the context bitmaps; the bitmaps
 *                    and context->csum are indexed relative to it.
 * bit_start        - (bit_start + num) is the stripe row being processed; it selects
 *                    the parity rotation and the bitmap offset.
 * num              - index of this row within the stripe buffers.
 * missing_devices  - number of devices of this chunk with no devobj. With 2 missing
 *                    nothing can be repaired, so checksum failures are logged
 *                    immediately and the function returns after phase 2.
 *
 * Returns nothing. If the scratch allocation in the single-error path fails, the
 * function logs "out of memory" and returns, leaving later sectors unprocessed. */
static void scrub_raid6_stripe(device_extension* Vcb, chunk* c, scrub_context_raid56* context, uint64_t stripe_start, uint64_t bit_start,
                               uint64_t num, uint16_t missing_devices) {
    ULONG sectors_per_stripe = (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift), off;
    // the parity position rotates with the row number; parity2 immediately follows parity1
    uint16_t stripe, parity1 = (bit_start + num + c->chunk_item->num_stripes - 2) % c->chunk_item->num_stripes;
    uint16_t parity2 = (parity1 + 1) % c->chunk_item->num_stripes;
    uint64_t stripeoff;

    // first data stripe is the one after parity2; off indexes the bitmaps / csum array,
    // stripeoff is the sector offset within a stripe buffer
    stripe = (parity1 + 2) % c->chunk_item->num_stripes;
    off = (ULONG)(bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2);
    stripeoff = num * sectors_per_stripe;

    // parity_scratch starts as a copy of parity1; every data stripe is XORed into it below,
    // so it should end up all zeroes if parity1 is consistent
    if (c->devices[parity1]->devobj)
        RtlCopyMemory(context->parity_scratch, &context->stripes[parity1].buf[num * c->chunk_item->stripe_length], (ULONG)c->chunk_item->stripe_length);

    // parity_scratch2 is the accumulator for the recomputed parity2
    if (c->devices[parity2]->devobj)
        RtlZeroMemory(context->parity_scratch2, (ULONG)c->chunk_item->stripe_length);

    // phase 1: checksum every allocated sector of every data stripe
    while (stripe != parity1) {
        RtlClearAllBits(&context->stripes[stripe].error);

        for (ULONG i = 0; i < sectors_per_stripe; i++) {
            if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
                if (RtlCheckBit(&context->is_tree, off)) {
                    tree_header* th = (tree_header*)&context->stripes[stripe].buf[stripeoff << Vcb->sector_shift];
                    uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                    // a tree block is bad if its checksum fails or its header claims a different address;
                    // all sectors of the node are flagged
                    if (!check_tree_checksum(Vcb, th) || th->address != addr) {
                        RtlSetBits(&context->stripes[stripe].error, i, Vcb->superblock.node_size >> Vcb->sector_shift);
                        log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);

                        // with two devices missing there is no repair phase, so report now
                        if (missing_devices == 2)
                            log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, true, false, false);
                    }

                    // skip the rest of this node; the for loop's i++ supplies the final increment
                    off += Vcb->superblock.node_size >> Vcb->sector_shift;
                    stripeoff += Vcb->superblock.node_size >> Vcb->sector_shift;
                    i += (Vcb->superblock.node_size >> Vcb->sector_shift) - 1;

                    continue;
                } else if (RtlCheckBit(&context->has_csum, off)) {
                    uint8_t hash[MAX_HASH_SIZE];

                    get_sector_csum(Vcb, context->stripes[stripe].buf + (stripeoff << Vcb->sector_shift), hash);

                    // compare against the stored checksum for this sector
                    if (RtlCompareMemory(hash, (uint8_t*)context->csum + (Vcb->csum_size * off), Vcb->csum_size) != Vcb->csum_size) {
                        uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                        RtlSetBit(&context->stripes[stripe].error, i);
                        log_device_error(Vcb, c->devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);

                        if (missing_devices == 2)
                            log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, false, false, false);
                    }
                }
            }

            off++;
            stripeoff++;
        }

        // fold this data stripe into the parity1 check; note this is done whether or not
        // the data stripe's own device is present
        if (c->devices[parity1]->devobj)
            do_xor(context->parity_scratch, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);

        stripe = (stripe + 1) % c->chunk_item->num_stripes;
        stripeoff = num * sectors_per_stripe;
    }

    // phase 2: verify the parity stripes

    RtlClearAllBits(&context->stripes[parity1].error);

    // parity1 can only be judged when every data stripe was readable
    if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity2]->devobj)) {
        // check parity 1

        for (ULONG i = 0; i < sectors_per_stripe; i++) {
            ULONG o, j;

            // any non-zero byte left in parity_scratch means parity1 disagrees with the data in this sector
            o = i << Vcb->sector_shift;
            for (j = 0; j < Vcb->superblock.sector_size; j++) { // FIXME - use SSE
                if (context->parity_scratch[o] != 0) {
                    RtlSetBit(&context->stripes[parity1].error, i);
                    break;
                }
                o++;
            }
        }
    }

    RtlClearAllBits(&context->stripes[parity2].error);

    if (missing_devices == 0 || (missing_devices == 1 && !c->devices[parity1]->devobj)) {
        // check parity 2

        // walk the data stripes backwards from the one before parity1, doubling the accumulator
        // and XORing in each stripe, so parity_scratch2 ends up holding the recomputed parity2
        stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);

        while (stripe != parity2) {
            galois_double(context->parity_scratch2, (uint32_t)c->chunk_item->stripe_length);
            do_xor(context->parity_scratch2, &context->stripes[stripe].buf[num * c->chunk_item->stripe_length], (uint32_t)c->chunk_item->stripe_length);

            stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
        }

        // flag each sector where the stored parity2 differs from the recomputed one
        for (ULONG i = 0; i < sectors_per_stripe; i++) {
            if (RtlCompareMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                 &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.sector_size) != Vcb->superblock.sector_size)
                RtlSetBit(&context->stripes[parity2].error, i);
        }
    }

    // nothing can be reconstructed with two devices gone; errors were already logged in phase 1
    if (missing_devices == 2)
        return;

    // log and fix errors

    for (ULONG i = 0; i < sectors_per_stripe; i++) {
        ULONG num_errors = 0;
        uint64_t bad_stripe1 = 0, bad_stripe2 = 0;
        ULONG bad_off1 = 0, bad_off2 = 0;
        bool alloc = false;

        stripe = (parity1 + 2) % c->chunk_item->num_stripes;
        off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;

        // count the data stripes that are bad at sector i, remembering the first two
        while (stripe != parity1) {
            if (RtlCheckBit(&context->alloc, off)) {
                alloc = true;

                // NOTE(review): a missing device counts as an error for every allocated sector,
                // independent of the error bitmap, so the RtlClearBits calls further down do not
                // prevent later sectors of a repaired tree block on a missing device from being
                // re-examined as if they were node headers - confirm this is intended
                if (!c->devices[stripe]->devobj || RtlCheckBit(&context->stripes[stripe].error, i)) {
                    if (num_errors == 0) {
                        bad_stripe1 = stripe;
                        bad_off1 = off;
                    } else if (num_errors == 1) {
                        bad_stripe2 = stripe;
                        bad_off2 = off;
                    }
                    num_errors++;
                }
            }

            off += sectors_per_stripe;
            stripe = (stripe + 1) % c->chunk_item->num_stripes;
        }

        // no data stripe has anything allocated at this sector position, so skip it
        if (!alloc)
            continue;

        if (num_errors == 0 && !RtlCheckBit(&context->stripes[parity1].error, i) && !RtlCheckBit(&context->stripes[parity2].error, i)) // everything fine
            continue;

        if (num_errors == 0) { // parity error
            uint64_t addr;

            if (RtlCheckBit(&context->stripes[parity1].error, i)) {
                // parity_scratch holds (stored parity1 XOR all data), so XORing it back into the
                // stored parity1 leaves the XOR of the data, i.e. the correct parity1
                do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                       &context->parity_scratch[i << Vcb->sector_shift],
                       Vcb->superblock.sector_size);

                bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
                addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);

                context->stripes[parity1].rewrite = true;

                log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
                log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
            }

            if (RtlCheckBit(&context->stripes[parity2].error, i)) {
                // parity_scratch2 already holds the recomputed parity2 for this sector
                RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                              &context->parity_scratch2[i << Vcb->sector_shift],
                              Vcb->superblock.sector_size);

                bad_off1 = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;
                addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);

                context->stripes[parity2].rewrite = true;

                log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
                log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
            }
        } else if (num_errors == 1) {
            // one bad data stripe: build two candidate reconstructions, one from parity1
            // (in parity_scratch) and one from parity2 (in scratch), and accept whichever
            // passes the checksum
            uint32_t len;
            uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);
            uint8_t* scratch;

            // tree blocks are reconstructed a whole node at a time, data one sector at a time
            len = RtlCheckBit(&context->is_tree, bad_off1) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;

            scratch = ExAllocatePoolWithTag(PagedPool, len, ALLOC_TAG);
            if (!scratch) {
                ERR("out of memory\n");
                return;
            }

            RtlZeroMemory(scratch, len);

            // parity_scratch had the bad stripe XORed in during phase 1; XORing it in again cancels
            // that contribution, leaving the parity1-based candidate for the bad stripe
            do_xor(&context->parity_scratch[i << Vcb->sector_shift],
                   &context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);

            stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);

            if (c->devices[parity2]->devobj) {
                uint16_t stripe_num, bad_stripe_num = 0;

                // recompute parity2 over the good stripes only, noting the bad stripe's index
                // (counted down from num_stripes - 3 for the stripe just before parity1)
                stripe_num = c->chunk_item->num_stripes - 3;
                while (stripe != parity2) {
                    galois_double(scratch, len);

                    if (stripe != bad_stripe1)
                        do_xor(scratch, &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
                    else
                        bad_stripe_num = stripe_num;

                    stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
                    stripe_num--;
                }

                // XOR with the stored parity2, then divide out the bad stripe's coefficient
                // (no division is needed when its index is 0)
                do_xor(scratch, &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);

                if (bad_stripe_num != 0)
                    galois_divpower(scratch, (uint8_t)bad_stripe_num, len);
            }

            if (RtlCheckBit(&context->is_tree, bad_off1)) {
                uint8_t hash1[MAX_HASH_SIZE];
                uint8_t hash2[MAX_HASH_SIZE];
                tree_header *th1 = NULL, *th2 = NULL;

                // hash1 / th1 are only set when parity1 is present, hash2 / th2 only when parity2 is;
                // every use below is guarded by the matching devobj test
                if (c->devices[parity1]->devobj) {
                    th1 = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];
                    get_tree_checksum(Vcb, th1, hash1);
                }

                if (c->devices[parity2]->devobj) {
                    th2 = (tree_header*)scratch;
                    get_tree_checksum(Vcb, th2, hash2);
                }

                // recoverable if either candidate has a valid checksum and the expected address
                if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, th1, Vcb->csum_size) == Vcb->csum_size && th1->address == addr) ||
                    (c->devices[parity2]->devobj && RtlCompareMemory(hash2, th2, Vcb->csum_size) == Vcb->csum_size && th2->address == addr)) {
                    // the parity1 candidate is unusable, so the parity2 candidate must be the good one
                    if (!c->devices[parity1]->devobj || RtlCompareMemory(hash1, th1, Vcb->csum_size) != Vcb->csum_size || th1->address != addr) {
                        RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                      scratch, Vcb->superblock.node_size);

                        if (c->devices[parity1]->devobj) {
                            // fix parity 1

                            // rebuild parity1 as the XOR of all (now repaired) data stripes
                            stripe = (parity1 + 2) % c->chunk_item->num_stripes;

                            RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          Vcb->superblock.node_size);

                            stripe = (stripe + 1) % c->chunk_item->num_stripes;

                            while (stripe != parity1) {
                                do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       Vcb->superblock.node_size);

                                stripe = (stripe + 1) % c->chunk_item->num_stripes;
                            }

                            context->stripes[parity1].rewrite = true;

                            log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
                            log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
                        }
                    } else {
                        // the parity1 candidate is good; use it
                        RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                      &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);

                        if (!c->devices[parity2]->devobj || RtlCompareMemory(hash2, th2, Vcb->csum_size) != Vcb->csum_size || th2->address != addr) {
                            // fix parity 2
                            stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);

                            if (c->devices[parity2]->devobj) {
                                // rebuild parity2 in place from the repaired data, walking backwards
                                // and doubling between stripes
                                RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                              &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                              Vcb->superblock.node_size);

                                stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);

                                while (stripe != parity2) {
                                    galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], Vcb->superblock.node_size);

                                    do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                           Vcb->superblock.node_size);

                                    stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
                                }

                                context->stripes[parity2].rewrite = true;

                                log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
                                log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
                            }
                        }
                    }

                    context->stripes[bad_stripe1].rewrite = true;

                    // the whole node has been repaired, so clear the error bits of its remaining sectors
                    RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);

                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
            } else {
                // data sector: same scheme as above, but validated against the stored sector checksum
                uint8_t hash1[MAX_HASH_SIZE];
                uint8_t hash2[MAX_HASH_SIZE];

                if (c->devices[parity1]->devobj)
                    get_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], hash1);

                if (c->devices[parity2]->devobj)
                    get_sector_csum(Vcb, scratch, hash2);

                if ((c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) ||
                    (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size)) {
                    // unlike the tree-block path, the parity2 candidate is preferred here when it is valid
                    if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) == Vcb->csum_size) {
                        RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                      scratch, Vcb->superblock.sector_size);

                        if (c->devices[parity1]->devobj && RtlCompareMemory(hash1, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
                            // fix parity 1

                            stripe = (parity1 + 2) % c->chunk_item->num_stripes;

                            RtlCopyMemory(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          Vcb->superblock.sector_size);

                            stripe = (stripe + 1) % c->chunk_item->num_stripes;

                            while (stripe != parity1) {
                                do_xor(&context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       Vcb->superblock.sector_size);

                                stripe = (stripe + 1) % c->chunk_item->num_stripes;
                            }

                            context->stripes[parity1].rewrite = true;

                            log_error(Vcb, addr, c->devices[parity1]->devitem.dev_id, false, true, true);
                            log_device_error(Vcb, c->devices[parity1], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
                        }
                    } else {
                        RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                      &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);

                        if (c->devices[parity2]->devobj && RtlCompareMemory(hash2, (uint8_t*)context->csum + (bad_off1 * Vcb->csum_size), Vcb->csum_size) != Vcb->csum_size) {
                            // fix parity 2
                            stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);

                            RtlCopyMemory(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                          Vcb->superblock.sector_size);

                            stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);

                            while (stripe != parity2) {
                                galois_double(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], Vcb->superblock.sector_size);

                                do_xor(&context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                       Vcb->superblock.sector_size);

                                stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
                            }

                            context->stripes[parity2].rewrite = true;

                            log_error(Vcb, addr, c->devices[parity2]->devitem.dev_id, false, true, true);
                            log_device_error(Vcb, c->devices[parity2], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
                        }
                    }

                    context->stripes[bad_stripe1].rewrite = true;

                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
            }

            ExFreePool(scratch);
        } else if (num_errors == 2 && missing_devices == 0) {
            // two bad data stripes with both parities readable: solve for both using parity1 and parity2.
            // x and y are the indices of bad_stripe1 and bad_stripe2, counted down from
            // num_stripes - 3 for the stripe just before parity1.
            uint16_t x = 0, y = 0, k;
            uint64_t addr;
            uint32_t len = (RtlCheckBit(&context->is_tree, bad_off1) || RtlCheckBit(&context->is_tree, bad_off2)) ? Vcb->superblock.node_size : Vcb->superblock.sector_size;
            uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
            uint32_t j;

            stripe = parity1 == 0 ? (c->chunk_item->num_stripes - 1) : (parity1 - 1);

            // put qxy in parity_scratch
            // put pxy in parity_scratch2
            // (pxy / qxy are the two parities recomputed over the good data stripes only)

            k = c->chunk_item->num_stripes - 3;
            if (stripe == bad_stripe1 || stripe == bad_stripe2) {
                RtlZeroMemory(&context->parity_scratch[i << Vcb->sector_shift], len);
                RtlZeroMemory(&context->parity_scratch2[i << Vcb->sector_shift], len);

                if (stripe == bad_stripe1)
                    x = k;
                else
                    y = k;
            } else {
                RtlCopyMemory(&context->parity_scratch[i << Vcb->sector_shift],
                              &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
                RtlCopyMemory(&context->parity_scratch2[i << Vcb->sector_shift],
                              &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
            }

            stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);

            k--;
            do {
                // only the qxy accumulator is doubled between stripes; pxy is a plain XOR
                galois_double(&context->parity_scratch[i << Vcb->sector_shift], len);

                if (stripe != bad_stripe1 && stripe != bad_stripe2) {
                    do_xor(&context->parity_scratch[i << Vcb->sector_shift],
                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
                    do_xor(&context->parity_scratch2[i << Vcb->sector_shift],
                           &context->stripes[stripe].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);
                } else if (stripe == bad_stripe1)
                    x = k;
                else if (stripe == bad_stripe2)
                    y = k;

                stripe = stripe == 0 ? (c->chunk_item->num_stripes - 1) : (stripe - 1);
                k--;
            } while (stripe != parity2);

            // coefficients for the two-unknown solve; gpow2 / gdiv / gmul are presumably the
            // GF(2^8) power-of-two, divide and multiply helpers - confirm against their definitions
            gyx = gpow2(y > x ? (y-x) : (255-x+y));
            gx = gpow2(255-x);

            denom = gdiv(1, gyx ^ 1);
            a = gmul(gyx, denom);
            b = gmul(gx, denom);

            p = &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)];
            q = &context->stripes[parity2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)];
            pxy = &context->parity_scratch2[i << Vcb->sector_shift];
            qxy = &context->parity_scratch[i << Vcb->sector_shift];

            // overwrite qxy byte by byte with the candidate contents of bad_stripe1
            for (j = 0; j < len; j++) {
                *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);

                p++;
                q++;
                pxy++;
                qxy++;
            }

            // parity_scratch2 = pxy ^ (bad_stripe1 candidate) ^ parity1, i.e. the candidate for bad_stripe2
            do_xor(&context->parity_scratch2[i << Vcb->sector_shift], &context->parity_scratch[i << Vcb->sector_shift], len);
            do_xor(&context->parity_scratch2[i << Vcb->sector_shift], &context->stripes[parity1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)], len);

            // validate and apply the candidate for the first bad stripe (held in parity_scratch)
            addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off1 << Vcb->sector_shift);

            if (RtlCheckBit(&context->is_tree, bad_off1)) {
                tree_header* th = (tree_header*)&context->parity_scratch[i << Vcb->sector_shift];

                if (check_tree_checksum(Vcb, th) && th->address == addr) {
                    RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.node_size);

                    context->stripes[bad_stripe1].rewrite = true;

                    RtlClearBits(&context->stripes[bad_stripe1].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);

                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, true, false, false);
            } else {
                if (check_sector_csum(Vcb, &context->parity_scratch[i << Vcb->sector_shift], (uint8_t*)context->csum + (Vcb->csum_size * bad_off1))) {
                    RtlCopyMemory(&context->stripes[bad_stripe1].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch[i << Vcb->sector_shift], Vcb->superblock.sector_size);

                    context->stripes[bad_stripe1].rewrite = true;

                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe1]->devitem.dev_id, false, false, false);
            }

            // validate and apply the candidate for the second bad stripe (held in parity_scratch2)
            addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (bad_off2 << Vcb->sector_shift);

            if (RtlCheckBit(&context->is_tree, bad_off2)) {
                tree_header* th = (tree_header*)&context->parity_scratch2[i << Vcb->sector_shift];

                if (check_tree_checksum(Vcb, th) && th->address == addr) {
                    RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.node_size);

                    context->stripes[bad_stripe2].rewrite = true;

                    RtlClearBits(&context->stripes[bad_stripe2].error, i + 1, (Vcb->superblock.node_size >> Vcb->sector_shift) - 1);

                    log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, true, false, false);
            } else {
                if (check_sector_csum(Vcb, &context->parity_scratch2[i << Vcb->sector_shift], (uint8_t*)context->csum + (Vcb->csum_size * bad_off2))) {
                    RtlCopyMemory(&context->stripes[bad_stripe2].buf[(num * c->chunk_item->stripe_length) + (i << Vcb->sector_shift)],
                                  &context->parity_scratch2[i << Vcb->sector_shift], Vcb->superblock.sector_size);

                    context->stripes[bad_stripe2].rewrite = true;

                    log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, true, false);
                } else
                    log_error(Vcb, addr, c->devices[bad_stripe2]->devitem.dev_id, false, false, false);
            }
        } else {
            // more failures than can be repaired here (three or more bad data stripes, or two with
            // a device missing): log each flagged sector on a present device without repairing it
            stripe = (parity2 + 1) % c->chunk_item->num_stripes;
            off = (ULONG)((bit_start + num - stripe_start) * sectors_per_stripe * (c->chunk_item->num_stripes - 2)) + i;

            while (stripe != parity1) {
                if (c->devices[stripe]->devobj && RtlCheckBit(&context->alloc, off)) {
                    if (RtlCheckBit(&context->stripes[stripe].error, i)) {
                        uint64_t addr = c->offset + (stripe_start * (c->chunk_item->num_stripes - 2) * c->chunk_item->stripe_length) + (off << Vcb->sector_shift);

                        log_error(Vcb, addr, c->devices[stripe]->devitem.dev_id, RtlCheckBit(&context->is_tree, off), false, false);
                    }
                }

                off += sectors_per_stripe;
                stripe = (stripe + 1) % c->chunk_item->num_stripes;
            }
        }
    }
}
2424
scrub_chunk_raid56_stripe_run(device_extension * Vcb,chunk * c,uint64_t stripe_start,uint64_t stripe_end)2425 static NTSTATUS scrub_chunk_raid56_stripe_run(device_extension* Vcb, chunk* c, uint64_t stripe_start, uint64_t stripe_end) {
2426 NTSTATUS Status;
2427 KEY searchkey;
2428 traverse_ptr tp;
2429 bool b;
2430 uint64_t run_start, run_end, full_stripe_len, stripe;
2431 uint32_t max_read, num_sectors;
2432 ULONG arrlen, *allocarr, *csumarr = NULL, *treearr, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2433 scrub_context_raid56 context;
2434 uint16_t i;
2435 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&c->chunk_item[1];
2436
2437 TRACE("(%p, %p, %I64x, %I64x)\n", Vcb, c, stripe_start, stripe_end);
2438
2439 full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2440 run_start = c->offset + (stripe_start * full_stripe_len);
2441 run_end = c->offset + ((stripe_end + 1) * full_stripe_len);
2442
2443 searchkey.obj_id = run_start;
2444 searchkey.obj_type = TYPE_METADATA_ITEM;
2445 searchkey.offset = 0xffffffffffffffff;
2446
2447 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2448 if (!NT_SUCCESS(Status)) {
2449 ERR("find_item returned %08lx\n", Status);
2450 return Status;
2451 }
2452
2453 num_sectors = (uint32_t)(((stripe_end - stripe_start + 1) * full_stripe_len) >> Vcb->sector_shift);
2454 arrlen = (ULONG)sector_align((num_sectors / 8) + 1, sizeof(ULONG));
2455
2456 allocarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2457 if (!allocarr) {
2458 ERR("out of memory\n");
2459 return STATUS_INSUFFICIENT_RESOURCES;
2460 }
2461
2462 treearr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2463 if (!treearr) {
2464 ERR("out of memory\n");
2465 ExFreePool(allocarr);
2466 return STATUS_INSUFFICIENT_RESOURCES;
2467 }
2468
2469 RtlInitializeBitMap(&context.alloc, allocarr, num_sectors);
2470 RtlClearAllBits(&context.alloc);
2471
2472 RtlInitializeBitMap(&context.is_tree, treearr, num_sectors);
2473 RtlClearAllBits(&context.is_tree);
2474
2475 context.parity_scratch = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2476 if (!context.parity_scratch) {
2477 ERR("out of memory\n");
2478 ExFreePool(allocarr);
2479 ExFreePool(treearr);
2480 return STATUS_INSUFFICIENT_RESOURCES;
2481 }
2482
2483 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2484 csumarr = ExAllocatePoolWithTag(PagedPool, arrlen, ALLOC_TAG);
2485 if (!csumarr) {
2486 ERR("out of memory\n");
2487 ExFreePool(allocarr);
2488 ExFreePool(treearr);
2489 ExFreePool(context.parity_scratch);
2490 return STATUS_INSUFFICIENT_RESOURCES;
2491 }
2492
2493 RtlInitializeBitMap(&context.has_csum, csumarr, num_sectors);
2494 RtlClearAllBits(&context.has_csum);
2495
2496 context.csum = ExAllocatePoolWithTag(PagedPool, num_sectors * Vcb->csum_size, ALLOC_TAG);
2497 if (!context.csum) {
2498 ERR("out of memory\n");
2499 ExFreePool(allocarr);
2500 ExFreePool(treearr);
2501 ExFreePool(context.parity_scratch);
2502 ExFreePool(csumarr);
2503 return STATUS_INSUFFICIENT_RESOURCES;
2504 }
2505 }
2506
2507 if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2508 context.parity_scratch2 = ExAllocatePoolWithTag(PagedPool, (ULONG)c->chunk_item->stripe_length, ALLOC_TAG);
2509 if (!context.parity_scratch2) {
2510 ERR("out of memory\n");
2511 ExFreePool(allocarr);
2512 ExFreePool(treearr);
2513 ExFreePool(context.parity_scratch);
2514
2515 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2516 ExFreePool(csumarr);
2517 ExFreePool(context.csum);
2518 }
2519
2520 return STATUS_INSUFFICIENT_RESOURCES;
2521 }
2522 }
2523
2524 do {
2525 traverse_ptr next_tp;
2526
2527 if (tp.item->key.obj_id >= run_end)
2528 break;
2529
2530 if (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM) {
2531 uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2532
2533 if (tp.item->key.obj_id + size > run_start) {
2534 uint64_t extent_start = max(run_start, tp.item->key.obj_id);
2535 uint64_t extent_end = min(tp.item->key.obj_id + size, run_end);
2536 bool extent_is_tree = false;
2537
2538 RtlSetBits(&context.alloc, (ULONG)((extent_start - run_start) >> Vcb->sector_shift), (ULONG)((extent_end - extent_start) >> Vcb->sector_shift));
2539
2540 if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2541 extent_is_tree = true;
2542 else {
2543 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2544
2545 if (tp.item->size < sizeof(EXTENT_ITEM)) {
2546 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2547 Status = STATUS_INTERNAL_ERROR;
2548 goto end;
2549 }
2550
2551 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2552 extent_is_tree = true;
2553 }
2554
2555 if (extent_is_tree)
2556 RtlSetBits(&context.is_tree, (ULONG)((extent_start - run_start) >> Vcb->sector_shift), (ULONG)((extent_end - extent_start) >> Vcb->sector_shift));
2557 else if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2558 traverse_ptr tp2;
2559 bool b2;
2560
2561 searchkey.obj_id = EXTENT_CSUM_ID;
2562 searchkey.obj_type = TYPE_EXTENT_CSUM;
2563 searchkey.offset = extent_start;
2564
2565 Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
2566 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
2567 ERR("find_item returned %08lx\n", Status);
2568 goto end;
2569 }
2570
2571 do {
2572 traverse_ptr next_tp2;
2573
2574 if (tp2.item->key.offset >= extent_end)
2575 break;
2576
2577 if (tp2.item->key.offset >= extent_start) {
2578 uint64_t csum_start = max(extent_start, tp2.item->key.offset);
2579 uint64_t csum_end = min(extent_end, tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size));
2580
2581 RtlSetBits(&context.has_csum, (ULONG)((csum_start - run_start) >> Vcb->sector_shift), (ULONG)((csum_end - csum_start) >> Vcb->sector_shift));
2582
2583 RtlCopyMemory((uint8_t*)context.csum + (((csum_start - run_start) * Vcb->csum_size) >> Vcb->sector_shift),
2584 tp2.item->data + (((csum_start - tp2.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
2585 (ULONG)(((csum_end - csum_start) * Vcb->csum_size) >> Vcb->sector_shift));
2586 }
2587
2588 b2 = find_next_item(Vcb, &tp2, &next_tp2, false, NULL);
2589
2590 if (b2)
2591 tp2 = next_tp2;
2592 } while (b2);
2593 }
2594 }
2595 }
2596
2597 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2598
2599 if (b)
2600 tp = next_tp;
2601 } while (b);
2602
2603 context.stripes = ExAllocatePoolWithTag(PagedPool, sizeof(scrub_context_raid56_stripe) * c->chunk_item->num_stripes, ALLOC_TAG);
2604 if (!context.stripes) {
2605 ERR("out of memory\n");
2606 Status = STATUS_INSUFFICIENT_RESOURCES;
2607 goto end;
2608 }
2609
2610 max_read = (uint32_t)min(1048576 / c->chunk_item->stripe_length, stripe_end - stripe_start + 1); // only process 1 MB of data at a time
2611
2612 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2613 context.stripes[i].buf = ExAllocatePoolWithTag(PagedPool, (ULONG)(max_read * c->chunk_item->stripe_length), ALLOC_TAG);
2614 if (!context.stripes[i].buf) {
2615 uint64_t j;
2616
2617 ERR("out of memory\n");
2618
2619 for (j = 0; j < i; j++) {
2620 ExFreePool(context.stripes[j].buf);
2621 }
2622 ExFreePool(context.stripes);
2623
2624 Status = STATUS_INSUFFICIENT_RESOURCES;
2625 goto end;
2626 }
2627
2628 context.stripes[i].errorarr = ExAllocatePoolWithTag(PagedPool, (ULONG)sector_align(((c->chunk_item->stripe_length >> Vcb->sector_shift) / 8) + 1, sizeof(ULONG)), ALLOC_TAG);
2629 if (!context.stripes[i].errorarr) {
2630 uint64_t j;
2631
2632 ERR("out of memory\n");
2633
2634 ExFreePool(context.stripes[i].buf);
2635
2636 for (j = 0; j < i; j++) {
2637 ExFreePool(context.stripes[j].buf);
2638 }
2639 ExFreePool(context.stripes);
2640
2641 Status = STATUS_INSUFFICIENT_RESOURCES;
2642 goto end;
2643 }
2644
2645 RtlInitializeBitMap(&context.stripes[i].error, context.stripes[i].errorarr, (ULONG)(c->chunk_item->stripe_length >> Vcb->sector_shift));
2646
2647 context.stripes[i].context = &context;
2648 context.stripes[i].rewrite = false;
2649 }
2650
2651 stripe = stripe_start;
2652
2653 Status = STATUS_SUCCESS;
2654
2655 chunk_lock_range(Vcb, c, run_start, run_end - run_start);
2656
2657 do {
2658 ULONG read_stripes;
2659 uint16_t missing_devices = 0;
2660 bool need_wait = false;
2661
2662 if (max_read < stripe_end + 1 - stripe)
2663 read_stripes = max_read;
2664 else
2665 read_stripes = (ULONG)(stripe_end + 1 - stripe);
2666
2667 context.stripes_left = c->chunk_item->num_stripes;
2668
2669 // read megabyte by megabyte
2670 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2671 if (c->devices[i]->devobj) {
2672 PIO_STACK_LOCATION IrpSp;
2673
2674 context.stripes[i].Irp = IoAllocateIrp(c->devices[i]->devobj->StackSize, false);
2675
2676 if (!context.stripes[i].Irp) {
2677 ERR("IoAllocateIrp failed\n");
2678 Status = STATUS_INSUFFICIENT_RESOURCES;
2679 goto end3;
2680 }
2681
2682 context.stripes[i].Irp->MdlAddress = NULL;
2683
2684 IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2685 IrpSp->MajorFunction = IRP_MJ_READ;
2686 IrpSp->FileObject = c->devices[i]->fileobj;
2687
2688 if (c->devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2689 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(read_stripes * c->chunk_item->stripe_length), ALLOC_TAG);
2690 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2691 ERR("out of memory\n");
2692 Status = STATUS_INSUFFICIENT_RESOURCES;
2693 goto end3;
2694 }
2695
2696 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2697
2698 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2699 } else if (c->devices[i]->devobj->Flags & DO_DIRECT_IO) {
2700 context.stripes[i].Irp->MdlAddress = IoAllocateMdl(context.stripes[i].buf, (ULONG)(read_stripes * c->chunk_item->stripe_length), false, false, NULL);
2701 if (!context.stripes[i].Irp->MdlAddress) {
2702 ERR("IoAllocateMdl failed\n");
2703 Status = STATUS_INSUFFICIENT_RESOURCES;
2704 goto end3;
2705 }
2706
2707 Status = STATUS_SUCCESS;
2708
2709 _SEH2_TRY {
2710 MmProbeAndLockPages(context.stripes[i].Irp->MdlAddress, KernelMode, IoWriteAccess);
2711 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2712 Status = _SEH2_GetExceptionCode();
2713 } _SEH2_END;
2714
2715 if (!NT_SUCCESS(Status)) {
2716 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2717 IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2718 goto end3;
2719 }
2720 } else
2721 context.stripes[i].Irp->UserBuffer = context.stripes[i].buf;
2722
2723 context.stripes[i].offset = stripe * c->chunk_item->stripe_length;
2724
2725 IrpSp->Parameters.Read.Length = (ULONG)(read_stripes * c->chunk_item->stripe_length);
2726 IrpSp->Parameters.Read.ByteOffset.QuadPart = cis[i].offset + context.stripes[i].offset;
2727
2728 context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2729 context.stripes[i].missing = false;
2730
2731 IoSetCompletionRoutine(context.stripes[i].Irp, scrub_read_completion_raid56, &context.stripes[i], true, true, true);
2732
2733 Vcb->scrub.data_scrubbed += read_stripes * c->chunk_item->stripe_length;
2734 need_wait = true;
2735 } else {
2736 context.stripes[i].Irp = NULL;
2737 context.stripes[i].missing = true;
2738 missing_devices++;
2739 InterlockedDecrement(&context.stripes_left);
2740 }
2741 }
2742
2743 if (c->chunk_item->type & BLOCK_FLAG_RAID5 && missing_devices > 1) {
2744 ERR("too many missing devices (%u, maximum 1)\n", missing_devices);
2745 Status = STATUS_UNEXPECTED_IO_ERROR;
2746 goto end3;
2747 } else if (c->chunk_item->type & BLOCK_FLAG_RAID6 && missing_devices > 2) {
2748 ERR("too many missing devices (%u, maximum 2)\n", missing_devices);
2749 Status = STATUS_UNEXPECTED_IO_ERROR;
2750 goto end3;
2751 }
2752
2753 if (need_wait) {
2754 KeInitializeEvent(&context.Event, NotificationEvent, false);
2755
2756 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2757 if (c->devices[i]->devobj)
2758 IoCallDriver(c->devices[i]->devobj, context.stripes[i].Irp);
2759 }
2760
2761 KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2762 }
2763
2764 // return an error if any of the stripes returned an error
2765 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2766 if (!context.stripes[i].missing && !NT_SUCCESS(context.stripes[i].iosb.Status)) {
2767 Status = context.stripes[i].iosb.Status;
2768 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_READ_ERRORS);
2769 goto end3;
2770 }
2771 }
2772
2773 if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2774 for (i = 0; i < read_stripes; i++) {
2775 scrub_raid6_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2776 }
2777 } else {
2778 for (i = 0; i < read_stripes; i++) {
2779 scrub_raid5_stripe(Vcb, c, &context, stripe_start, stripe, i, missing_devices);
2780 }
2781 }
2782 stripe += read_stripes;
2783
2784 end3:
2785 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2786 if (context.stripes[i].Irp) {
2787 if (c->devices[i]->devobj->Flags & DO_DIRECT_IO && context.stripes[i].Irp->MdlAddress) {
2788 MmUnlockPages(context.stripes[i].Irp->MdlAddress);
2789 IoFreeMdl(context.stripes[i].Irp->MdlAddress);
2790 }
2791 IoFreeIrp(context.stripes[i].Irp);
2792 context.stripes[i].Irp = NULL;
2793
2794 if (context.stripes[i].rewrite) {
2795 Status = write_data_phys(c->devices[i]->devobj, c->devices[i]->fileobj, cis[i].offset + context.stripes[i].offset,
2796 context.stripes[i].buf, (uint32_t)(read_stripes * c->chunk_item->stripe_length));
2797
2798 if (!NT_SUCCESS(Status)) {
2799 ERR("write_data_phys returned %08lx\n", Status);
2800 log_device_error(Vcb, c->devices[i], BTRFS_DEV_STAT_WRITE_ERRORS);
2801 goto end2;
2802 }
2803 }
2804 }
2805 }
2806
2807 if (!NT_SUCCESS(Status))
2808 break;
2809 } while (stripe < stripe_end);
2810
2811 end2:
2812 chunk_unlock_range(Vcb, c, run_start, run_end - run_start);
2813
2814 for (i = 0; i < c->chunk_item->num_stripes; i++) {
2815 ExFreePool(context.stripes[i].buf);
2816 ExFreePool(context.stripes[i].errorarr);
2817 }
2818 ExFreePool(context.stripes);
2819
2820 end:
2821 ExFreePool(treearr);
2822 ExFreePool(allocarr);
2823 ExFreePool(context.parity_scratch);
2824
2825 if (c->chunk_item->type & BLOCK_FLAG_RAID6)
2826 ExFreePool(context.parity_scratch2);
2827
2828 if (c->chunk_item->type & BLOCK_FLAG_DATA) {
2829 ExFreePool(csumarr);
2830 ExFreePool(context.csum);
2831 }
2832
2833 return Status;
2834 }
2835
scrub_chunk_raid56(device_extension * Vcb,chunk * c,uint64_t * offset,bool * changed)2836 static NTSTATUS scrub_chunk_raid56(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2837 NTSTATUS Status;
2838 KEY searchkey;
2839 traverse_ptr tp;
2840 bool b;
2841 uint64_t full_stripe_len, stripe, stripe_start = 0, stripe_end = 0, total_data = 0;
2842 ULONG num_extents = 0, num_parity_stripes = c->chunk_item->type & BLOCK_FLAG_RAID6 ? 2 : 1;
2843
2844 full_stripe_len = (c->chunk_item->num_stripes - num_parity_stripes) * c->chunk_item->stripe_length;
2845 stripe = (*offset - c->offset) / full_stripe_len;
2846
2847 *offset = c->offset + (stripe * full_stripe_len);
2848
2849 searchkey.obj_id = *offset;
2850 searchkey.obj_type = TYPE_METADATA_ITEM;
2851 searchkey.offset = 0xffffffffffffffff;
2852
2853 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2854 if (!NT_SUCCESS(Status)) {
2855 ERR("find_item returned %08lx\n", Status);
2856 return Status;
2857 }
2858
2859 *changed = false;
2860
2861 do {
2862 traverse_ptr next_tp;
2863
2864 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2865 break;
2866
2867 if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2868 uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2869
2870 TRACE("%I64x\n", tp.item->key.obj_id);
2871
2872 if (size < Vcb->superblock.sector_size) {
2873 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
2874 return STATUS_INTERNAL_ERROR;
2875 }
2876
2877 stripe = (tp.item->key.obj_id - c->offset) / full_stripe_len;
2878
2879 if (*changed) {
2880 if (stripe > stripe_end + 1) {
2881 Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2882 if (!NT_SUCCESS(Status)) {
2883 ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2884 return Status;
2885 }
2886
2887 stripe_start = stripe;
2888 }
2889 } else
2890 stripe_start = stripe;
2891
2892 stripe_end = (tp.item->key.obj_id + size - 1 - c->offset) / full_stripe_len;
2893
2894 *changed = true;
2895
2896 total_data += size;
2897 num_extents++;
2898
2899 // only do so much at a time
2900 if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
2901 break;
2902 }
2903
2904 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
2905
2906 if (b)
2907 tp = next_tp;
2908 } while (b);
2909
2910 if (*changed) {
2911 Status = scrub_chunk_raid56_stripe_run(Vcb, c, stripe_start, stripe_end);
2912 if (!NT_SUCCESS(Status)) {
2913 ERR("scrub_chunk_raid56_stripe_run returned %08lx\n", Status);
2914 return Status;
2915 }
2916
2917 *offset = c->offset + ((stripe_end + 1) * full_stripe_len);
2918 }
2919
2920 return STATUS_SUCCESS;
2921 }
2922
scrub_chunk(device_extension * Vcb,chunk * c,uint64_t * offset,bool * changed)2923 static NTSTATUS scrub_chunk(device_extension* Vcb, chunk* c, uint64_t* offset, bool* changed) {
2924 NTSTATUS Status;
2925 KEY searchkey;
2926 traverse_ptr tp;
2927 bool b = false, tree_run = false;
2928 ULONG type, num_extents = 0;
2929 uint64_t total_data = 0, tree_run_start = 0, tree_run_end = 0;
2930
2931 TRACE("chunk %I64x\n", c->offset);
2932
2933 ExAcquireResourceSharedLite(&Vcb->tree_lock, true);
2934
2935 if (c->chunk_item->type & BLOCK_FLAG_DUPLICATE)
2936 type = BLOCK_FLAG_DUPLICATE;
2937 else if (c->chunk_item->type & BLOCK_FLAG_RAID0)
2938 type = BLOCK_FLAG_RAID0;
2939 else if (c->chunk_item->type & BLOCK_FLAG_RAID1)
2940 type = BLOCK_FLAG_DUPLICATE;
2941 else if (c->chunk_item->type & BLOCK_FLAG_RAID10)
2942 type = BLOCK_FLAG_RAID10;
2943 else if (c->chunk_item->type & BLOCK_FLAG_RAID5) {
2944 Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2945 goto end;
2946 } else if (c->chunk_item->type & BLOCK_FLAG_RAID6) {
2947 Status = scrub_chunk_raid56(Vcb, c, offset, changed);
2948 goto end;
2949 } else if (c->chunk_item->type & BLOCK_FLAG_RAID1C3)
2950 type = BLOCK_FLAG_DUPLICATE;
2951 else if (c->chunk_item->type & BLOCK_FLAG_RAID1C4)
2952 type = BLOCK_FLAG_DUPLICATE;
2953 else // SINGLE
2954 type = BLOCK_FLAG_DUPLICATE;
2955
2956 searchkey.obj_id = *offset;
2957 searchkey.obj_type = TYPE_METADATA_ITEM;
2958 searchkey.offset = 0xffffffffffffffff;
2959
2960 Status = find_item(Vcb, Vcb->extent_root, &tp, &searchkey, false, NULL);
2961 if (!NT_SUCCESS(Status)) {
2962 ERR("error - find_item returned %08lx\n", Status);
2963 goto end;
2964 }
2965
2966 do {
2967 traverse_ptr next_tp;
2968
2969 if (tp.item->key.obj_id >= c->offset + c->chunk_item->size)
2970 break;
2971
2972 if (tp.item->key.obj_id >= *offset && (tp.item->key.obj_type == TYPE_EXTENT_ITEM || tp.item->key.obj_type == TYPE_METADATA_ITEM)) {
2973 uint64_t size = tp.item->key.obj_type == TYPE_METADATA_ITEM ? Vcb->superblock.node_size : tp.item->key.offset;
2974 bool is_tree;
2975 void* csum = NULL;
2976 RTL_BITMAP bmp;
2977 ULONG* bmparr = NULL, bmplen;
2978
2979 TRACE("%I64x\n", tp.item->key.obj_id);
2980
2981 is_tree = false;
2982
2983 if (tp.item->key.obj_type == TYPE_METADATA_ITEM)
2984 is_tree = true;
2985 else {
2986 EXTENT_ITEM* ei = (EXTENT_ITEM*)tp.item->data;
2987
2988 if (tp.item->size < sizeof(EXTENT_ITEM)) {
2989 ERR("(%I64x,%x,%I64x) was %u bytes, expected at least %Iu\n", tp.item->key.obj_id, tp.item->key.obj_type, tp.item->key.offset, tp.item->size, sizeof(EXTENT_ITEM));
2990 Status = STATUS_INTERNAL_ERROR;
2991 goto end;
2992 }
2993
2994 if (ei->flags & EXTENT_ITEM_TREE_BLOCK)
2995 is_tree = true;
2996 }
2997
2998 if (size < Vcb->superblock.sector_size) {
2999 ERR("extent %I64x has size less than sector_size (%I64x < %x)\n", tp.item->key.obj_id, size, Vcb->superblock.sector_size);
3000 Status = STATUS_INTERNAL_ERROR;
3001 goto end;
3002 }
3003
3004 // load csum
3005 if (!is_tree) {
3006 traverse_ptr tp2;
3007
3008 csum = ExAllocatePoolWithTag(PagedPool, (ULONG)((Vcb->csum_size * size) >> Vcb->sector_shift), ALLOC_TAG);
3009 if (!csum) {
3010 ERR("out of memory\n");
3011 Status = STATUS_INSUFFICIENT_RESOURCES;
3012 goto end;
3013 }
3014
3015 bmplen = (ULONG)(size >> Vcb->sector_shift);
3016
3017 bmparr = ExAllocatePoolWithTag(PagedPool, (ULONG)(sector_align((bmplen >> 3) + 1, sizeof(ULONG))), ALLOC_TAG);
3018 if (!bmparr) {
3019 ERR("out of memory\n");
3020 ExFreePool(csum);
3021 Status = STATUS_INSUFFICIENT_RESOURCES;
3022 goto end;
3023 }
3024
3025 RtlInitializeBitMap(&bmp, bmparr, bmplen);
3026 RtlSetAllBits(&bmp); // 1 = no csum, 0 = csum
3027
3028 searchkey.obj_id = EXTENT_CSUM_ID;
3029 searchkey.obj_type = TYPE_EXTENT_CSUM;
3030 searchkey.offset = tp.item->key.obj_id;
3031
3032 Status = find_item(Vcb, Vcb->checksum_root, &tp2, &searchkey, false, NULL);
3033 if (!NT_SUCCESS(Status) && Status != STATUS_NOT_FOUND) {
3034 ERR("find_item returned %08lx\n", Status);
3035 ExFreePool(csum);
3036 ExFreePool(bmparr);
3037 goto end;
3038 }
3039
3040 if (Status != STATUS_NOT_FOUND) {
3041 do {
3042 traverse_ptr next_tp2;
3043
3044 if (tp2.item->key.obj_type == TYPE_EXTENT_CSUM) {
3045 if (tp2.item->key.offset >= tp.item->key.obj_id + size)
3046 break;
3047 else if (tp2.item->size >= Vcb->csum_size && tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size) >= tp.item->key.obj_id) {
3048 uint64_t cs = max(tp.item->key.obj_id, tp2.item->key.offset);
3049 uint64_t ce = min(tp.item->key.obj_id + size, tp2.item->key.offset + (((uint64_t)tp2.item->size << Vcb->sector_shift) / Vcb->csum_size));
3050
3051 RtlCopyMemory((uint8_t*)csum + (((cs - tp.item->key.obj_id) * Vcb->csum_size) >> Vcb->sector_shift),
3052 tp2.item->data + (((cs - tp2.item->key.offset) * Vcb->csum_size) >> Vcb->sector_shift),
3053 (ULONG)(((ce - cs) * Vcb->csum_size) >> Vcb->sector_shift));
3054
3055 RtlClearBits(&bmp, (ULONG)((cs - tp.item->key.obj_id) >> Vcb->sector_shift), (ULONG)((ce - cs) >> Vcb->sector_shift));
3056
3057 if (ce == tp.item->key.obj_id + size)
3058 break;
3059 }
3060 }
3061
3062 if (find_next_item(Vcb, &tp2, &next_tp2, false, NULL))
3063 tp2 = next_tp2;
3064 else
3065 break;
3066 } while (true);
3067 }
3068 }
3069
3070 if (tree_run) {
3071 if (!is_tree || tp.item->key.obj_id > tree_run_end) {
3072 Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3073 if (!NT_SUCCESS(Status)) {
3074 ERR("scrub_extent returned %08lx\n", Status);
3075 goto end;
3076 }
3077
3078 if (!is_tree)
3079 tree_run = false;
3080 else {
3081 tree_run_start = tp.item->key.obj_id;
3082 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3083 }
3084 } else
3085 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3086 } else if (is_tree) {
3087 tree_run = true;
3088 tree_run_start = tp.item->key.obj_id;
3089 tree_run_end = tp.item->key.obj_id + Vcb->superblock.node_size;
3090 }
3091
3092 if (!is_tree) {
3093 Status = scrub_data_extent(Vcb, c, tp.item->key.obj_id, type, csum, &bmp, bmplen);
3094 if (!NT_SUCCESS(Status)) {
3095 ERR("scrub_data_extent returned %08lx\n", Status);
3096 ExFreePool(csum);
3097 ExFreePool(bmparr);
3098 goto end;
3099 }
3100
3101 ExFreePool(csum);
3102 ExFreePool(bmparr);
3103 }
3104
3105 *offset = tp.item->key.obj_id + size;
3106 *changed = true;
3107
3108 total_data += size;
3109 num_extents++;
3110
3111 // only do so much at a time
3112 if (num_extents >= 64 || total_data >= 0x8000000) // 128 MB
3113 break;
3114 }
3115
3116 b = find_next_item(Vcb, &tp, &next_tp, false, NULL);
3117
3118 if (b)
3119 tp = next_tp;
3120 } while (b);
3121
3122 if (tree_run) {
3123 Status = scrub_extent(Vcb, c, type, tree_run_start, (uint32_t)(tree_run_end - tree_run_start), NULL);
3124 if (!NT_SUCCESS(Status)) {
3125 ERR("scrub_extent returned %08lx\n", Status);
3126 goto end;
3127 }
3128 }
3129
3130 Status = STATUS_SUCCESS;
3131
3132 end:
3133 ExReleaseResourceLite(&Vcb->tree_lock);
3134
3135 return Status;
3136 }
3137
_Function_class_(KSTART_ROUTINE)3138 _Function_class_(KSTART_ROUTINE)
3139 static void __stdcall scrub_thread(void* context) {
3140 device_extension* Vcb = context;
3141 LIST_ENTRY chunks, *le;
3142 NTSTATUS Status;
3143 LARGE_INTEGER time;
3144
3145 KeInitializeEvent(&Vcb->scrub.finished, NotificationEvent, false);
3146
3147 InitializeListHead(&chunks);
3148
3149 ExAcquireResourceExclusiveLite(&Vcb->tree_lock, true);
3150
3151 if (Vcb->need_write && !Vcb->readonly)
3152 Status = do_write(Vcb, NULL);
3153 else
3154 Status = STATUS_SUCCESS;
3155
3156 free_trees(Vcb);
3157
3158 if (!NT_SUCCESS(Status)) {
3159 ExReleaseResourceLite(&Vcb->tree_lock);
3160 ERR("do_write returned %08lx\n", Status);
3161 Vcb->scrub.error = Status;
3162 goto end;
3163 }
3164
3165 ExConvertExclusiveToSharedLite(&Vcb->tree_lock);
3166
3167 ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3168
3169 KeQuerySystemTime(&Vcb->scrub.start_time);
3170 Vcb->scrub.finish_time.QuadPart = 0;
3171 Vcb->scrub.resume_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3172 Vcb->scrub.duration.QuadPart = 0;
3173 Vcb->scrub.total_chunks = 0;
3174 Vcb->scrub.chunks_left = 0;
3175 Vcb->scrub.data_scrubbed = 0;
3176 Vcb->scrub.num_errors = 0;
3177
3178 while (!IsListEmpty(&Vcb->scrub.errors)) {
3179 scrub_error* err = CONTAINING_RECORD(RemoveHeadList(&Vcb->scrub.errors), scrub_error, list_entry);
3180 ExFreePool(err);
3181 }
3182
3183 ExAcquireResourceSharedLite(&Vcb->chunk_lock, true);
3184
3185 le = Vcb->chunks.Flink;
3186 while (le != &Vcb->chunks) {
3187 chunk* c = CONTAINING_RECORD(le, chunk, list_entry);
3188
3189 acquire_chunk_lock(c, Vcb);
3190
3191 if (!c->readonly) {
3192 InsertTailList(&chunks, &c->list_entry_balance);
3193 Vcb->scrub.total_chunks++;
3194 Vcb->scrub.chunks_left++;
3195 }
3196
3197 release_chunk_lock(c, Vcb);
3198
3199 le = le->Flink;
3200 }
3201
3202 ExReleaseResourceLite(&Vcb->chunk_lock);
3203
3204 ExReleaseResource(&Vcb->scrub.stats_lock);
3205
3206 ExReleaseResourceLite(&Vcb->tree_lock);
3207
3208 while (!IsListEmpty(&chunks)) {
3209 chunk* c = CONTAINING_RECORD(RemoveHeadList(&chunks), chunk, list_entry_balance);
3210 uint64_t offset = c->offset;
3211 bool changed;
3212
3213 c->reloc = true;
3214
3215 KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3216
3217 if (!Vcb->scrub.stopping) {
3218 do {
3219 changed = false;
3220
3221 Status = scrub_chunk(Vcb, c, &offset, &changed);
3222 if (!NT_SUCCESS(Status)) {
3223 ERR("scrub_chunk returned %08lx\n", Status);
3224 Vcb->scrub.stopping = true;
3225 Vcb->scrub.error = Status;
3226 break;
3227 }
3228
3229 if (offset == c->offset + c->chunk_item->size || Vcb->scrub.stopping)
3230 break;
3231
3232 KeWaitForSingleObject(&Vcb->scrub.event, Executive, KernelMode, false, NULL);
3233 } while (changed);
3234 }
3235
3236 ExAcquireResourceExclusiveLite(&Vcb->scrub.stats_lock, true);
3237
3238 if (!Vcb->scrub.stopping)
3239 Vcb->scrub.chunks_left--;
3240
3241 if (IsListEmpty(&chunks))
3242 KeQuerySystemTime(&Vcb->scrub.finish_time);
3243
3244 ExReleaseResource(&Vcb->scrub.stats_lock);
3245
3246 c->reloc = false;
3247 c->list_entry_balance.Flink = NULL;
3248 }
3249
3250 KeQuerySystemTime(&time);
3251 Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3252
3253 end:
3254 ZwClose(Vcb->scrub.thread);
3255 Vcb->scrub.thread = NULL;
3256
3257 KeSetEvent(&Vcb->scrub.finished, 0, false);
3258 }
3259
start_scrub(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3260 NTSTATUS start_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3261 NTSTATUS Status;
3262 OBJECT_ATTRIBUTES oa;
3263
3264 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3265 return STATUS_PRIVILEGE_NOT_HELD;
3266
3267 if (Vcb->locked) {
3268 WARN("cannot start scrub while locked\n");
3269 return STATUS_DEVICE_NOT_READY;
3270 }
3271
3272 if (Vcb->balance.thread) {
3273 WARN("cannot start scrub while balance running\n");
3274 return STATUS_DEVICE_NOT_READY;
3275 }
3276
3277 if (Vcb->scrub.thread) {
3278 WARN("scrub already running\n");
3279 return STATUS_DEVICE_NOT_READY;
3280 }
3281
3282 if (Vcb->readonly)
3283 return STATUS_MEDIA_WRITE_PROTECTED;
3284
3285 Vcb->scrub.stopping = false;
3286 Vcb->scrub.paused = false;
3287 Vcb->scrub.error = STATUS_SUCCESS;
3288 KeInitializeEvent(&Vcb->scrub.event, NotificationEvent, !Vcb->scrub.paused);
3289
3290 InitializeObjectAttributes(&oa, NULL, OBJ_KERNEL_HANDLE, NULL, NULL);
3291
3292 Status = PsCreateSystemThread(&Vcb->scrub.thread, 0, &oa, NULL, NULL, scrub_thread, Vcb);
3293 if (!NT_SUCCESS(Status)) {
3294 ERR("PsCreateSystemThread returned %08lx\n", Status);
3295 return Status;
3296 }
3297
3298 return STATUS_SUCCESS;
3299 }
3300
query_scrub(device_extension * Vcb,KPROCESSOR_MODE processor_mode,void * data,ULONG length)3301 NTSTATUS query_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode, void* data, ULONG length) {
3302 btrfs_query_scrub* bqs = (btrfs_query_scrub*)data;
3303 ULONG len;
3304 NTSTATUS Status;
3305 LIST_ENTRY* le;
3306 btrfs_scrub_error* bse = NULL;
3307
3308 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3309 return STATUS_PRIVILEGE_NOT_HELD;
3310
3311 if (length < offsetof(btrfs_query_scrub, errors))
3312 return STATUS_BUFFER_TOO_SMALL;
3313
3314 ExAcquireResourceSharedLite(&Vcb->scrub.stats_lock, true);
3315
3316 if (Vcb->scrub.thread && Vcb->scrub.chunks_left > 0)
3317 bqs->status = Vcb->scrub.paused ? BTRFS_SCRUB_PAUSED : BTRFS_SCRUB_RUNNING;
3318 else
3319 bqs->status = BTRFS_SCRUB_STOPPED;
3320
3321 bqs->start_time.QuadPart = Vcb->scrub.start_time.QuadPart;
3322 bqs->finish_time.QuadPart = Vcb->scrub.finish_time.QuadPart;
3323 bqs->chunks_left = Vcb->scrub.chunks_left;
3324 bqs->total_chunks = Vcb->scrub.total_chunks;
3325 bqs->data_scrubbed = Vcb->scrub.data_scrubbed;
3326
3327 bqs->duration = Vcb->scrub.duration.QuadPart;
3328
3329 if (bqs->status == BTRFS_SCRUB_RUNNING) {
3330 LARGE_INTEGER time;
3331
3332 KeQuerySystemTime(&time);
3333 bqs->duration += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3334 }
3335
3336 bqs->error = Vcb->scrub.error;
3337
3338 bqs->num_errors = Vcb->scrub.num_errors;
3339
3340 len = length - offsetof(btrfs_query_scrub, errors);
3341
3342 le = Vcb->scrub.errors.Flink;
3343 while (le != &Vcb->scrub.errors) {
3344 scrub_error* err = CONTAINING_RECORD(le, scrub_error, list_entry);
3345 ULONG errlen;
3346
3347 if (err->is_metadata)
3348 errlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3349 else
3350 errlen = offsetof(btrfs_scrub_error, data.filename) + err->data.filename_length;
3351
3352 if (len < errlen) {
3353 Status = STATUS_BUFFER_OVERFLOW;
3354 goto end;
3355 }
3356
3357 if (!bse)
3358 bse = &bqs->errors;
3359 else {
3360 ULONG lastlen;
3361
3362 if (bse->is_metadata)
3363 lastlen = offsetof(btrfs_scrub_error, metadata.firstitem) + sizeof(KEY);
3364 else
3365 lastlen = offsetof(btrfs_scrub_error, data.filename) + bse->data.filename_length;
3366
3367 bse->next_entry = lastlen;
3368 bse = (btrfs_scrub_error*)(((uint8_t*)bse) + lastlen);
3369 }
3370
3371 bse->next_entry = 0;
3372 bse->address = err->address;
3373 bse->device = err->device;
3374 bse->recovered = err->recovered;
3375 bse->is_metadata = err->is_metadata;
3376 bse->parity = err->parity;
3377
3378 if (err->is_metadata) {
3379 bse->metadata.root = err->metadata.root;
3380 bse->metadata.level = err->metadata.level;
3381 bse->metadata.firstitem = err->metadata.firstitem;
3382 } else {
3383 bse->data.subvol = err->data.subvol;
3384 bse->data.offset = err->data.offset;
3385 bse->data.filename_length = err->data.filename_length;
3386 RtlCopyMemory(bse->data.filename, err->data.filename, err->data.filename_length);
3387 }
3388
3389 len -= errlen;
3390 le = le->Flink;
3391 }
3392
3393 Status = STATUS_SUCCESS;
3394
3395 end:
3396 ExReleaseResourceLite(&Vcb->scrub.stats_lock);
3397
3398 return Status;
3399 }
3400
pause_scrub(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3401 NTSTATUS pause_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3402 LARGE_INTEGER time;
3403
3404 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3405 return STATUS_PRIVILEGE_NOT_HELD;
3406
3407 if (!Vcb->scrub.thread)
3408 return STATUS_DEVICE_NOT_READY;
3409
3410 if (Vcb->scrub.paused)
3411 return STATUS_DEVICE_NOT_READY;
3412
3413 Vcb->scrub.paused = true;
3414 KeClearEvent(&Vcb->scrub.event);
3415
3416 KeQuerySystemTime(&time);
3417 Vcb->scrub.duration.QuadPart += time.QuadPart - Vcb->scrub.resume_time.QuadPart;
3418
3419 return STATUS_SUCCESS;
3420 }
3421
resume_scrub(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3422 NTSTATUS resume_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3423 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3424 return STATUS_PRIVILEGE_NOT_HELD;
3425
3426 if (!Vcb->scrub.thread)
3427 return STATUS_DEVICE_NOT_READY;
3428
3429 if (!Vcb->scrub.paused)
3430 return STATUS_DEVICE_NOT_READY;
3431
3432 Vcb->scrub.paused = false;
3433 KeSetEvent(&Vcb->scrub.event, 0, false);
3434
3435 KeQuerySystemTime(&Vcb->scrub.resume_time);
3436
3437 return STATUS_SUCCESS;
3438 }
3439
stop_scrub(device_extension * Vcb,KPROCESSOR_MODE processor_mode)3440 NTSTATUS stop_scrub(device_extension* Vcb, KPROCESSOR_MODE processor_mode) {
3441 if (!SeSinglePrivilegeCheck(RtlConvertLongToLuid(SE_MANAGE_VOLUME_PRIVILEGE), processor_mode))
3442 return STATUS_PRIVILEGE_NOT_HELD;
3443
3444 if (!Vcb->scrub.thread)
3445 return STATUS_DEVICE_NOT_READY;
3446
3447 Vcb->scrub.paused = false;
3448 Vcb->scrub.stopping = true;
3449 KeSetEvent(&Vcb->scrub.event, 0, false);
3450
3451 return STATUS_SUCCESS;
3452 }
3453