xref: /reactos/drivers/filesystems/btrfs/read.c (revision 6e0cf03d)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "xxhash.h"
20 #include "crc32c.h"
21 
22 enum read_data_status {
23     ReadDataStatus_Pending,
24     ReadDataStatus_Success,
25     ReadDataStatus_Error,
26     ReadDataStatus_MissingDevice,
27     ReadDataStatus_Skip
28 };
29 
30 struct read_data_context;
31 
32 typedef struct {
33     struct read_data_context* context;
34     uint16_t stripenum;
35     bool rewrite;
36     PIRP Irp;
37     IO_STATUS_BLOCK iosb;
38     enum read_data_status status;
39     PMDL mdl;
40     uint64_t stripestart;
41     uint64_t stripeend;
42 } read_data_stripe;
43 
44 typedef struct {
45     KEVENT Event;
46     NTSTATUS Status;
47     chunk* c;
48     uint64_t address;
49     uint32_t buflen;
50     LONG num_stripes, stripes_left;
51     uint64_t type;
52     uint32_t sector_size;
53     uint16_t firstoff, startoffstripe, sectors_per_stripe;
54     void* csum;
55     bool tree;
56     read_data_stripe* stripes;
57     uint8_t* va;
58 } read_data_context;
59 
60 extern bool diskacc;
61 extern tPsUpdateDiskCounters fPsUpdateDiskCounters;
62 extern tCcCopyReadEx fCcCopyReadEx;
63 extern tFsRtlUpdateDiskCounters fFsRtlUpdateDiskCounters;
64 
65 #define LZO_PAGE_SIZE 4096
66 
67 _Function_class_(IO_COMPLETION_ROUTINE)
68 static NTSTATUS __stdcall read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
69     read_data_stripe* stripe = conptr;
70     read_data_context* context = (read_data_context*)stripe->context;
71 
72     UNUSED(DeviceObject);
73 
74     stripe->iosb = Irp->IoStatus;
75 
76     if (NT_SUCCESS(Irp->IoStatus.Status))
77         stripe->status = ReadDataStatus_Success;
78     else
79         stripe->status = ReadDataStatus_Error;
80 
81     if (InterlockedDecrement(&context->stripes_left) == 0)
82         KeSetEvent(&context->Event, 0, false);
83 
84     return STATUS_MORE_PROCESSING_REQUIRED;
85 }
86 
87 NTSTATUS check_csum(device_extension* Vcb, uint8_t* data, uint32_t sectors, void* csum) {
88     void* csum2;
89 
90     csum2 = ExAllocatePoolWithTag(PagedPool, Vcb->csum_size * sectors, ALLOC_TAG);
91     if (!csum2) {
92         ERR("out of memory\n");
93         return STATUS_INSUFFICIENT_RESOURCES;
94     }
95 
96     do_calc_job(Vcb, data, sectors, csum2);
97 
98     if (RtlCompareMemory(csum2, csum, sectors * Vcb->csum_size) != sectors * Vcb->csum_size) {
99         ExFreePool(csum2);
100         return STATUS_CRC_ERROR;
101     }
102 
103     ExFreePool(csum2);
104 
105     return STATUS_SUCCESS;
106 }
107 
108 void get_tree_checksum(device_extension* Vcb, tree_header* th, void* csum) {
109     switch (Vcb->superblock.csum_type) {
110         case CSUM_TYPE_CRC32C:
111             *(uint32_t*)csum = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
112         break;
113 
114         case CSUM_TYPE_XXHASH:
115             *(uint64_t*)csum = XXH64((uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum), 0);
116         break;
117 
118         case CSUM_TYPE_SHA256:
119             calc_sha256(csum, &th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
120         break;
121 
122         case CSUM_TYPE_BLAKE2:
123             blake2b(csum, BLAKE2_HASH_SIZE, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
124         break;
125     }
126 }
127 
128 bool check_tree_checksum(device_extension* Vcb, tree_header* th) {
129     switch (Vcb->superblock.csum_type) {
130         case CSUM_TYPE_CRC32C: {
131             uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
132 
133             if (crc32 == *((uint32_t*)th->csum))
134                 return true;
135 
136             WARN("hash was %08x, expected %08x\n", crc32, *((uint32_t*)th->csum));
137 
138             break;
139         }
140 
141         case CSUM_TYPE_XXHASH: {
142             uint64_t hash = XXH64((uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum), 0);
143 
144             if (hash == *((uint64_t*)th->csum))
145                 return true;
146 
147             WARN("hash was %I64x, expected %I64x\n", hash, *((uint64_t*)th->csum));
148 
149             break;
150         }
151 
152         case CSUM_TYPE_SHA256: {
153             uint8_t hash[SHA256_HASH_SIZE];
154 
155             calc_sha256(hash, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
156 
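            /* csum is the first member of tree_header, so comparing against th here
               compares against the stored checksum bytes (likewise for the BLAKE2 case below). */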
157             if (RtlCompareMemory(hash, th, SHA256_HASH_SIZE) == SHA256_HASH_SIZE)
158                 return true;
159 
160             WARN("hash was invalid\n");
161 
162             break;
163         }
164 
165         case CSUM_TYPE_BLAKE2: {
166             uint8_t hash[BLAKE2_HASH_SIZE];
167 
168             blake2b(hash, sizeof(hash), (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
169 
170             if (RtlCompareMemory(hash, th, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE)
171                 return true;
172 
173             WARN("hash was invalid\n");
174 
175             break;
176         }
177     }
178 
179     return false;
180 }
181 
182 void get_sector_csum(device_extension* Vcb, void* buf, void* csum) {
183     switch (Vcb->superblock.csum_type) {
184         case CSUM_TYPE_CRC32C:
185             *(uint32_t*)csum = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);
186         break;
187 
188         case CSUM_TYPE_XXHASH:
189             *(uint64_t*)csum = XXH64(buf, Vcb->superblock.sector_size, 0);
190         break;
191 
192         case CSUM_TYPE_SHA256:
193             calc_sha256(csum, buf, Vcb->superblock.sector_size);
194         break;
195 
196         case CSUM_TYPE_BLAKE2:
197             blake2b(csum, BLAKE2_HASH_SIZE, buf, Vcb->superblock.sector_size);
198         break;
199     }
200 }
201 
202 bool check_sector_csum(device_extension* Vcb, void* buf, void* csum) {
203     switch (Vcb->superblock.csum_type) {
204         case CSUM_TYPE_CRC32C: {
205             uint32_t crc32 = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);
206 
207             return *(uint32_t*)csum == crc32;
208         }
209 
210         case CSUM_TYPE_XXHASH: {
211             uint64_t hash = XXH64(buf, Vcb->superblock.sector_size, 0);
212 
213             return *(uint64_t*)csum == hash;
214         }
215 
216         case CSUM_TYPE_SHA256: {
217             uint8_t hash[SHA256_HASH_SIZE];
218 
219             calc_sha256(hash, buf, Vcb->superblock.sector_size);
220 
221             return RtlCompareMemory(hash, csum, SHA256_HASH_SIZE) == SHA256_HASH_SIZE;
222         }
223 
224         case CSUM_TYPE_BLAKE2: {
225             uint8_t hash[BLAKE2_HASH_SIZE];
226 
227             blake2b(hash, sizeof(hash), buf, Vcb->superblock.sector_size);
228 
229             return RtlCompareMemory(hash, csum, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE;
230         }
231     }
232 
233     return false;
234 }
235 
236 static NTSTATUS read_data_dup(device_extension* Vcb, uint8_t* buf, uint64_t addr, read_data_context* context, CHUNK_ITEM* ci,
237                               device** devices, uint64_t generation) {
238     bool checksum_error = false;
239     uint16_t j, stripe = 0;
240     NTSTATUS Status;
241     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
242 
243     for (j = 0; j < ci->num_stripes; j++) {
244         if (context->stripes[j].status == ReadDataStatus_Error) {
245             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
246             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
247             return context->stripes[j].iosb.Status;
248         } else if (context->stripes[j].status == ReadDataStatus_Success) {
249             stripe = j;
250             break;
251         }
252     }
253 
254     if (context->stripes[stripe].status != ReadDataStatus_Success)
255         return STATUS_INTERNAL_ERROR;
256 
257     if (context->tree) {
258         tree_header* th = (tree_header*)buf;
259 
260         if (th->address != context->address || !check_tree_checksum(Vcb, th)) {
261             checksum_error = true;
262             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
263         } else if (generation != 0 && th->generation != generation) {
264             checksum_error = true;
265             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
266         }
267     } else if (context->csum) {
268         Status = check_csum(Vcb, buf, (ULONG)context->stripes[stripe].Irp->IoStatus.Information / context->sector_size, context->csum);
269 
270         if (Status == STATUS_CRC_ERROR) {
271             checksum_error = true;
272             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
273         } else if (!NT_SUCCESS(Status)) {
274             ERR("check_csum returned %08lx\n", Status);
275             return Status;
276         }
277     }
278 
279     if (!checksum_error)
280         return STATUS_SUCCESS;
281 
282     if (ci->num_stripes == 1)
283         return STATUS_CRC_ERROR;
284 
285     if (context->tree) {
286         tree_header* t2;
287         bool recovered = false;
288 
289         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
290         if (!t2) {
291             ERR("out of memory\n");
292             return STATUS_INSUFFICIENT_RESOURCES;
293         }
294 
295         for (j = 0; j < ci->num_stripes; j++) {
296             if (j != stripe && devices[j] && devices[j]->devobj) {
297                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + context->stripes[stripe].stripestart,
298                                         Vcb->superblock.node_size, (uint8_t*)t2, false);
299                 if (!NT_SUCCESS(Status)) {
300                     WARN("sync_read_phys returned %08lx\n", Status);
301                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
302                 } else {
303                     bool checksum_error = !check_tree_checksum(Vcb, t2);
304 
305                     if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
306                         RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
307                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
308                         recovered = true;
309 
310                         if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
311                             Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + context->stripes[stripe].stripestart,
312                                                      t2, Vcb->superblock.node_size);
313                             if (!NT_SUCCESS(Status)) {
314                                 WARN("write_data_phys returned %08lx\n", Status);
315                                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
316                             }
317                         }
318 
319                         break;
320                     } else if (t2->address != addr || checksum_error)
321                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
322                     else
323                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_GENERATION_ERRORS);
324                 }
325             }
326         }
327 
328         if (!recovered) {
329             ERR("unrecoverable checksum error at %I64x\n", addr);
330             ExFreePool(t2);
331             return STATUS_CRC_ERROR;
332         }
333 
334         ExFreePool(t2);
335     } else {
336         ULONG sectors = (ULONG)context->stripes[stripe].Irp->IoStatus.Information >> Vcb->sector_shift;
337         uint8_t* sector;
338         void* ptr = context->csum;
339 
340         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
341         if (!sector) {
342             ERR("out of memory\n");
343             return STATUS_INSUFFICIENT_RESOURCES;
344         }
345 
346         for (ULONG i = 0; i < sectors; i++) {
347             if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
348                 bool recovered = false;
349 
350                 for (j = 0; j < ci->num_stripes; j++) {
351                     if (j != stripe && devices[j] && devices[j]->devobj) {
352                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj,
353                                                 cis[j].offset + context->stripes[stripe].stripestart + ((uint64_t)i << Vcb->sector_shift),
354                                                 Vcb->superblock.sector_size, sector, false);
355                         if (!NT_SUCCESS(Status)) {
356                             WARN("sync_read_phys returned %08lx\n", Status);
357                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
358                         } else {
359                             if (check_sector_csum(Vcb, sector, ptr)) {
360                                 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
361                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe]->devitem.dev_id);
362                                 recovered = true;
363 
364                                 if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
365                                     Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj,
366                                                              cis[stripe].offset + context->stripes[stripe].stripestart + ((uint64_t)i << Vcb->sector_shift),
367                                                              sector, Vcb->superblock.sector_size);
368                                     if (!NT_SUCCESS(Status)) {
369                                         WARN("write_data_phys returned %08lx\n", Status);
370                                         log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
371                                     }
372                                 }
373 
374                                 break;
375                             } else
376                                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
377                         }
378                     }
379                 }
380 
381                 if (!recovered) {
382                     ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
383                     ExFreePool(sector);
384                     return STATUS_CRC_ERROR;
385                 }
386             }
387 
388             ptr = (uint8_t*)ptr + Vcb->csum_size;
389         }
390 
391         ExFreePool(sector);
392     }
393 
394     return STATUS_SUCCESS;
395 }
396 
397 static NTSTATUS read_data_raid0(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
398                                 CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
399     for (uint16_t i = 0; i < ci->num_stripes; i++) {
400         if (context->stripes[i].status == ReadDataStatus_Error) {
401             WARN("stripe %u returned error %08lx\n", i, context->stripes[i].iosb.Status);
402             log_device_error(Vcb, devices[i], BTRFS_DEV_STAT_READ_ERRORS);
403             return context->stripes[i].iosb.Status;
404         }
405     }
406 
407     if (context->tree) { // shouldn't happen, as trees shouldn't cross stripe boundaries
408         tree_header* th = (tree_header*)buf;
409         bool checksum_error = !check_tree_checksum(Vcb, th);
410 
411         if (checksum_error || addr != th->address || (generation != 0 && generation != th->generation)) {
412             uint64_t off;
413             uint16_t stripe;
414 
415             get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &off, &stripe);
416 
417             ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
418 
419             if (checksum_error) {
420                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
421                 return STATUS_CRC_ERROR;
422             } else if (addr != th->address) {
423                 WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
424                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
425                 return STATUS_CRC_ERROR;
426             } else if (generation != 0 && generation != th->generation) {
427                 WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
428                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
429                 return STATUS_CRC_ERROR;
430             }
431         }
432     } else if (context->csum) {
433         NTSTATUS Status;
434 
435         Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
436 
437         if (Status == STATUS_CRC_ERROR) {
438             void* ptr = context->csum;
439 
440             for (uint32_t i = 0; i < length >> Vcb->sector_shift; i++) {
441                 if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
442                     uint64_t off;
443                     uint16_t stripe;
444 
445                     get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length, ci->num_stripes, &off, &stripe);
446 
447                     ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
448 
449                     log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
450 
451                     return Status;
452                 }
453 
454                 ptr = (uint8_t*)ptr + Vcb->csum_size;
455             }
456 
457             return Status;
458         } else if (!NT_SUCCESS(Status)) {
459             ERR("check_csum returned %08lx\n", Status);
460             return Status;
461         }
462     }
463 
464     return STATUS_SUCCESS;
465 }
466 
467 static NTSTATUS read_data_raid10(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
468                                  CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
469     uint16_t stripe = 0;
470     NTSTATUS Status;
471     bool checksum_error = false;
472     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
473 
474     for (uint16_t j = 0; j < ci->num_stripes; j++) {
475         if (context->stripes[j].status == ReadDataStatus_Error) {
476             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
477             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
478             return context->stripes[j].iosb.Status;
479         } else if (context->stripes[j].status == ReadDataStatus_Success)
480             stripe = j;
481     }
482 
483     if (context->tree) {
484         tree_header* th = (tree_header*)buf;
485 
486         if (!check_tree_checksum(Vcb, th)) {
487             checksum_error = true;
488             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
489         } else if (addr != th->address) {
490             WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
491             checksum_error = true;
492             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
493         } else if (generation != 0 && generation != th->generation) {
494             WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
495             checksum_error = true;
496             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
497         }
498     } else if (context->csum) {
499         Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
500 
501         if (Status == STATUS_CRC_ERROR)
502             checksum_error = true;
503         else if (!NT_SUCCESS(Status)) {
504             ERR("check_csum returned %08lx\n", Status);
505             return Status;
506         }
507     }
508 
509     if (!checksum_error)
510         return STATUS_SUCCESS;
511 
512     if (context->tree) {
513         tree_header* t2;
514         uint64_t off;
515         uint16_t badsubstripe = 0;
516         bool recovered = false;
517 
518         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
519         if (!t2) {
520             ERR("out of memory\n");
521             return STATUS_INSUFFICIENT_RESOURCES;
522         }
523 
524         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &off, &stripe);
525 
526         stripe *= ci->sub_stripes;
527 
528         for (uint16_t j = 0; j < ci->sub_stripes; j++) {
529             if (context->stripes[stripe + j].status == ReadDataStatus_Success) {
530                 badsubstripe = j;
531                 break;
532             }
533         }
534 
535         for (uint16_t j = 0; j < ci->sub_stripes; j++) {
536             if (context->stripes[stripe + j].status != ReadDataStatus_Success && devices[stripe + j] && devices[stripe + j]->devobj) {
537                 Status = sync_read_phys(devices[stripe + j]->devobj, devices[stripe + j]->fileobj, cis[stripe + j].offset + off,
538                                         Vcb->superblock.node_size, (uint8_t*)t2, false);
539                 if (!NT_SUCCESS(Status)) {
540                     WARN("sync_read_phys returned %08lx\n", Status);
541                     log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_READ_ERRORS);
542                 } else {
543                     bool checksum_error = !check_tree_checksum(Vcb, t2);
544 
545                     if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
546                         RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
547                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe + j]->devitem.dev_id);
548                         recovered = true;
549 
550                         if (!Vcb->readonly && !devices[stripe + badsubstripe]->readonly && devices[stripe + badsubstripe]->devobj) { // write good data over bad
551                             Status = write_data_phys(devices[stripe + badsubstripe]->devobj, devices[stripe + badsubstripe]->fileobj,
552                                                      cis[stripe + badsubstripe].offset + off, t2, Vcb->superblock.node_size);
553                             if (!NT_SUCCESS(Status)) {
554                                 WARN("write_data_phys returned %08lx\n", Status);
555                                 log_device_error(Vcb, devices[stripe + badsubstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
556                             }
557                         }
558 
559                         break;
560                     } else if (t2->address != addr || checksum_error)
561                         log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
562                     else
563                         log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_GENERATION_ERRORS);
564                 }
565             }
566         }
567 
568         if (!recovered) {
569             ERR("unrecoverable checksum error at %I64x\n", addr);
570             ExFreePool(t2);
571             return STATUS_CRC_ERROR;
572         }
573 
574         ExFreePool(t2);
575     } else {
576         ULONG sectors = length >> Vcb->sector_shift;
577         uint8_t* sector;
578         void* ptr = context->csum;
579 
580         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
581         if (!sector) {
582             ERR("out of memory\n");
583             return STATUS_INSUFFICIENT_RESOURCES;
584         }
585 
586         for (ULONG i = 0; i < sectors; i++) {
587             if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
588                 uint64_t off;
589                 uint16_t stripe2, badsubstripe = 0;
590                 bool recovered = false;
591 
592                 get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
593                                  ci->num_stripes / ci->sub_stripes, &off, &stripe2);
594 
595                 stripe2 *= ci->sub_stripes;
596 
597                 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
598                     if (context->stripes[stripe2 + j].status == ReadDataStatus_Success) {
599                         badsubstripe = j;
600                         break;
601                     }
602                 }
603 
604                 log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
605 
606                 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
607                     if (context->stripes[stripe2 + j].status != ReadDataStatus_Success && devices[stripe2 + j] && devices[stripe2 + j]->devobj) {
608                         Status = sync_read_phys(devices[stripe2 + j]->devobj, devices[stripe2 + j]->fileobj, cis[stripe2 + j].offset + off,
609                                                 Vcb->superblock.sector_size, sector, false);
610                         if (!NT_SUCCESS(Status)) {
611                             WARN("sync_read_phys returned %08lx\n", Status);
612                             log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_READ_ERRORS);
613                         } else {
614                             if (check_sector_csum(Vcb, sector, ptr)) {
615                                 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
616                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe2 + j]->devitem.dev_id);
617                                 recovered = true;
618 
619                                 if (!Vcb->readonly && !devices[stripe2 + badsubstripe]->readonly && devices[stripe2 + badsubstripe]->devobj) { // write good data over bad
620                                     Status = write_data_phys(devices[stripe2 + badsubstripe]->devobj, devices[stripe2 + badsubstripe]->fileobj,
621                                                              cis[stripe2 + badsubstripe].offset + off, sector, Vcb->superblock.sector_size);
622                                     if (!NT_SUCCESS(Status)) {
623                                         WARN("write_data_phys returned %08lx\n", Status);
624                                         log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
625                                     }
626                                 }
627 
628                                 break;
629                             } else
630                                 log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
631                         }
632                     }
633                 }
634 
635                 if (!recovered) {
636                     ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
637                     ExFreePool(sector);
638                     return STATUS_CRC_ERROR;
639                 }
640             }
641 
642             ptr = (uint8_t*)ptr + Vcb->csum_size;
643         }
644 
645         ExFreePool(sector);
646     }
647 
648     return STATUS_SUCCESS;
649 }
650 
651 static NTSTATUS read_data_raid5(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
652                                 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
653     NTSTATUS Status;
654     bool checksum_error = false;
655     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
656     uint16_t j, stripe = 0;
657     bool no_success = true;
658 
659     for (j = 0; j < ci->num_stripes; j++) {
660         if (context->stripes[j].status == ReadDataStatus_Error) {
661             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
662             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
663             return context->stripes[j].iosb.Status;
664         } else if (context->stripes[j].status == ReadDataStatus_Success) {
665             stripe = j;
666             no_success = false;
667         }
668     }
669 
670     if (c) {    // check partial stripes
671         LIST_ENTRY* le;
672         uint64_t ps_length = (ci->num_stripes - 1) * ci->stripe_length;
673 
674         ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
675 
676         le = c->partial_stripes.Flink;
677         while (le != &c->partial_stripes) {
678             partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
679 
680             if (ps->address + ps_length > addr && ps->address < addr + length) {
681                 ULONG runlength, index;
682 
683                 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
684 
685                 while (runlength != 0) {
686                     if (index >= ps->bmplen)
687                         break;
688 
689                     if (index + runlength >= ps->bmplen) {
690                         runlength = ps->bmplen - index;
691 
692                         if (runlength == 0)
693                             break;
694                     }
695 
696                     uint64_t runstart = ps->address + (index << Vcb->sector_shift);
697                     uint64_t runend = runstart + (runlength << Vcb->sector_shift);
698                     uint64_t start = max(runstart, addr);
699                     uint64_t end = min(runend, addr + length);
700 
701                     if (end > start)
702                         RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
703 
704                     runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
705                 }
706             } else if (ps->address >= addr + length)
707                 break;
708 
709             le = le->Flink;
710         }
711 
712         ExReleaseResourceLite(&c->partial_stripes_lock);
713     }
714 
715     if (context->tree) {
716         tree_header* th = (tree_header*)buf;
717 
718         if (addr != th->address || !check_tree_checksum(Vcb, th)) {
719             checksum_error = true;
720             if (!no_success && !degraded)
721                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
722         } else if (generation != 0 && generation != th->generation) {
723             checksum_error = true;
724             if (!no_success && !degraded)
725                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
726         }
727     } else if (context->csum) {
728         Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
729 
730         if (Status == STATUS_CRC_ERROR) {
731             if (!degraded)
732                 WARN("checksum error\n");
733             checksum_error = true;
734         } else if (!NT_SUCCESS(Status)) {
735             ERR("check_csum returned %08lx\n", Status);
736             return Status;
737         }
738     } else if (degraded)
739         checksum_error = true;
740 
741     if (!checksum_error)
742         return STATUS_SUCCESS;
743 
744     if (context->tree) {
745         uint16_t parity;
746         uint64_t off;
747         bool recovered = false, first = true, failed = false;
748         uint8_t* t2;
749 
750         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * 2, ALLOC_TAG);
751         if (!t2) {
752             ERR("out of memory\n");
753             return STATUS_INSUFFICIENT_RESOURCES;
754         }
755 
756         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &off, &stripe);
757 
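        /* RAID5 rotates the parity stripe from row to row: parity is derived from the
           row number, and the logical data stripe returned by get_raid0_offset is then
           shifted past the parity stripe to get the physical stripe that was read. */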
758         parity = (((addr - offset) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
759 
760         stripe = (parity + stripe + 1) % ci->num_stripes;
761 
762         for (j = 0; j < ci->num_stripes; j++) {
763             if (j != stripe) {
764                 if (devices[j] && devices[j]->devobj) {
765                     if (first) {
766                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2, false);
767                         if (!NT_SUCCESS(Status)) {
768                             ERR("sync_read_phys returned %08lx\n", Status);
769                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
770                             failed = true;
771                             break;
772                         }
773 
774                         first = false;
775                     } else {
776                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2 + Vcb->superblock.node_size, false);
777                         if (!NT_SUCCESS(Status)) {
778                             ERR("sync_read_phys returned %08lx\n", Status);
779                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
780                             failed = true;
781                             break;
782                         }
783 
784                         do_xor(t2, t2 + Vcb->superblock.node_size, Vcb->superblock.node_size);
785                     }
786                 } else {
787                     failed = true;
788                     break;
789                 }
790             }
791         }
792 
793         if (!failed) {
794             tree_header* t3 = (tree_header*)t2;
795 
796             if (t3->address == addr && check_tree_checksum(Vcb, t3) && (generation == 0 || t3->generation == generation)) {
797                 RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
798 
799                 if (!degraded)
800                     ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
801 
802                 recovered = true;
803 
804                 if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
805                     Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off, t2, Vcb->superblock.node_size);
806                     if (!NT_SUCCESS(Status)) {
807                         WARN("write_data_phys returned %08lx\n", Status);
808                         log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
809                     }
810                 }
811             }
812         }
813 
814         if (!recovered) {
815             ERR("unrecoverable checksum error at %I64x\n", addr);
816             ExFreePool(t2);
817             return STATUS_CRC_ERROR;
818         }
819 
820         ExFreePool(t2);
821     } else {
822         ULONG sectors = length >> Vcb->sector_shift;
823         uint8_t* sector;
824         void* ptr = context->csum;
825 
826         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size * 2, ALLOC_TAG);
827         if (!sector) {
828             ERR("out of memory\n");
829             return STATUS_INSUFFICIENT_RESOURCES;
830         }
831 
832         for (ULONG i = 0; i < sectors; i++) {
833             uint16_t parity;
834             uint64_t off;
835 
836             get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
837                              ci->num_stripes - 1, &off, &stripe);
838 
839             parity = (((addr - offset + ((uint64_t)i << Vcb->sector_shift)) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
840 
841             stripe = (parity + stripe + 1) % ci->num_stripes;
842 
843             if (!devices[stripe] || !devices[stripe]->devobj || (ptr && !check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr))) {
844                 bool recovered = false, first = true, failed = false;
845 
846                 if (devices[stripe] && devices[stripe]->devobj)
847                     log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_READ_ERRORS);
848 
849                 for (j = 0; j < ci->num_stripes; j++) {
850                     if (j != stripe) {
851                         if (devices[j] && devices[j]->devobj) {
852                             if (first) {
853                                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size, sector, false);
854                                 if (!NT_SUCCESS(Status)) {
855                                     ERR("sync_read_phys returned %08lx\n", Status);
856                                     failed = true;
857                                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
858                                     break;
859                                 }
860 
861                                 first = false;
862                             } else {
863                                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
864                                                         sector + Vcb->superblock.sector_size, false);
865                                 if (!NT_SUCCESS(Status)) {
866                                     ERR("sync_read_phys returned %08lx\n", Status);
867                                     failed = true;
868                                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
869                                     break;
870                                 }
871 
872                                 do_xor(sector, sector + Vcb->superblock.sector_size, Vcb->superblock.sector_size);
873                             }
874                         } else {
875                             failed = true;
876                             break;
877                         }
878                     }
879                 }
880 
881                 if (!failed) {
882                     if (!ptr || check_sector_csum(Vcb, sector, ptr)) {
883                         RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
884 
885                         if (!degraded)
886                             ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe]->devitem.dev_id);
887 
888                         recovered = true;
889 
890                         if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
891                             Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off,
892                                                      sector, Vcb->superblock.sector_size);
893                             if (!NT_SUCCESS(Status)) {
894                                 WARN("write_data_phys returned %08lx\n", Status);
895                                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
896                             }
897                         }
898                     }
899                 }
900 
901                 if (!recovered) {
902                     ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
903                     ExFreePool(sector);
904                     return STATUS_CRC_ERROR;
905                 }
906             }
907 
908             if (ptr)
909                 ptr = (uint8_t*)ptr + Vcb->csum_size;
910         }
911 
912         ExFreePool(sector);
913     }
914 
915     return STATUS_SUCCESS;
916 }
917 
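/* Reconstruct missing stripes of a RAID6 row. sectors points to the row's stripes laid
 * out consecutively (data stripes, then P at index num_stripes - 2, then Q at
 * num_stripes - 1); missing1 and missing2 are the indices of the unreadable stripes.
 * If one of them is the P stripe, the remaining missing data stripe is rebuilt from Q
 * and the surviving data; otherwise both missing data stripes are solved from the P and
 * Q syndromes over GF(2^8). Recovered data is written to out (two consecutive sectors
 * when two data stripes are rebuilt). */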
918 void raid6_recover2(uint8_t* sectors, uint16_t num_stripes, ULONG sector_size, uint16_t missing1, uint16_t missing2, uint8_t* out) {
919     if (missing1 == num_stripes - 2 || missing2 == num_stripes - 2) { // reconstruct from q and data
920         uint16_t missing = missing1 == (num_stripes - 2) ? missing2 : missing1;
921         uint16_t stripe;
922 
923         stripe = num_stripes - 3;
924 
925         if (stripe == missing)
926             RtlZeroMemory(out, sector_size);
927         else
928             RtlCopyMemory(out, sectors + (stripe * sector_size), sector_size);
929 
930         do {
931             stripe--;
932 
933             galois_double(out, sector_size);
934 
935             if (stripe != missing)
936                 do_xor(out, sectors + (stripe * sector_size), sector_size);
937         } while (stripe > 0);
938 
939         do_xor(out, sectors + ((num_stripes - 1) * sector_size), sector_size);
940 
941         if (missing != 0)
942             galois_divpower(out, (uint8_t)missing, sector_size);
943     } else { // reconstruct from p and q
944         uint16_t x = missing1, y = missing2, stripe;
945         uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
946         uint32_t j;
947 
948         stripe = num_stripes - 3;
949 
950         pxy = out + sector_size;
951         qxy = out;
952 
953         if (stripe == missing1 || stripe == missing2) {
954             RtlZeroMemory(qxy, sector_size);
955             RtlZeroMemory(pxy, sector_size);
956         } else {
957             RtlCopyMemory(qxy, sectors + (stripe * sector_size), sector_size);
958             RtlCopyMemory(pxy, sectors + (stripe * sector_size), sector_size);
959         }
960 
961         do {
962             stripe--;
963 
964             galois_double(qxy, sector_size);
965 
966             if (stripe != missing1 && stripe != missing2) {
967                 do_xor(qxy, sectors + (stripe * sector_size), sector_size);
968                 do_xor(pxy, sectors + (stripe * sector_size), sector_size);
969             }
970         } while (stripe > 0);
971 
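        /* Standard RAID6 two-failure algebra: with P' (pxy) and Q' (qxy) the syndromes
           computed over the surviving data stripes,
               Dx = a*(P ^ P') ^ b*(Q ^ Q')   and   Dy = Dx ^ (P ^ P'),
           where a = g^(y-x) / (g^(y-x) ^ 1) and b = g^(-x) / (g^(y-x) ^ 1) in GF(2^8). */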
972         gyx = gpow2(y > x ? (y-x) : (255-x+y));
973         gx = gpow2(255-x);
974 
975         denom = gdiv(1, gyx ^ 1);
976         a = gmul(gyx, denom);
977         b = gmul(gx, denom);
978 
979         p = sectors + ((num_stripes - 2) * sector_size);
980         q = sectors + ((num_stripes - 1) * sector_size);
981 
982         for (j = 0; j < sector_size; j++) {
983             *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);
984 
985             p++;
986             q++;
987             pxy++;
988             qxy++;
989         }
990 
991         do_xor(out + sector_size, out, sector_size);
992         do_xor(out + sector_size, sectors + ((num_stripes - 2) * sector_size), sector_size);
993     }
994 }
995 
996 static NTSTATUS read_data_raid6(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
997                                 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
998     NTSTATUS Status;
999     bool checksum_error = false;
1000     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1001     uint16_t stripe = 0, j;
1002     bool no_success = true;
1003 
1004     for (j = 0; j < ci->num_stripes; j++) {
1005         if (context->stripes[j].status == ReadDataStatus_Error) {
1006             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
1007 
1008             if (devices[j])
1009                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1010             return context->stripes[j].iosb.Status;
1011         } else if (context->stripes[j].status == ReadDataStatus_Success) {
1012             stripe = j;
1013             no_success = false;
1014         }
1015     }
1016 
1017     if (c) {    // check partial stripes
1018         LIST_ENTRY* le;
1019         uint64_t ps_length = (ci->num_stripes - 2) * ci->stripe_length;
1020 
1021         ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
1022 
1023         le = c->partial_stripes.Flink;
1024         while (le != &c->partial_stripes) {
1025             partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
1026 
1027             if (ps->address + ps_length > addr && ps->address < addr + length) {
1028                 ULONG runlength, index;
1029 
1030                 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
1031 
1032                 while (runlength != 0) {
1033                     if (index >= ps->bmplen)
1034                         break;
1035 
1036                     if (index + runlength >= ps->bmplen) {
1037                         runlength = ps->bmplen - index;
1038 
1039                         if (runlength == 0)
1040                             break;
1041                     }
1042 
1043                     uint64_t runstart = ps->address + (index << Vcb->sector_shift);
1044                     uint64_t runend = runstart + (runlength << Vcb->sector_shift);
1045                     uint64_t start = max(runstart, addr);
1046                     uint64_t end = min(runend, addr + length);
1047 
1048                     if (end > start)
1049                         RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
1050 
1051                     runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
1052                 }
1053             } else if (ps->address >= addr + length)
1054                 break;
1055 
1056             le = le->Flink;
1057         }
1058 
1059         ExReleaseResourceLite(&c->partial_stripes_lock);
1060     }
1061 
1062     if (context->tree) {
1063         tree_header* th = (tree_header*)buf;
1064 
1065         if (addr != th->address || !check_tree_checksum(Vcb, th)) {
1066             checksum_error = true;
1067             if (!no_success && !degraded && devices[stripe])
1068                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1069         } else if (generation != 0 && generation != th->generation) {
1070             checksum_error = true;
1071             if (!no_success && !degraded && devices[stripe])
1072                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
1073         }
1074     } else if (context->csum) {
1075         Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
1076 
1077         if (Status == STATUS_CRC_ERROR) {
1078             if (!degraded)
1079                 WARN("checksum error\n");
1080             checksum_error = true;
1081         } else if (!NT_SUCCESS(Status)) {
1082             ERR("check_csum returned %08lx\n", Status);
1083             return Status;
1084         }
1085     } else if (degraded)
1086         checksum_error = true;
1087 
1088     if (!checksum_error)
1089         return STATUS_SUCCESS;
1090 
1091     if (context->tree) {
1092         uint8_t* sector;
1093         uint16_t k, physstripe, parity1, parity2, error_stripe = 0;
1094         uint64_t off;
1095         bool recovered = false, failed = false;
1096         ULONG num_errors = 0;
1097 
1098         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * (ci->num_stripes + 2), ALLOC_TAG);
1099         if (!sector) {
1100             ERR("out of memory\n");
1101             return STATUS_INSUFFICIENT_RESOURCES;
1102         }
1103 
1104         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &off, &stripe);
1105 
1106         parity1 = (((addr - offset) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
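        /* RAID6 rotates two parity stripes per row: parity1 holds P, parity2 (the next
           stripe) holds Q, and the logical data stripe is shifted past both to find the
           physical stripe that was read. */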
1107         parity2 = (parity1 + 1) % ci->num_stripes;
1108 
1109         physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1110 
1111         j = (parity2 + 1) % ci->num_stripes;
1112 
1113         for (k = 0; k < ci->num_stripes - 1; k++) {
1114             if (j != physstripe) {
1115                 if (devices[j] && devices[j]->devobj) {
1116                     Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size,
1117                                             sector + (k * Vcb->superblock.node_size), false);
1118                     if (!NT_SUCCESS(Status)) {
1119                         ERR("sync_read_phys returned %08lx\n", Status);
1120                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1121                         num_errors++;
1122                         error_stripe = k;
1123 
1124                         if (num_errors > 1) {
1125                             failed = true;
1126                             break;
1127                         }
1128                     }
1129                 } else {
1130                     num_errors++;
1131                     error_stripe = k;
1132 
1133                     if (num_errors > 1) {
1134                         failed = true;
1135                         break;
1136                     }
1137                 }
1138             }
1139 
1140             j = (j + 1) % ci->num_stripes;
1141         }
1142 
1143         if (!failed) {
1144             if (num_errors == 0) {
1145                 tree_header* th = (tree_header*)(sector + (stripe * Vcb->superblock.node_size));
1146 
1147                 RtlCopyMemory(sector + (stripe * Vcb->superblock.node_size), sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size),
1148                               Vcb->superblock.node_size);
1149 
1150                 for (j = 0; j < ci->num_stripes - 2; j++) {
1151                     if (j != stripe)
1152                         do_xor(sector + (stripe * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size), Vcb->superblock.node_size);
1153                 }
1154 
1155                 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1156                     RtlCopyMemory(buf, sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1157 
1158                     if (devices[physstripe] && devices[physstripe]->devobj)
1159                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1160 
1161                     recovered = true;
1162 
1163                     if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1164                         Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1165                                                  sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1166                         if (!NT_SUCCESS(Status)) {
1167                             WARN("write_data_phys returned %08lx\n", Status);
1168                             log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1169                         }
1170                     }
1171                 }
1172             }
1173 
1174             if (!recovered) {
1175                 tree_header* th = (tree_header*)(sector + (ci->num_stripes * Vcb->superblock.node_size));
1176                 bool read_q = false;
1177 
1178                 if (devices[parity2] && devices[parity2]->devobj) {
1179                     Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1180                                             Vcb->superblock.node_size, sector + ((ci->num_stripes - 1) * Vcb->superblock.node_size), false);
1181                     if (!NT_SUCCESS(Status)) {
1182                         ERR("sync_read_phys returned %08lx\n", Status);
1183                         log_device_error(Vcb, devices[parity2], BTRFS_DEV_STAT_READ_ERRORS);
1184                     } else
1185                         read_q = true;
1186                 }
1187 
1188                 if (read_q) {
1189                     if (num_errors == 1) {
1190                         raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, error_stripe, sector + (ci->num_stripes * Vcb->superblock.node_size));
1191 
1192                         if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation))
1193                             recovered = true;
1194                     } else {
1195                         for (j = 0; j < ci->num_stripes - 1; j++) {
1196                             if (j != stripe) {
1197                                 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, j, sector + (ci->num_stripes * Vcb->superblock.node_size));
1198 
1199                                 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1200                                     recovered = true;
1201                                     error_stripe = j;
1202                                     break;
1203                                 }
1204                             }
1205                         }
1206                     }
1207                 }
1208 
1209                 if (recovered) {
1210                     uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1211 
1212                     if (devices[physstripe] && devices[physstripe]->devobj)
1213                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1214 
1215                     RtlCopyMemory(buf, sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1216 
1217                     if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1218                         Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1219                                                  sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1220                         if (!NT_SUCCESS(Status)) {
1221                             WARN("write_data_phys returned %08lx\n", Status);
1222                             log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1223                         }
1224                     }
1225 
1226                     if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1227                         if (error_stripe == ci->num_stripes - 2) {
1228                             ERR("recovering from parity error at %I64x, device %I64x\n", addr, devices[error_stripe_phys]->devitem.dev_id);
1229 
1230                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1231 
1232                             RtlZeroMemory(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1233 
1234                             for (j = 0; j < ci->num_stripes - 2; j++) {
1235                                 if (j == stripe) {
1236                                     do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (ci->num_stripes * Vcb->superblock.node_size),
1237                                            Vcb->superblock.node_size);
1238                                 } else {
1239                                     do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size),
1240                                             Vcb->superblock.node_size);
1241                                 }
1242                             }
1243                         } else {
1244                             ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((error_stripe - stripe) * ci->stripe_length),
1245                                 devices[error_stripe_phys]->devitem.dev_id);
1246 
1247                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1248 
1249                             RtlCopyMemory(sector + (error_stripe * Vcb->superblock.node_size),
1250                                           sector + ((ci->num_stripes + 1) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1251                         }
1252                     }
1253 
1254                     if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1255                         Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1256                                                  sector + (error_stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1257                         if (!NT_SUCCESS(Status)) {
1258                             WARN("write_data_phys returned %08lx\n", Status);
1259                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1260                         }
1261                     }
1262                 }
1263             }
1264         }
1265 
1266         if (!recovered) {
1267             ERR("unrecoverable checksum error at %I64x\n", addr);
1268             ExFreePool(sector);
1269             return STATUS_CRC_ERROR;
1270         }
1271 
1272         ExFreePool(sector);
1273     } else {
1274         ULONG sectors = length >> Vcb->sector_shift;
1275         uint8_t* sector;
1276         void* ptr = context->csum;
1277 
1278         sector = ExAllocatePoolWithTag(NonPagedPool, (ci->num_stripes + 2) << Vcb->sector_shift, ALLOC_TAG);
1279         if (!sector) {
1280             ERR("out of memory\n");
1281             return STATUS_INSUFFICIENT_RESOURCES;
1282         }
1283 
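        // Data extents are verified sector by sector: for each sector work out which physical
        // stripe holds it and where the two parity stripes sit for that row, check its checksum
        // if we have one, and attempt the same P-then-Q recovery as for tree blocks above.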
1284         for (ULONG i = 0; i < sectors; i++) {
1285             uint64_t off;
1286             uint16_t physstripe, parity1, parity2;
1287 
1288             get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
1289                              ci->num_stripes - 2, &off, &stripe);
1290 
1291             parity1 = (((addr - offset + ((uint64_t)i << Vcb->sector_shift)) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
1292             parity2 = (parity1 + 1) % ci->num_stripes;
1293 
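            // The P/Q pair advances by one stripe for each full row of data: e.g. with
            // num_stripes == 4, row 0 puts P on stripe 2 and Q on stripe 3 (data on 0 and 1),
            // row 1 puts P on 3 and Q on 0 (data on 1 and 2), and so on.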
1294             physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1295 
1296             if (!devices[physstripe] || !devices[physstripe]->devobj || (context->csum && !check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr))) {
1297                 uint16_t error_stripe = 0;
1298                 bool recovered = false, failed = false;
1299                 ULONG num_errors = 0;
1300 
1301                 if (devices[physstripe] && devices[physstripe]->devobj)
1302                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_READ_ERRORS);
1303 
1304                 j = (parity2 + 1) % ci->num_stripes;
1305 
1306                 for (uint16_t k = 0; k < ci->num_stripes - 1; k++) {
1307                     if (j != physstripe) {
1308                         if (devices[j] && devices[j]->devobj) {
1309                             Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
1310                                                     sector + ((ULONG)k << Vcb->sector_shift), false);
1311                             if (!NT_SUCCESS(Status)) {
1312                                 ERR("sync_read_phys returned %08lx\n", Status);
1313                                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1314                                 num_errors++;
1315                                 error_stripe = k;
1316 
1317                                 if (num_errors > 1) {
1318                                     failed = true;
1319                                     break;
1320                                 }
1321                             }
1322                         } else {
1323                             num_errors++;
1324                             error_stripe = k;
1325 
1326                             if (num_errors > 1) {
1327                                 failed = true;
1328                                 break;
1329                             }
1330                         }
1331                     }
1332 
1333                     j = (j + 1) % ci->num_stripes;
1334                 }
1335 
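                // If no more than one of the other stripes also failed, recovery is still possible:
                // with no further errors the missing sector is just P XORed with the other data
                // stripes (P having been read into the slot after the last data stripe); a second
                // failure means falling back to the Q stripe below.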
1336                 if (!failed) {
1337                     if (num_errors == 0) {
1338                         RtlCopyMemory(sector + ((unsigned int)stripe << Vcb->sector_shift), sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), Vcb->superblock.sector_size);
1339 
1340                         for (j = 0; j < ci->num_stripes - 2; j++) {
1341                             if (j != stripe)
1342                                 do_xor(sector + ((unsigned int)stripe << Vcb->sector_shift), sector + ((unsigned int)j << Vcb->sector_shift), Vcb->superblock.sector_size);
1343                         }
1344 
1345                         if (!ptr || check_sector_csum(Vcb, sector + ((unsigned int)stripe << Vcb->sector_shift), ptr)) {
1346                             RtlCopyMemory(buf + (i << Vcb->sector_shift), sector + ((unsigned int)stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1347 
1348                             if (devices[physstripe] && devices[physstripe]->devobj)
1349                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift),
1350                                     devices[physstripe]->devitem.dev_id);
1351 
1352                             recovered = true;
1353 
1354                             if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1355                                 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1356                                                          sector + ((unsigned int)stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1357                                 if (!NT_SUCCESS(Status)) {
1358                                     WARN("write_data_phys returned %08lx\n", Status);
1359                                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1360                                 }
1361                             }
1362                         }
1363                     }
1364 
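                    // P alone wasn't enough - either a second stripe was unreadable or the rebuilt
                    // sector still failed its checksum - so read Q and let raid6_recover2 try
                    // candidate stripe pairs until one produces a sector that passes.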
1365                     if (!recovered) {
1366                         bool read_q = false;
1367 
1368                         if (devices[parity2] && devices[parity2]->devobj) {
1369                             Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1370                                                     Vcb->superblock.sector_size, sector + ((unsigned int)(ci->num_stripes - 1) << Vcb->sector_shift), false);
1371                             if (!NT_SUCCESS(Status)) {
1372                                 ERR("sync_read_phys returned %08lx\n", Status);
1373                                 log_device_error(Vcb, devices[parity2], BTRFS_DEV_STAT_READ_ERRORS);
1374                             } else
1375                                 read_q = true;
1376                         }
1377 
1378                         if (read_q) {
1379                             if (num_errors == 1) {
1380                                 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, error_stripe, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift));
1381 
1382                                 if (!devices[physstripe] || !devices[physstripe]->devobj)
1383                                     recovered = true;
1384                                 else
1385                                     recovered = check_sector_csum(Vcb, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), ptr);
1386                             } else {
1387                                 for (j = 0; j < ci->num_stripes - 1; j++) {
1388                                     if (j != stripe) {
1389                                         raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, j, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift));
1390 
1391                                         if (check_sector_csum(Vcb, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), ptr)) {
1392                                             recovered = true;
1393                                             error_stripe = j;
1394                                             break;
1395                                         }
1396                                     }
1397                                 }
1398                             }
1399                         }
1400 
1401                         if (recovered) {
1402                             uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1403 
1404                             if (devices[physstripe] && devices[physstripe]->devobj)
1405                                 ERR("recovering from checksum error at %I64x, device %I64x\n",
1406                                     addr + ((uint64_t)i << Vcb->sector_shift), devices[physstripe]->devitem.dev_id);
1407 
1408                             RtlCopyMemory(buf + (i << Vcb->sector_shift), sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), Vcb->superblock.sector_size);
1409 
1410                             if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1411                                 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1412                                                          sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), Vcb->superblock.sector_size);
1413                                 if (!NT_SUCCESS(Status)) {
1414                                     WARN("write_data_phys returned %08lx\n", Status);
1415                                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1416                                 }
1417                             }
1418 
1419                             if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1420                                 if (error_stripe == ci->num_stripes - 2) {
1421                                     ERR("recovering from parity error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift),
1422                                         devices[error_stripe_phys]->devitem.dev_id);
1423 
1424                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1425 
1426                                     RtlZeroMemory(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), Vcb->superblock.sector_size);
1427 
1428                                     for (j = 0; j < ci->num_stripes - 2; j++) {
1429                                         if (j == stripe) {
1430                                             do_xor(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift),
1431                                                    Vcb->superblock.sector_size);
1432                                         } else {
1433                                             do_xor(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), sector + ((unsigned int)j << Vcb->sector_shift),
1434                                                    Vcb->superblock.sector_size);
1435                                         }
1436                                     }
1437                                 } else {
1438                                     ERR("recovering from checksum error at %I64x, device %I64x\n",
1439                                         addr + ((uint64_t)i << Vcb->sector_shift) + ((error_stripe - stripe) * ci->stripe_length),
1440                                         devices[error_stripe_phys]->devitem.dev_id);
1441 
1442                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1443 
1444                                     RtlCopyMemory(sector + ((unsigned int)error_stripe << Vcb->sector_shift),
1445                                                   sector + ((unsigned int)(ci->num_stripes + 1) << Vcb->sector_shift), Vcb->superblock.sector_size);
1446                                 }
1447                             }
1448 
1449                             if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1450                                 Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1451                                                          sector + ((unsigned int)error_stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1452                                 if (!NT_SUCCESS(Status)) {
1453                                     WARN("write_data_phys returned %08lx\n", Status);
1454                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1455                                 }
1456                             }
1457                         }
1458                     }
1459                 }
1460 
1461                 if (!recovered) {
1462                     ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
1463                     ExFreePool(sector);
1464                     return STATUS_CRC_ERROR;
1465                 }
1466             }
1467 
1468             if (ptr)
1469                 ptr = (uint8_t*)ptr + Vcb->csum_size;
1470         }
1471 
1472         ExFreePool(sector);
1473     }
1474 
1475     return STATUS_SUCCESS;
1476 }
1477 
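// read_data maps a logical address range onto its chunk (falling back to the bootstrap
// sys_chunks list if the chunk tree isn't loaded yet), then builds per-device reads according
// to the RAID profile: profiles that keep whole copies read from one mirror, RAID0/RAID10
// split the range across stripes, and RAID5/6 take a range lock and compute per-stripe
// extents row by row, padding the parity positions with a dummy page.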
1478 NTSTATUS read_data(_In_ device_extension* Vcb, _In_ uint64_t addr, _In_ uint32_t length, _In_reads_bytes_opt_(length*sizeof(uint32_t)/Vcb->superblock.sector_size) void* csum,
1479                    _In_ bool is_tree, _Out_writes_bytes_(length) uint8_t* buf, _In_opt_ chunk* c, _Out_opt_ chunk** pc, _In_opt_ PIRP Irp, _In_ uint64_t generation, _In_ bool file_read,
1480                    _In_ ULONG priority) {
1481     CHUNK_ITEM* ci;
1482     CHUNK_ITEM_STRIPE* cis;
1483     read_data_context context;
1484     uint64_t type, offset, total_reading = 0;
1485     NTSTATUS Status;
1486     device** devices = NULL;
1487     uint16_t i, startoffstripe, allowed_missing, missing_devices = 0;
1488     uint8_t* dummypage = NULL;
1489     PMDL dummy_mdl = NULL;
1490     bool need_to_wait;
1491     uint64_t lockaddr, locklen;
1492 
1493     if (Vcb->log_to_phys_loaded) {
1494         if (!c) {
1495             c = get_chunk_from_address(Vcb, addr);
1496 
1497             if (!c) {
1498                 ERR("get_chunk_from_address failed\n");
1499                 return STATUS_INTERNAL_ERROR;
1500             }
1501         }
1502 
1503         ci = c->chunk_item;
1504         offset = c->offset;
1505         devices = c->devices;
1506 
1507         if (pc)
1508             *pc = c;
1509     } else {
1510         LIST_ENTRY* le = Vcb->sys_chunks.Flink;
1511 
1512         ci = NULL;
1513 
1514         c = NULL;
1515         while (le != &Vcb->sys_chunks) {
1516             sys_chunk* sc = CONTAINING_RECORD(le, sys_chunk, list_entry);
1517 
1518             if (sc->key.obj_id == 0x100 && sc->key.obj_type == TYPE_CHUNK_ITEM && sc->key.offset <= addr) {
1519                 CHUNK_ITEM* chunk_item = sc->data;
1520 
1521                 if ((addr - sc->key.offset) < chunk_item->size && chunk_item->num_stripes > 0) {
1522                     ci = chunk_item;
1523                     offset = sc->key.offset;
1524                     cis = (CHUNK_ITEM_STRIPE*)&chunk_item[1];
1525 
1526                     devices = ExAllocatePoolWithTag(NonPagedPool, sizeof(device*) * ci->num_stripes, ALLOC_TAG);
1527                     if (!devices) {
1528                         ERR("out of memory\n");
1529                         return STATUS_INSUFFICIENT_RESOURCES;
1530                     }
1531 
1532                     for (i = 0; i < ci->num_stripes; i++) {
1533                         devices[i] = find_device_from_uuid(Vcb, &cis[i].dev_uuid);
1534                     }
1535 
1536                     break;
1537                 }
1538             }
1539 
1540             le = le->Flink;
1541         }
1542 
1543         if (!ci) {
1544             ERR("could not find chunk for %I64x in bootstrap\n", addr);
1545             return STATUS_INTERNAL_ERROR;
1546         }
1547 
1548         if (pc)
1549             *pc = NULL;
1550     }
1551 
1552     if (ci->type & BLOCK_FLAG_DUPLICATE) {
1553         type = BLOCK_FLAG_DUPLICATE;
1554         allowed_missing = ci->num_stripes - 1;
1555     } else if (ci->type & BLOCK_FLAG_RAID0) {
1556         type = BLOCK_FLAG_RAID0;
1557         allowed_missing = 0;
1558     } else if (ci->type & BLOCK_FLAG_RAID1) {
1559         type = BLOCK_FLAG_DUPLICATE;
1560         allowed_missing = 1;
1561     } else if (ci->type & BLOCK_FLAG_RAID10) {
1562         type = BLOCK_FLAG_RAID10;
1563         allowed_missing = 1;
1564     } else if (ci->type & BLOCK_FLAG_RAID5) {
1565         type = BLOCK_FLAG_RAID5;
1566         allowed_missing = 1;
1567     } else if (ci->type & BLOCK_FLAG_RAID6) {
1568         type = BLOCK_FLAG_RAID6;
1569         allowed_missing = 2;
1570     } else if (ci->type & BLOCK_FLAG_RAID1C3) {
1571         type = BLOCK_FLAG_DUPLICATE;
1572         allowed_missing = 2;
1573     } else if (ci->type & BLOCK_FLAG_RAID1C4) {
1574         type = BLOCK_FLAG_DUPLICATE;
1575         allowed_missing = 3;
1576     } else { // SINGLE
1577         type = BLOCK_FLAG_DUPLICATE;
1578         allowed_missing = 0;
1579     }
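    // Everything that keeps complete copies of the data (SINGLE, DUP, RAID1, RAID1C3, RAID1C4)
    // is handled as BLOCK_FLAG_DUPLICATE below; allowed_missing is how many of the chunk's
    // devices can be missing before the read has to fail.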
1580 
1581     cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1582 
1583     RtlZeroMemory(&context, sizeof(read_data_context));
1584     KeInitializeEvent(&context.Event, NotificationEvent, false);
1585 
1586     context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe) * ci->num_stripes, ALLOC_TAG);
1587     if (!context.stripes) {
1588         ERR("out of memory\n");
1589         return STATUS_INSUFFICIENT_RESOURCES;
1590     }
1591 
1592     if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6)) {
1593         get_raid56_lock_range(c, addr, length, &lockaddr, &locklen);
1594         chunk_lock_range(Vcb, c, lockaddr, locklen);
1595     }
1596 
1597     RtlZeroMemory(context.stripes, sizeof(read_data_stripe) * ci->num_stripes);
1598 
1599     context.buflen = length;
1600     context.num_stripes = ci->num_stripes;
1601     context.stripes_left = context.num_stripes;
1602     context.sector_size = Vcb->superblock.sector_size;
1603     context.csum = csum;
1604     context.tree = is_tree;
1605     context.type = type;
1606 
1607     if (type == BLOCK_FLAG_RAID0) {
1608         uint64_t startoff, endoff;
1609         uint16_t endoffstripe, stripe;
1610         uint32_t *stripeoff, pos;
1611         PMDL master_mdl;
1612         PFN_NUMBER* pfns;
1613 
1614         // FIXME - test this still works if page size isn't the same as sector size
1615 
1616         // This relies on the fact that MDLs are followed in memory by the page file numbers,
1617         // so with a bit of jiggery-pokery you can trick your disks into deinterlacing your RAID0
1618         // data for you without doing a memcpy yourself.
1619         // MDLs are officially opaque, so this might very well break in future versions of Windows.
1620 
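        // (Concretely: each per-stripe MDL below is allocated against context.va and then has
        // the master MDL's page frame numbers for that stripe's pages copied in, so the lower
        // driver writes each stripe's data straight into the right part of the destination buffer.)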
1621         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &startoff, &startoffstripe);
1622         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes, &endoff, &endoffstripe);
1623 
1624         if (file_read) {
1625             // Unfortunately we can't avoid doing at least one memcpy, as Windows can give us an MDL
1626             // with duplicated dummy PFNs, which confuse check_csum. Ah well.
1627             // See https://msdn.microsoft.com/en-us/library/windows/hardware/Dn614012.aspx if you're interested.
1628 
1629             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1630 
1631             if (!context.va) {
1632                 ERR("out of memory\n");
1633                 Status = STATUS_INSUFFICIENT_RESOURCES;
1634                 goto exit;
1635             }
1636         } else
1637             context.va = buf;
1638 
1639         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1640         if (!master_mdl) {
1641             ERR("out of memory\n");
1642             Status = STATUS_INSUFFICIENT_RESOURCES;
1643             goto exit;
1644         }
1645 
1646         Status = STATUS_SUCCESS;
1647 
1648         _SEH2_TRY {
1649             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
1650         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1651             Status = _SEH2_GetExceptionCode();
1652         } _SEH2_END;
1653 
1654         if (!NT_SUCCESS(Status)) {
1655             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1656             IoFreeMdl(master_mdl);
1657             goto exit;
1658         }
1659 
1660         pfns = (PFN_NUMBER*)(master_mdl + 1);
1661 
1662         for (i = 0; i < ci->num_stripes; i++) {
1663             if (startoffstripe > i)
1664                 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1665             else if (startoffstripe == i)
1666                 context.stripes[i].stripestart = startoff;
1667             else
1668                 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length);
1669 
1670             if (endoffstripe > i)
1671                 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1672             else if (endoffstripe == i)
1673                 context.stripes[i].stripeend = endoff + 1;
1674             else
1675                 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length);
1676 
1677             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
1678                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
1679 
1680                 if (!context.stripes[i].mdl) {
1681                     ERR("IoAllocateMdl failed\n");
1682                     MmUnlockPages(master_mdl);
1683                     IoFreeMdl(master_mdl);
1684                     Status = STATUS_INSUFFICIENT_RESOURCES;
1685                     goto exit;
1686                 }
1687             }
1688         }
1689 
1690         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
1691         if (!stripeoff) {
1692             ERR("out of memory\n");
1693             MmUnlockPages(master_mdl);
1694             IoFreeMdl(master_mdl);
1695             Status = STATUS_INSUFFICIENT_RESOURCES;
1696             goto exit;
1697         }
1698 
1699         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
1700 
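        // Walk the logical range a stripe at a time, copying the matching run of PFNs from the
        // master MDL into each stripe's MDL; stripeoff[] tracks how far into each stripe's MDL
        // we've got. The first chunk may be short if the read doesn't start on a stripe boundary.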
1701         pos = 0;
1702         stripe = startoffstripe;
1703         while (pos < length) {
1704             PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
1705 
1706             if (pos == 0) {
1707                 uint32_t readlen = (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length));
1708 
1709                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1710 
1711                 stripeoff[stripe] += readlen;
1712                 pos += readlen;
1713             } else if (length - pos < ci->stripe_length) {
1714                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1715 
1716                 pos = length;
1717             } else {
1718                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1719 
1720                 stripeoff[stripe] += (uint32_t)ci->stripe_length;
1721                 pos += (uint32_t)ci->stripe_length;
1722             }
1723 
1724             stripe = (stripe + 1) % ci->num_stripes;
1725         }
1726 
1727         MmUnlockPages(master_mdl);
1728         IoFreeMdl(master_mdl);
1729 
1730         ExFreePool(stripeoff);
1731     } else if (type == BLOCK_FLAG_RAID10) {
1732         uint64_t startoff, endoff;
1733         uint16_t endoffstripe, j, stripe;
1734         ULONG orig_ls;
1735         PMDL master_mdl;
1736         PFN_NUMBER* pfns;
1737         uint32_t* stripeoff, pos;
1738         read_data_stripe** stripes;
1739 
1740         if (c)
1741             orig_ls = c->last_stripe;
1742         else
1743             orig_ls = 0;
1744 
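        // RAID10 stripes come in groups of sub_stripes identical copies. c->last_stripe rotates
        // which copy we prefer on each read, presumably to spread the load; if the preferred
        // copy's device is missing, the loop below falls back to any working copy in the group.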
1745         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &startoff, &startoffstripe);
1746         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &endoff, &endoffstripe);
1747 
1748         if ((ci->num_stripes % ci->sub_stripes) != 0) {
1749             ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", offset, ci->num_stripes, ci->sub_stripes);
1750             Status = STATUS_INTERNAL_ERROR;
1751             goto exit;
1752         }
1753 
1754         if (file_read) {
1755             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1756 
1757             if (!context.va) {
1758                 ERR("out of memory\n");
1759                 Status = STATUS_INSUFFICIENT_RESOURCES;
1760                 goto exit;
1761             }
1762         } else
1763             context.va = buf;
1764 
1765         context.firstoff = (uint16_t)((startoff % ci->stripe_length) >> Vcb->sector_shift);
1766         context.startoffstripe = startoffstripe;
1767         context.sectors_per_stripe = (uint16_t)(ci->stripe_length >> Vcb->sector_shift);
1768 
1769         startoffstripe *= ci->sub_stripes;
1770         endoffstripe *= ci->sub_stripes;
1771 
1772         if (c)
1773             c->last_stripe = (orig_ls + 1) % ci->sub_stripes;
1774 
1775         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1776         if (!master_mdl) {
1777             ERR("out of memory\n");
1778             Status = STATUS_INSUFFICIENT_RESOURCES;
1779             goto exit;
1780         }
1781 
1782         Status = STATUS_SUCCESS;
1783 
1784         _SEH2_TRY {
1785             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
1786         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1787             Status = _SEH2_GetExceptionCode();
1788         } _SEH2_END;
1789 
1790         if (!NT_SUCCESS(Status)) {
1791             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1792             IoFreeMdl(master_mdl);
1793             goto exit;
1794         }
1795 
1796         pfns = (PFN_NUMBER*)(master_mdl + 1);
1797 
1798         stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1799         if (!stripes) {
1800             ERR("out of memory\n");
1801             MmUnlockPages(master_mdl);
1802             IoFreeMdl(master_mdl);
1803             Status = STATUS_INSUFFICIENT_RESOURCES;
1804             goto exit;
1805         }
1806 
1807         RtlZeroMemory(stripes, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes);
1808 
1809         for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
1810             uint64_t sstart, send;
1811             bool stripeset = false;
1812 
1813             if (startoffstripe > i)
1814                 sstart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1815             else if (startoffstripe == i)
1816                 sstart = startoff;
1817             else
1818                 sstart = startoff - (startoff % ci->stripe_length);
1819 
1820             if (endoffstripe > i)
1821                 send = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1822             else if (endoffstripe == i)
1823                 send = endoff + 1;
1824             else
1825                 send = endoff - (endoff % ci->stripe_length);
1826 
1827             for (j = 0; j < ci->sub_stripes; j++) {
1828                 if (j == orig_ls && devices[i+j] && devices[i+j]->devobj) {
1829                     context.stripes[i+j].stripestart = sstart;
1830                     context.stripes[i+j].stripeend = send;
1831                     stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1832 
1833                     if (sstart != send) {
1834                         context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1835 
1836                         if (!context.stripes[i+j].mdl) {
1837                             ERR("IoAllocateMdl failed\n");
1838                             MmUnlockPages(master_mdl);
1839                             IoFreeMdl(master_mdl);
1840                             Status = STATUS_INSUFFICIENT_RESOURCES;
1841                             goto exit;
1842                         }
1843                     }
1844 
1845                     stripeset = true;
1846                 } else
1847                     context.stripes[i+j].status = ReadDataStatus_Skip;
1848             }
1849 
1850             if (!stripeset) {
1851                 for (j = 0; j < ci->sub_stripes; j++) {
1852                     if (devices[i+j] && devices[i+j]->devobj) {
1853                         context.stripes[i+j].stripestart = sstart;
1854                         context.stripes[i+j].stripeend = send;
1855                         context.stripes[i+j].status = ReadDataStatus_Pending;
1856                         stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1857 
1858                         if (sstart != send) {
1859                             context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1860 
1861                             if (!context.stripes[i+j].mdl) {
1862                                 ERR("IoAllocateMdl failed\n");
1863                                 MmUnlockPages(master_mdl);
1864                                 IoFreeMdl(master_mdl);
1865                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1866                                 goto exit;
1867                             }
1868                         }
1869 
1870                         stripeset = true;
1871                         break;
1872                     }
1873                 }
1874 
1875                 if (!stripeset) {
1876                     ERR("could not find stripe to read\n");
1877                     Status = STATUS_DEVICE_NOT_READY;
1878                     goto exit;
1879                 }
1880             }
1881         }
1882 
1883         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1884         if (!stripeoff) {
1885             ERR("out of memory\n");
1886             MmUnlockPages(master_mdl);
1887             IoFreeMdl(master_mdl);
1888             Status = STATUS_INSUFFICIENT_RESOURCES;
1889             goto exit;
1890         }
1891 
1892         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes);
1893 
1894         pos = 0;
1895         stripe = startoffstripe / ci->sub_stripes;
1896         while (pos < length) {
1897             PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(stripes[stripe]->mdl + 1);
1898 
1899             if (pos == 0) {
1900                 uint32_t readlen = (uint32_t)min(stripes[stripe]->stripeend - stripes[stripe]->stripestart,
1901                                              ci->stripe_length - (stripes[stripe]->stripestart % ci->stripe_length));
1902 
1903                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1904 
1905                 stripeoff[stripe] += readlen;
1906                 pos += readlen;
1907             } else if (length - pos < ci->stripe_length) {
1908                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1909 
1910                 pos = length;
1911             } else {
1912                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1913 
1914                 stripeoff[stripe] += (ULONG)ci->stripe_length;
1915                 pos += (ULONG)ci->stripe_length;
1916             }
1917 
1918             stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
1919         }
1920 
1921         MmUnlockPages(master_mdl);
1922         IoFreeMdl(master_mdl);
1923 
1924         ExFreePool(stripeoff);
1925         ExFreePool(stripes);
1926     } else if (type == BLOCK_FLAG_DUPLICATE) {
1927         uint64_t orig_ls;
1928 
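        // Mirrored profiles: start from the chunk's last_stripe and advance round-robin until we
        // find a present device, then remember the next stripe to use for the following read.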
1929         if (c)
1930             orig_ls = i = c->last_stripe;
1931         else
1932             orig_ls = i = 0;
1933 
1934         while (!devices[i] || !devices[i]->devobj) {
1935             i = (i + 1) % ci->num_stripes;
1936 
1937             if (i == orig_ls) {
1938                 ERR("no devices available to service request\n");
1939                 Status = STATUS_DEVICE_NOT_READY;
1940                 goto exit;
1941             }
1942         }
1943 
1944         if (c)
1945             c->last_stripe = (i + 1) % ci->num_stripes;
1946 
1947         context.stripes[i].stripestart = addr - offset;
1948         context.stripes[i].stripeend = context.stripes[i].stripestart + length;
1949 
1950         if (file_read) {
1951             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1952 
1953             if (!context.va) {
1954                 ERR("out of memory\n");
1955                 Status = STATUS_INSUFFICIENT_RESOURCES;
1956                 goto exit;
1957             }
1958 
1959             context.stripes[i].mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1960             if (!context.stripes[i].mdl) {
1961                 ERR("IoAllocateMdl failed\n");
1962                 Status = STATUS_INSUFFICIENT_RESOURCES;
1963                 goto exit;
1964             }
1965 
1966             MmBuildMdlForNonPagedPool(context.stripes[i].mdl);
1967         } else {
1968             context.stripes[i].mdl = IoAllocateMdl(buf, length, false, false, NULL);
1969 
1970             if (!context.stripes[i].mdl) {
1971                 ERR("IoAllocateMdl failed\n");
1972                 Status = STATUS_INSUFFICIENT_RESOURCES;
1973                 goto exit;
1974             }
1975 
1976             Status = STATUS_SUCCESS;
1977 
1978             _SEH2_TRY {
1979                 MmProbeAndLockPages(context.stripes[i].mdl, KernelMode, IoWriteAccess);
1980             } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1981                 Status = _SEH2_GetExceptionCode();
1982             } _SEH2_END;
1983 
1984             if (!NT_SUCCESS(Status)) {
1985                 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1986                 goto exit;
1987             }
1988         }
1989     } else if (type == BLOCK_FLAG_RAID5) {
1990         uint64_t startoff, endoff;
1991         uint16_t endoffstripe, parity;
1992         uint32_t *stripeoff, pos;
1993         PMDL master_mdl;
1994         PFN_NUMBER *pfns, dummy = 0;
1995         bool need_dummy = false;
1996 
1997         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &startoff, &startoffstripe);
1998         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 1, &endoff, &endoffstripe);
1999 
2000         if (file_read) {
2001             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2002 
2003             if (!context.va) {
2004                 ERR("out of memory\n");
2005                 Status = STATUS_INSUFFICIENT_RESOURCES;
2006                 goto exit;
2007             }
2008         } else
2009             context.va = buf;
2010 
2011         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2012         if (!master_mdl) {
2013             ERR("out of memory\n");
2014             Status = STATUS_INSUFFICIENT_RESOURCES;
2015             goto exit;
2016         }
2017 
2018         Status = STATUS_SUCCESS;
2019 
2020         _SEH2_TRY {
2021             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
2022         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2023             Status = _SEH2_GetExceptionCode();
2024         } _SEH2_END;
2025 
2026         if (!NT_SUCCESS(Status)) {
2027             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2028             IoFreeMdl(master_mdl);
2029             goto exit;
2030         }
2031 
2032         pfns = (PFN_NUMBER*)(master_mdl + 1);
2033 
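        // First pass: work out each stripe's byte range on disk. The first and last rows may be
        // partial; whole rows in the middle extend every stripe's range (many at a time in the
        // 'skip' case), including the parity stripe, which is why a dummy page is needed below.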
2034         pos = 0;
2035         while (pos < length) {
2036             parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2037 
2038             if (pos == 0) {
2039                 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2040                 ULONG skip, readlen;
2041 
2042                 i = startoffstripe;
2043                 while (stripe != parity) {
2044                     if (i == startoffstripe) {
2045                         readlen = min(length, (ULONG)(ci->stripe_length - (startoff % ci->stripe_length)));
2046 
2047                         context.stripes[stripe].stripestart = startoff;
2048                         context.stripes[stripe].stripeend = startoff + readlen;
2049 
2050                         pos += readlen;
2051 
2052                         if (pos == length)
2053                             break;
2054                     } else {
2055                         readlen = min(length - pos, (ULONG)ci->stripe_length);
2056 
2057                         context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2058                         context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2059 
2060                         pos += readlen;
2061 
2062                         if (pos == length)
2063                             break;
2064                     }
2065 
2066                     i++;
2067                     stripe = (stripe + 1) % ci->num_stripes;
2068                 }
2069 
2070                 if (pos == length)
2071                     break;
2072 
2073                 for (i = 0; i < startoffstripe; i++) {
2074                     uint16_t stripe2 = (parity + i + 1) % ci->num_stripes;
2075 
2076                     context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2077                 }
2078 
2079                 context.stripes[parity].stripestart = context.stripes[parity].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2080 
2081                 if (length - pos > ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length) {
2082                     skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length)) - 1);
2083 
2084                     for (i = 0; i < ci->num_stripes; i++) {
2085                         context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2086                     }
2087 
2088                     pos += (uint32_t)(skip * (ci->num_stripes - 1) * ci->num_stripes * ci->stripe_length);
2089                     need_dummy = true;
2090                 }
2091             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2092                 for (i = 0; i < ci->num_stripes; i++) {
2093                     context.stripes[i].stripeend += ci->stripe_length;
2094                 }
2095 
2096                 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 1));
2097                 need_dummy = true;
2098             } else {
2099                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2100 
2101                 i = 0;
2102                 while (stripe != parity) {
2103                     if (endoffstripe == i) {
2104                         context.stripes[stripe].stripeend = endoff + 1;
2105                         break;
2106                     } else if (endoffstripe > i)
2107                         context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2108 
2109                     i++;
2110                     stripe = (stripe + 1) % ci->num_stripes;
2111                 }
2112 
2113                 break;
2114             }
2115         }
2116 
2117         for (i = 0; i < ci->num_stripes; i++) {
2118             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2119                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart),
2120                                                        false, false, NULL);
2121 
2122                 if (!context.stripes[i].mdl) {
2123                     ERR("IoAllocateMdl failed\n");
2124                     MmUnlockPages(master_mdl);
2125                     IoFreeMdl(master_mdl);
2126                     Status = STATUS_INSUFFICIENT_RESOURCES;
2127                     goto exit;
2128                 }
2129             }
2130         }
2131 
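        // Reads that span whole rows include the parity positions in each stripe's MDL. Those
        // pages are all pointed at a single throwaway page, so each device still gets one
        // contiguous request and the parity data it returns is simply discarded.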
2132         if (need_dummy) {
2133             dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2134             if (!dummypage) {
2135                 ERR("out of memory\n");
2136                 MmUnlockPages(master_mdl);
2137                 IoFreeMdl(master_mdl);
2138                 Status = STATUS_INSUFFICIENT_RESOURCES;
2139                 goto exit;
2140             }
2141 
2142             dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2143             if (!dummy_mdl) {
2144                 ERR("IoAllocateMdl failed\n");
2145                 MmUnlockPages(master_mdl);
2146                 IoFreeMdl(master_mdl);
2147                 Status = STATUS_INSUFFICIENT_RESOURCES;
2148                 goto exit;
2149             }
2150 
2151             MmBuildMdlForNonPagedPool(dummy_mdl);
2152 
2153             dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2154         }
2155 
2156         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2157         if (!stripeoff) {
2158             ERR("out of memory\n");
2159             MmUnlockPages(master_mdl);
2160             IoFreeMdl(master_mdl);
2161             Status = STATUS_INSUFFICIENT_RESOURCES;
2162             goto exit;
2163         }
2164 
2165         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2166 
2167         pos = 0;
2168 
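        // Second pass: splice PFNs from the master MDL into each stripe's MDL in row order,
        // substituting the dummy PFN wherever the parity stripe falls within a full row.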
2169         while (pos < length) {
2170             PFN_NUMBER* stripe_pfns;
2171 
2172             parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2173 
2174             if (pos == 0) {
2175                 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2176                 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2177                                                        ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2178 
2179                 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2180 
2181                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2182 
2183                 stripeoff[stripe] = readlen;
2184                 pos += readlen;
2185 
2186                 stripe = (stripe + 1) % ci->num_stripes;
2187 
2188                 while (stripe != parity) {
2189                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2190                     readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2191 
2192                     if (readlen == 0)
2193                         break;
2194 
2195                     RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2196 
2197                     stripeoff[stripe] = readlen;
2198                     pos += readlen;
2199 
2200                     stripe = (stripe + 1) % ci->num_stripes;
2201                 }
2202             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2203                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2204                 ULONG k;
2205 
2206                 while (stripe != parity) {
2207                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2208 
2209                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2210 
2211                     stripeoff[stripe] += (uint32_t)ci->stripe_length;
2212                     pos += (uint32_t)ci->stripe_length;
2213 
2214                     stripe = (stripe + 1) % ci->num_stripes;
2215                 }
2216 
2217                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity].mdl + 1);
2218 
2219                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2220                     stripe_pfns[stripeoff[parity] >> PAGE_SHIFT] = dummy;
2221                     stripeoff[parity] += PAGE_SIZE;
2222                 }
2223             } else {
2224                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2225                 uint32_t readlen;
2226 
2227                 while (pos < length) {
2228                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2229                     readlen = min(length - pos, (ULONG)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2230 
2231                     if (readlen == 0)
2232                         break;
2233 
2234                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2235 
2236                     stripeoff[stripe] += readlen;
2237                     pos += readlen;
2238 
2239                     stripe = (stripe + 1) % ci->num_stripes;
2240                 }
2241             }
2242         }
2243 
2244         MmUnlockPages(master_mdl);
2245         IoFreeMdl(master_mdl);
2246 
2247         ExFreePool(stripeoff);
2248     } else if (type == BLOCK_FLAG_RAID6) {
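        // RAID6 follows the same pattern as RAID5 above, but with num_stripes - 2 data stripes
        // per row and two rotating parity stripes (P and Q) to skip over or pad with the dummy page.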
2249         uint64_t startoff, endoff;
2250         uint16_t endoffstripe, parity1;
2251         uint32_t *stripeoff, pos;
2252         PMDL master_mdl;
2253         PFN_NUMBER *pfns, dummy = 0;
2254         bool need_dummy = false;
2255 
2256         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &startoff, &startoffstripe);
2257         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 2, &endoff, &endoffstripe);
2258 
2259         if (file_read) {
2260             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2261 
2262             if (!context.va) {
2263                 ERR("out of memory\n");
2264                 Status = STATUS_INSUFFICIENT_RESOURCES;
2265                 goto exit;
2266             }
2267         } else
2268             context.va = buf;
2269 
2270         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2271         if (!master_mdl) {
2272             ERR("out of memory\n");
2273             Status = STATUS_INSUFFICIENT_RESOURCES;
2274             goto exit;
2275         }
2276 
2277         Status = STATUS_SUCCESS;
2278 
2279         _SEH2_TRY {
2280             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
2281         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2282             Status = _SEH2_GetExceptionCode();
2283         } _SEH2_END;
2284 
2285         if (!NT_SUCCESS(Status)) {
2286             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2287             IoFreeMdl(master_mdl);
2288             goto exit;
2289         }
2290 
2291         pfns = (PFN_NUMBER*)(master_mdl + 1);
2292 
2293         pos = 0;
2294         while (pos < length) {
2295             parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
2296 
2297             if (pos == 0) {
2298                 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes, parity2;
2299                 ULONG skip, readlen;
2300 
2301                 i = startoffstripe;
2302                 while (stripe != parity1) {
2303                     if (i == startoffstripe) {
2304                         readlen = (ULONG)min(length, ci->stripe_length - (startoff % ci->stripe_length));
2305 
2306                         context.stripes[stripe].stripestart = startoff;
2307                         context.stripes[stripe].stripeend = startoff + readlen;
2308 
2309                         pos += readlen;
2310 
2311                         if (pos == length)
2312                             break;
2313                     } else {
2314                         readlen = min(length - pos, (ULONG)ci->stripe_length);
2315 
2316                         context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2317                         context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2318 
2319                         pos += readlen;
2320 
2321                         if (pos == length)
2322                             break;
2323                     }
2324 
2325                     i++;
2326                     stripe = (stripe + 1) % ci->num_stripes;
2327                 }
2328 
2329                 if (pos == length)
2330                     break;
2331 
2332                 for (i = 0; i < startoffstripe; i++) {
2333                     uint16_t stripe2 = (parity1 + i + 2) % ci->num_stripes;
2334 
2335                     context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2336                 }
2337 
2338                 context.stripes[parity1].stripestart = context.stripes[parity1].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2339 
2340                 parity2 = (parity1 + 1) % ci->num_stripes;
2341                 context.stripes[parity2].stripestart = context.stripes[parity2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2342 
2343                 if (length - pos > ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length) {
2344                     skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length)) - 1);
2345 
2346                     for (i = 0; i < ci->num_stripes; i++) {
2347                         context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2348                     }
2349 
2350                     pos += (uint32_t)(skip * (ci->num_stripes - 2) * ci->num_stripes * ci->stripe_length);
2351                     need_dummy = true;
2352                 }
2353             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2354                 for (i = 0; i < ci->num_stripes; i++) {
2355                     context.stripes[i].stripeend += ci->stripe_length;
2356                 }
2357 
2358                 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 2));
2359                 need_dummy = true;
2360             } else {
2361                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2362 
2363                 i = 0;
2364                 while (stripe != parity1) {
2365                     if (endoffstripe == i) {
2366                         context.stripes[stripe].stripeend = endoff + 1;
2367                         break;
2368                     } else if (endoffstripe > i)
2369                         context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2370 
2371                     i++;
2372                     stripe = (stripe + 1) % ci->num_stripes;
2373                 }
2374 
2375                 break;
2376             }
2377         }
2378 
2379         for (i = 0; i < ci->num_stripes; i++) {
2380             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2381                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
2382 
2383                 if (!context.stripes[i].mdl) {
2384                     ERR("IoAllocateMdl failed\n");
2385                     MmUnlockPages(master_mdl);
2386                     IoFreeMdl(master_mdl);
2387                     Status = STATUS_INSUFFICIENT_RESOURCES;
2388                     goto exit;
2389                 }
2390             }
2391         }
2392 
2393         if (need_dummy) {
2394             dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2395             if (!dummypage) {
2396                 ERR("out of memory\n");
2397                 MmUnlockPages(master_mdl);
2398                 IoFreeMdl(master_mdl);
2399                 Status = STATUS_INSUFFICIENT_RESOURCES;
2400                 goto exit;
2401             }
2402 
2403             dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2404             if (!dummy_mdl) {
2405                 ERR("IoAllocateMdl failed\n");
2406                 MmUnlockPages(master_mdl);
2407                 IoFreeMdl(master_mdl);
2408                 Status = STATUS_INSUFFICIENT_RESOURCES;
2409                 goto exit;
2410             }
2411 
2412             MmBuildMdlForNonPagedPool(dummy_mdl);
2413 
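                 /* The PFN array sits immediately after the MDL header, so this picks up
                  * the dummy page's PFN for reuse in the parity slots below. */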
2414             dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2415         }
2416 
2417         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2418         if (!stripeoff) {
2419             ERR("out of memory\n");
2420             MmUnlockPages(master_mdl);
2421             IoFreeMdl(master_mdl);
2422             Status = STATUS_INSUFFICIENT_RESOURCES;
2423             goto exit;
2424         }
2425 
2426         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2427 
2428         pos = 0;
2429 
2430         while (pos < length) {
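             /* Second pass: walk the logical range again, copying PFNs from the caller's
              * master MDL into each data stripe's MDL and plugging the dummy page into
              * the two parity positions so the per-device read lengths stay stripe-aligned. */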
2431             PFN_NUMBER* stripe_pfns;
2432 
2433             parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
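                 /* parity1 advances by one stripe per full data row; e.g. (illustrative
                  * assumption) with num_stripes == 5 successive rows place parity1 on
                  * stripes 3, 4, 0, 1, ..., with parity2 immediately after it. */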
2434 
2435             if (pos == 0) {
2436                 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes;
2437                 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2438                                                        ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2439 
2440                 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2441 
2442                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2443 
2444                 stripeoff[stripe] = readlen;
2445                 pos += readlen;
2446 
2447                 stripe = (stripe + 1) % ci->num_stripes;
2448 
2449                 while (stripe != parity1) {
2450                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2451                     readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2452 
2453                     if (readlen == 0)
2454                         break;
2455 
2456                     RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2457 
2458                     stripeoff[stripe] = readlen;
2459                     pos += readlen;
2460 
2461                     stripe = (stripe + 1) % ci->num_stripes;
2462                 }
2463             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2464                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2465                 uint16_t parity2 = (parity1 + 1) % ci->num_stripes;
2466                 ULONG k;
2467 
2468                 while (stripe != parity1) {
2469                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2470 
2471                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2472 
2473                     stripeoff[stripe] += (uint32_t)ci->stripe_length;
2474                     pos += (uint32_t)ci->stripe_length;
2475 
2476                     stripe = (stripe + 1) % ci->num_stripes;
2477                 }
2478 
2479                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity1].mdl + 1);
2480 
2481                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2482                     stripe_pfns[stripeoff[parity1] >> PAGE_SHIFT] = dummy;
2483                     stripeoff[parity1] += PAGE_SIZE;
2484                 }
2485 
2486                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity2].mdl + 1);
2487 
2488                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2489                     stripe_pfns[stripeoff[parity2] >> PAGE_SHIFT] = dummy;
2490                     stripeoff[parity2] += PAGE_SIZE;
2491                 }
2492             } else {
2493                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2494                 uint32_t readlen;
2495 
2496                 while (pos < length) {
2497                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2498                     readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2499 
2500                     if (readlen == 0)
2501                         break;
2502 
2503                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2504 
2505                     stripeoff[stripe] += readlen;
2506                     pos += readlen;
2507 
2508                     stripe = (stripe + 1) % ci->num_stripes;
2509                 }
2510             }
2511         }
2512 
2513         MmUnlockPages(master_mdl);
2514         IoFreeMdl(master_mdl);
2515 
2516         ExFreePool(stripeoff);
2517     }
2518 
2519     context.address = addr;
2520 
2521     for (i = 0; i < ci->num_stripes; i++) {
2522         if (!devices[i] || !devices[i]->devobj || context.stripes[i].stripestart == context.stripes[i].stripeend) {
2523             context.stripes[i].status = ReadDataStatus_MissingDevice;
2524             context.stripes_left--;
2525 
2526             if (!devices[i] || !devices[i]->devobj)
2527                 missing_devices++;
2528         }
2529     }
2530 
2531     if (missing_devices > allowed_missing) {
2532         ERR("not enough devices to service request (%u missing)\n", missing_devices);
2533         Status = STATUS_UNEXPECTED_IO_ERROR;
2534         goto exit;
2535     }
2536 
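         /* Build one read IRP per live stripe: buffered-I/O devices get a bounce buffer,
          * direct-I/O devices are handed the stripe MDL directly, and each IRP is given
          * the shared completion routine before being sent below. */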
2537     for (i = 0; i < ci->num_stripes; i++) {
2538         PIO_STACK_LOCATION IrpSp;
2539 
2540         if (devices[i] && devices[i]->devobj && context.stripes[i].stripestart != context.stripes[i].stripeend && context.stripes[i].status != ReadDataStatus_Skip) {
2541             context.stripes[i].context = (struct read_data_context*)&context;
2542 
2543             if (type == BLOCK_FLAG_RAID10) {
2544                 context.stripes[i].stripenum = i / ci->sub_stripes;
2545             }
2546 
2547             if (!Irp) {
2548                 context.stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, false);
2549 
2550                 if (!context.stripes[i].Irp) {
2551                     ERR("IoAllocateIrp failed\n");
2552                     Status = STATUS_INSUFFICIENT_RESOURCES;
2553                     goto exit;
2554                 }
2555             } else {
2556                 context.stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
2557 
2558                 if (!context.stripes[i].Irp) {
2559                     ERR("IoMakeAssociatedIrp failed\n");
2560                     Status = STATUS_INSUFFICIENT_RESOURCES;
2561                     goto exit;
2562                 }
2563             }
2564 
2565             IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2566             IrpSp->MajorFunction = IRP_MJ_READ;
2567             IrpSp->MinorFunction = IRP_MN_NORMAL;
2568             IrpSp->FileObject = devices[i]->fileobj;
2569 
2570             if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2571                 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), ALLOC_TAG);
2572                 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2573                     ERR("out of memory\n");
2574                     Status = STATUS_INSUFFICIENT_RESOURCES;
2575                     goto exit;
2576                 }
2577 
2578                 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2579 
2580                 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2581             } else if (devices[i]->devobj->Flags & DO_DIRECT_IO)
2582                 context.stripes[i].Irp->MdlAddress = context.stripes[i].mdl;
2583             else
2584                 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2585 
2586             IrpSp->Parameters.Read.Length = (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart);
2587             IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].stripestart + cis[i].offset;
2588 
2589             total_reading += IrpSp->Parameters.Read.Length;
2590 
2591             context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2592 
2593             IoSetCompletionRoutine(context.stripes[i].Irp, read_data_completion, &context.stripes[i], true, true, true);
2594 
2595             context.stripes[i].status = ReadDataStatus_Pending;
2596         }
2597     }
2598 
2599     need_to_wait = false;
2600     for (i = 0; i < ci->num_stripes; i++) {
2601         if (context.stripes[i].status != ReadDataStatus_MissingDevice && context.stripes[i].status != ReadDataStatus_Skip) {
2602             IoCallDriver(devices[i]->devobj, context.stripes[i].Irp);
2603             need_to_wait = true;
2604         }
2605     }
2606 
2607     if (need_to_wait)
2608         KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2609 
2610     if (diskacc)
2611         fFsRtlUpdateDiskCounters(total_reading, 0);
2612 
2613     // check if any of the devices return a "user-induced" error
2614 
2615     for (i = 0; i < ci->num_stripes; i++) {
2616         if (context.stripes[i].status == ReadDataStatus_Error && IoIsErrorUserInduced(context.stripes[i].iosb.Status)) {
2617             Status = context.stripes[i].iosb.Status;
2618             goto exit;
2619         }
2620     }
2621 
2622     if (type == BLOCK_FLAG_RAID0) {
2623         Status = read_data_raid0(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2624         if (!NT_SUCCESS(Status)) {
2625             ERR("read_data_raid0 returned %08lx\n", Status);
2626 
2627             if (file_read)
2628                 ExFreePool(context.va);
2629 
2630             goto exit;
2631         }
2632 
2633         if (file_read) {
2634             RtlCopyMemory(buf, context.va, length);
2635             ExFreePool(context.va);
2636         }
2637     } else if (type == BLOCK_FLAG_RAID10) {
2638         Status = read_data_raid10(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2639 
2640         if (!NT_SUCCESS(Status)) {
2641             ERR("read_data_raid10 returned %08lx\n", Status);
2642 
2643             if (file_read)
2644                 ExFreePool(context.va);
2645 
2646             goto exit;
2647         }
2648 
2649         if (file_read) {
2650             RtlCopyMemory(buf, context.va, length);
2651             ExFreePool(context.va);
2652         }
2653     } else if (type == BLOCK_FLAG_DUPLICATE) {
2654         Status = read_data_dup(Vcb, file_read ? context.va : buf, addr, &context, ci, devices, generation);
2655         if (!NT_SUCCESS(Status)) {
2656             ERR("read_data_dup returned %08lx\n", Status);
2657 
2658             if (file_read)
2659                 ExFreePool(context.va);
2660 
2661             goto exit;
2662         }
2663 
2664         if (file_read) {
2665             RtlCopyMemory(buf, context.va, length);
2666             ExFreePool(context.va);
2667         }
2668     } else if (type == BLOCK_FLAG_RAID5) {
2669         Status = read_data_raid5(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2670         if (!NT_SUCCESS(Status)) {
2671             ERR("read_data_raid5 returned %08lx\n", Status);
2672 
2673             if (file_read)
2674                 ExFreePool(context.va);
2675 
2676             goto exit;
2677         }
2678 
2679         if (file_read) {
2680             RtlCopyMemory(buf, context.va, length);
2681             ExFreePool(context.va);
2682         }
2683     } else if (type == BLOCK_FLAG_RAID6) {
2684         Status = read_data_raid6(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2685         if (!NT_SUCCESS(Status)) {
2686             ERR("read_data_raid6 returned %08lx\n", Status);
2687 
2688             if (file_read)
2689                 ExFreePool(context.va);
2690 
2691             goto exit;
2692         }
2693 
2694         if (file_read) {
2695             RtlCopyMemory(buf, context.va, length);
2696             ExFreePool(context.va);
2697         }
2698     }
2699 
2700 exit:
2701     if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6))
2702         chunk_unlock_range(Vcb, c, lockaddr, locklen);
2703 
2704     if (dummy_mdl)
2705         IoFreeMdl(dummy_mdl);
2706 
2707     if (dummypage)
2708         ExFreePool(dummypage);
2709 
2710     for (i = 0; i < ci->num_stripes; i++) {
2711         if (context.stripes[i].mdl) {
2712             if (context.stripes[i].mdl->MdlFlags & MDL_PAGES_LOCKED)
2713                 MmUnlockPages(context.stripes[i].mdl);
2714 
2715             IoFreeMdl(context.stripes[i].mdl);
2716         }
2717 
2718         if (context.stripes[i].Irp)
2719             IoFreeIrp(context.stripes[i].Irp);
2720     }
2721 
2722     ExFreePool(context.stripes);
2723 
2724     if (!Vcb->log_to_phys_loaded)
2725         ExFreePool(devices);
2726 
2727     return Status;
2728 }
2729 
2730 __attribute__((nonnull(1, 2)))
2731 NTSTATUS read_stream(fcb* fcb, uint8_t* data, uint64_t start, ULONG length, ULONG* pbr) {
2732     ULONG readlen;
2733 
2734     TRACE("(%p, %p, %I64x, %lx, %p)\n", fcb, data, start, length, pbr);
2735 
2736     if (pbr) *pbr = 0;
2737 
2738     if (start >= fcb->adsdata.Length) {
2739         TRACE("tried to read beyond end of stream\n");
2740         return STATUS_END_OF_FILE;
2741     }
2742 
2743     if (length == 0) {
2744         WARN("tried to read zero bytes\n");
2745         return STATUS_SUCCESS;
2746     }
2747 
2748     if (start + length < fcb->adsdata.Length)
2749         readlen = length;
2750     else
2751         readlen = fcb->adsdata.Length - (ULONG)start;
2752 
2753     if (readlen > 0)
2754         RtlCopyMemory(data, fcb->adsdata.Buffer + start, readlen);
2755 
2756     if (pbr) *pbr = readlen;
2757 
2758     return STATUS_SUCCESS;
2759 }
2760 
2761 typedef struct {
2762     uint64_t off;
2763     uint64_t ed_size;
2764     uint64_t ed_offset;
2765     uint64_t ed_num_bytes;
2766 } read_part_extent;
2767 
2768 typedef struct {
2769     LIST_ENTRY list_entry;
2770     uint64_t addr;
2771     chunk* c;
2772     uint32_t read;
2773     uint32_t to_read;
2774     void* csum;
2775     bool csum_free;
2776     uint8_t* buf;
2777     bool buf_free;
2778     uint32_t bumpoff;
2779     bool mdl;
2780     void* data;
2781     uint8_t compression;
2782     unsigned int num_extents;
2783     read_part_extent extents[1];
2784 } read_part;
2785 
2786 typedef struct {
2787     LIST_ENTRY list_entry;
2788     calc_job* cj;
2789     void* decomp;
2790     void* data;
2791     unsigned int offset;
2792     size_t length;
2793 } comp_calc_job;
2794 
2795 __attribute__((nonnull(1, 2)))
2796 NTSTATUS read_file(fcb* fcb, uint8_t* data, uint64_t start, uint64_t length, ULONG* pbr, PIRP Irp) {
2797     NTSTATUS Status;
2798     uint32_t bytes_read = 0;
2799     uint64_t last_end;
2800     LIST_ENTRY* le;
2801     POOL_TYPE pool_type;
2802     LIST_ENTRY read_parts, calc_jobs;
2803 
2804     TRACE("(%p, %p, %I64x, %I64x, %p)\n", fcb, data, start, length, pbr);
2805 
2806     if (pbr)
2807         *pbr = 0;
2808 
2809     if (start >= fcb->inode_item.st_size) {
2810         WARN("Tried to read beyond end of file\n");
2811         return STATUS_END_OF_FILE;
2812     }
2813 
2814     InitializeListHead(&read_parts);
2815     InitializeListHead(&calc_jobs);
2816 
2817     pool_type = fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? NonPagedPool : PagedPool;
2818 
2819     le = fcb->extents.Flink;
2820 
2821     last_end = start;
2822 
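         /* First pass over the extent list: inline and prealloc extents are satisfied
          * straight away, while regular extents are queued as read_parts so the disk
          * reads (and any decompression) can be batched afterwards. */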
2823     while (le != &fcb->extents) {
2824         extent* ext = CONTAINING_RECORD(le, extent, list_entry);
2825 
2826         if (!ext->ignore) {
2827             EXTENT_DATA* ed = &ext->extent_data;
2828             uint64_t len;
2829 
2830             if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC)
2831                 len = ((EXTENT_DATA2*)ed->data)->num_bytes;
2832             else
2833                 len = ed->decoded_size;
2834 
2835             if (ext->offset + len <= start) {
2836                 last_end = ext->offset + len;
2837                 goto nextitem;
2838             }
2839 
2840             if (ext->offset > last_end && ext->offset > start + bytes_read) {
2841                 uint32_t read = (uint32_t)min(length, ext->offset - max(start, last_end));
2842 
2843                 RtlZeroMemory(data + bytes_read, read);
2844                 bytes_read += read;
2845                 length -= read;
2846             }
2847 
2848             if (length == 0 || ext->offset > start + bytes_read + length)
2849                 break;
2850 
2851             if (ed->encryption != BTRFS_ENCRYPTION_NONE) {
2852                 WARN("Encryption not supported\n");
2853                 Status = STATUS_NOT_IMPLEMENTED;
2854                 goto exit;
2855             }
2856 
2857             if (ed->encoding != BTRFS_ENCODING_NONE) {
2858                 WARN("Other encodings not supported\n");
2859                 Status = STATUS_NOT_IMPLEMENTED;
2860                 goto exit;
2861             }
2862 
2863             switch (ed->type) {
2864                 case EXTENT_TYPE_INLINE:
2865                 {
2866                     uint64_t off = start + bytes_read - ext->offset;
2867                     uint32_t read;
2868 
2869                     if (ed->compression == BTRFS_COMPRESSION_NONE) {
2870                         read = (uint32_t)min(min(len, ext->datalen) - off, length);
2871 
2872                         RtlCopyMemory(data + bytes_read, &ed->data[off], read);
2873                     } else if (ed->compression == BTRFS_COMPRESSION_ZLIB || ed->compression == BTRFS_COMPRESSION_LZO || ed->compression == BTRFS_COMPRESSION_ZSTD) {
2874                         uint8_t* decomp;
2875                         bool decomp_alloc;
2876                         uint16_t inlen = ext->datalen - (uint16_t)offsetof(EXTENT_DATA, data[0]);
2877 
2878                         if (ed->decoded_size == 0 || ed->decoded_size > 0xffffffff) {
2879                             ERR("ed->decoded_size was invalid (%I64x)\n", ed->decoded_size);
2880                             Status = STATUS_INTERNAL_ERROR;
2881                             goto exit;
2882                         }
2883 
2884                         read = (uint32_t)min(ed->decoded_size - off, length);
2885 
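                                 /* A read starting partway into an inline extent still has
                                  * to decompress from the beginning, so decompress into a
                                  * scratch buffer and copy out just the requested range. */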
2886                         if (off > 0) {
2887                             decomp = ExAllocatePoolWithTag(NonPagedPool, (uint32_t)ed->decoded_size, ALLOC_TAG);
2888                             if (!decomp) {
2889                                 ERR("out of memory\n");
2890                                 Status = STATUS_INSUFFICIENT_RESOURCES;
2891                                 goto exit;
2892                             }
2893 
2894                             decomp_alloc = true;
2895                         } else {
2896                             decomp = data + bytes_read;
2897                             decomp_alloc = false;
2898                         }
2899 
2900                         if (ed->compression == BTRFS_COMPRESSION_ZLIB) {
2901                             Status = zlib_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2902                             if (!NT_SUCCESS(Status)) {
2903                                 ERR("zlib_decompress returned %08lx\n", Status);
2904                                 if (decomp_alloc) ExFreePool(decomp);
2905                                 goto exit;
2906                             }
2907                         } else if (ed->compression == BTRFS_COMPRESSION_LZO) {
2908                             if (inlen < sizeof(uint32_t)) {
2909                                 ERR("extent data was truncated\n");
2910                                 Status = STATUS_INTERNAL_ERROR;
2911                                 if (decomp_alloc) ExFreePool(decomp);
2912                                 goto exit;
2913                             } else
2914                                 inlen -= sizeof(uint32_t);
2915 
2916                             Status = lzo_decompress(ed->data + sizeof(uint32_t), inlen, decomp, (uint32_t)(read + off), sizeof(uint32_t));
2917                             if (!NT_SUCCESS(Status)) {
2918                                 ERR("lzo_decompress returned %08lx\n", Status);
2919                                 if (decomp_alloc) ExFreePool(decomp);
2920                                 goto exit;
2921                             }
2922                         } else if (ed->compression == BTRFS_COMPRESSION_ZSTD) {
2923                             Status = zstd_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2924                             if (!NT_SUCCESS(Status)) {
2925                                 ERR("zstd_decompress returned %08lx\n", Status);
2926                                 if (decomp_alloc) ExFreePool(decomp);
2927                                 goto exit;
2928                             }
2929                         }
2930 
2931                         if (decomp_alloc) {
2932                             RtlCopyMemory(data + bytes_read, decomp + off, read);
2933                             ExFreePool(decomp);
2934                         }
2935                     } else {
2936                         ERR("unhandled compression type %x\n", ed->compression);
2937                         Status = STATUS_NOT_IMPLEMENTED;
2938                         goto exit;
2939                     }
2940 
2941                     bytes_read += read;
2942                     length -= read;
2943 
2944                     break;
2945                 }
2946 
2947                 case EXTENT_TYPE_REGULAR:
2948                 {
2949                     EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
2950                     read_part* rp;
2951 
2952                     rp = ExAllocatePoolWithTag(pool_type, sizeof(read_part), ALLOC_TAG);
2953                     if (!rp) {
2954                         ERR("out of memory\n");
2955                         Status = STATUS_INSUFFICIENT_RESOURCES;
2956                         goto exit;
2957                     }
2958 
2959                     rp->mdl = (Irp && Irp->MdlAddress) ? true : false;
2960                     rp->extents[0].off = start + bytes_read - ext->offset;
2961                     rp->bumpoff = 0;
2962                     rp->num_extents = 1;
2963                     rp->csum_free = false;
2964 
2965                     rp->read = (uint32_t)(len - rp->extents[0].off);
2966                     if (rp->read > length) rp->read = (uint32_t)length;
2967 
2968                     if (ed->compression == BTRFS_COMPRESSION_NONE) {
2969                         rp->addr = ed2->address + ed2->offset + rp->extents[0].off;
2970                         rp->to_read = (uint32_t)sector_align(rp->read, fcb->Vcb->superblock.sector_size);
2971 
2972                         if (rp->addr & (fcb->Vcb->superblock.sector_size - 1)) {
2973                             rp->bumpoff = rp->addr & (fcb->Vcb->superblock.sector_size - 1);
2974                             rp->addr -= rp->bumpoff;
2975                             rp->to_read = (uint32_t)sector_align(rp->read + rp->bumpoff, fcb->Vcb->superblock.sector_size);
2976                         }
2977                     } else {
2978                         rp->addr = ed2->address;
2979                         rp->to_read = (uint32_t)sector_align(ed2->size, fcb->Vcb->superblock.sector_size);
2980                     }
2981 
2982                     if (ed->compression == BTRFS_COMPRESSION_NONE && (start & (fcb->Vcb->superblock.sector_size - 1)) == 0 &&
2983                         (length & (fcb->Vcb->superblock.sector_size - 1)) == 0) {
2984                         rp->buf = data + bytes_read;
2985                         rp->buf_free = false;
2986                     } else {
2987                         rp->buf = ExAllocatePoolWithTag(pool_type, rp->to_read, ALLOC_TAG);
2988                         rp->buf_free = true;
2989 
2990                         if (!rp->buf) {
2991                             ERR("out of memory\n");
2992                             Status = STATUS_INSUFFICIENT_RESOURCES;
2993                             ExFreePool(rp);
2994                             goto exit;
2995                         }
2996 
2997                         rp->mdl = false;
2998                     }
2999 
3000                     rp->c = get_chunk_from_address(fcb->Vcb, rp->addr);
3001 
3002                     if (!rp->c) {
3003                         ERR("get_chunk_from_address(%I64x) failed\n", rp->addr);
3004 
3005                         if (rp->buf_free)
3006                             ExFreePool(rp->buf);
3007 
3008                         ExFreePool(rp);
3009 
3010                         Status = STATUS_INTERNAL_ERROR;
3011                         goto exit;
3012                     }
3013 
3014                     if (ext->csum) {
3015                         if (ed->compression == BTRFS_COMPRESSION_NONE) {
3016                             rp->csum = (uint8_t*)ext->csum + (fcb->Vcb->csum_size * (rp->extents[0].off >> fcb->Vcb->sector_shift));
3017                         } else
3018                             rp->csum = ext->csum;
3019                     } else
3020                         rp->csum = NULL;
3021 
3022                     rp->data = data + bytes_read;
3023                     rp->compression = ed->compression;
3024                     rp->extents[0].ed_offset = ed2->offset;
3025                     rp->extents[0].ed_size = ed2->size;
3026                     rp->extents[0].ed_num_bytes = ed2->num_bytes;
3027 
3028                     InsertTailList(&read_parts, &rp->list_entry);
3029 
3030                     bytes_read += rp->read;
3031                     length -= rp->read;
3032 
3033                     break;
3034                 }
3035 
3036                 case EXTENT_TYPE_PREALLOC:
3037                 {
3038                     uint64_t off = start + bytes_read - ext->offset;
3039                     uint32_t read = (uint32_t)(len - off);
3040 
3041                     if (read > length) read = (uint32_t)length;
3042 
3043                     RtlZeroMemory(data + bytes_read, read);
3044 
3045                     bytes_read += read;
3046                     length -= read;
3047 
3048                     break;
3049                 }
3050 
3051                 default:
3052                     WARN("Unsupported extent data type %u\n", ed->type);
3053                     Status = STATUS_NOT_IMPLEMENTED;
3054                     goto exit;
3055             }
3056 
3057             last_end = ext->offset + len;
3058 
3059             if (length == 0)
3060                 break;
3061         }
3062 
3063 nextitem:
3064         le = le->Flink;
3065     }
3066 
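         /* Coalesce adjacent compressed read_parts that are contiguous both on disk and
          * in the output buffer, so each merged run needs only one read_data call. */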
3067     if (!IsListEmpty(&read_parts) && read_parts.Flink->Flink != &read_parts) { // at least two entries in list
3068         read_part* last_rp = CONTAINING_RECORD(read_parts.Flink, read_part, list_entry);
3069 
3070         le = read_parts.Flink->Flink;
3071         while (le != &read_parts) {
3072             LIST_ENTRY* le2 = le->Flink;
3073             read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3074 
3075             // merge together runs
3076             if (rp->compression != BTRFS_COMPRESSION_NONE && rp->compression == last_rp->compression && rp->addr == last_rp->addr + last_rp->to_read &&
3077                 rp->data == (uint8_t*)last_rp->data + last_rp->read && rp->c == last_rp->c && ((rp->csum && last_rp->csum) || (!rp->csum && !last_rp->csum))) {
3078                 read_part* rp2;
3079 
3080                 rp2 = ExAllocatePoolWithTag(pool_type, offsetof(read_part, extents) + (sizeof(read_part_extent) * (last_rp->num_extents + 1)), ALLOC_TAG);
                     if (!rp2) {
                         ERR("out of memory\n");
                         Status = STATUS_INSUFFICIENT_RESOURCES;
                         goto exit;
                     }
3081 
3082                 rp2->addr = last_rp->addr;
3083                 rp2->c = last_rp->c;
3084                 rp2->read = last_rp->read + rp->read;
3085                 rp2->to_read = last_rp->to_read + rp->to_read;
3086                 rp2->csum_free = false;
3087 
3088                 if (last_rp->csum) {
3089                     uint32_t sectors = (last_rp->to_read + rp->to_read) >> fcb->Vcb->sector_shift;
3090 
3091                     rp2->csum = ExAllocatePoolWithTag(pool_type, sectors * fcb->Vcb->csum_size, ALLOC_TAG);
3092                     if (!rp2->csum) {
3093                         ERR("out of memory\n");
3094                         ExFreePool(rp2);
3095                         Status = STATUS_INSUFFICIENT_RESOURCES;
3096                         goto exit;
3097                     }
3098 
3099                     RtlCopyMemory(rp2->csum, last_rp->csum, (last_rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift);
3100                     RtlCopyMemory((uint8_t*)rp2->csum + ((last_rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift), rp->csum,
3101                                   (rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift);
3102 
3103                     rp2->csum_free = true;
3104                 } else
3105                     rp2->csum = NULL;
3106 
3107                 rp2->buf = ExAllocatePoolWithTag(pool_type, rp2->to_read, ALLOC_TAG);
3108                 if (!rp2->buf) {
3109                     ERR("out of memory\n");
3110 
3111                     if (rp2->csum)
3112                         ExFreePool(rp2->csum);
3113 
3114                     ExFreePool(rp2);
3115                     Status = STATUS_INSUFFICIENT_RESOURCES;
3116                     goto exit;
3117                 }
3118 
3119                 rp2->buf_free = true;
3120                 rp2->bumpoff = 0;
3121                 rp2->mdl = false;
3122                 rp2->data = last_rp->data;
3123                 rp2->compression = last_rp->compression;
3124                 rp2->num_extents = last_rp->num_extents + 1;
3125 
3126                 RtlCopyMemory(rp2->extents, last_rp->extents, last_rp->num_extents * sizeof(read_part_extent));
3127                 RtlCopyMemory(&rp2->extents[last_rp->num_extents], rp->extents, sizeof(read_part_extent));
3128 
3129                 InsertHeadList(le->Blink, &rp2->list_entry);
3130 
3131                 if (rp->buf_free)
3132                     ExFreePool(rp->buf);
3133 
3134                 if (rp->csum_free)
3135                     ExFreePool(rp->csum);
3136 
3137                 RemoveEntryList(&rp->list_entry);
3138 
3139                 ExFreePool(rp);
3140 
3141                 if (last_rp->buf_free)
3142                     ExFreePool(last_rp->buf);
3143 
3144                 if (last_rp->csum_free)
3145                     ExFreePool(last_rp->csum);
3146 
3147                 RemoveEntryList(&last_rp->list_entry);
3148 
3149                 ExFreePool(last_rp);
3150 
3151                 last_rp = rp2;
3152             } else
3153                 last_rp = rp;
3154 
3155             le = le2;
3156         }
3157     }
3158 
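         /* Issue the actual reads; compressed runs are not decompressed inline but queued
          * as calc jobs, which are drained once all the reads have completed. */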
3159     le = read_parts.Flink;
3160     while (le != &read_parts) {
3161         read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3162 
3163         Status = read_data(fcb->Vcb, rp->addr, rp->to_read, rp->csum, false, rp->buf, rp->c, NULL, Irp, 0, rp->mdl,
3164                            fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3165         if (!NT_SUCCESS(Status)) {
3166             ERR("read_data returned %08lx\n", Status);
3167             goto exit;
3168         }
3169 
3170         if (rp->compression == BTRFS_COMPRESSION_NONE) {
3171             if (rp->buf_free)
3172                 RtlCopyMemory(rp->data, rp->buf + rp->bumpoff, rp->read);
3173         } else {
3174             uint8_t* buf = rp->buf;
3175 
3176             for (unsigned int i = 0; i < rp->num_extents; i++) {
3177                 uint8_t *decomp = NULL, *buf2;
3178                 ULONG outlen, inlen, off2;
3179                 uint32_t inpageoff = 0;
3180                 comp_calc_job* ccj;
3181 
3182                 off2 = (ULONG)(rp->extents[i].ed_offset + rp->extents[i].off);
3183                 buf2 = buf;
3184                 inlen = (ULONG)rp->extents[i].ed_size;
3185 
3186                 if (rp->compression == BTRFS_COMPRESSION_LZO) {
3187                     ULONG inoff = sizeof(uint32_t);
3188 
3189                     inlen -= sizeof(uint32_t);
3190 
3191                     // If reading a few sectors in, skip to the interesting bit
3192                     while (off2 > LZO_PAGE_SIZE) {
3193                         uint32_t partlen;
3194 
3195                         if (inlen < sizeof(uint32_t))
3196                             break;
3197 
3198                         partlen = *(uint32_t*)(buf2 + inoff);
3199 
3200                         if (partlen < inlen) {
3201                             off2 -= LZO_PAGE_SIZE;
3202                             inoff += partlen + sizeof(uint32_t);
3203                             inlen -= partlen + sizeof(uint32_t);
3204 
3205                             if (LZO_PAGE_SIZE - (inoff % LZO_PAGE_SIZE) < sizeof(uint32_t))
3206                                 inoff = ((inoff / LZO_PAGE_SIZE) + 1) * LZO_PAGE_SIZE;
3207                         } else
3208                             break;
3209                     }
3210 
3211                     buf2 = &buf2[inoff];
3212                     inpageoff = inoff % LZO_PAGE_SIZE;
3213                 }
3214 
3215                 /* Previous versions of this code decompressed directly into the destination buffer,
3216                  * but unfortunately that can't be relied on - Windows likes to use dummy pages sometimes
3217                  * when mmap-ing, which breaks the backtracking used by e.g. zstd. */
3218 
3219                 if (off2 != 0)
3220                     outlen = off2 + min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3221                 else
3222                     outlen = min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3223 
3224                 decomp = ExAllocatePoolWithTag(pool_type, outlen, ALLOC_TAG);
3225                 if (!decomp) {
3226                     ERR("out of memory\n");
3227                     Status = STATUS_INSUFFICIENT_RESOURCES;
3228                     goto exit;
3229                 }
3230 
3231                 ccj = (comp_calc_job*)ExAllocatePoolWithTag(pool_type, sizeof(comp_calc_job), ALLOC_TAG);
3232                 if (!ccj) {
3233                     ERR("out of memory\n");
3234 
3235                     ExFreePool(decomp);
3236 
3237                     Status = STATUS_INSUFFICIENT_RESOURCES;
3238                     goto exit;
3239                 }
3240 
3241                 ccj->data = rp->data;
3242                 ccj->decomp = decomp;
3243 
3244                 ccj->offset = off2;
3245                 ccj->length = (size_t)min(rp->read, rp->extents[i].ed_num_bytes - rp->extents[i].off);
3246 
3247                 Status = add_calc_job_decomp(fcb->Vcb, rp->compression, buf2, inlen, decomp, outlen,
3248                                              inpageoff, &ccj->cj);
3249                 if (!NT_SUCCESS(Status)) {
3250                     ERR("add_calc_job_decomp returned %08lx\n", Status);
3251 
3252                     ExFreePool(decomp);
3253                     ExFreePool(ccj);
3254 
3255                     goto exit;
3256                 }
3257 
3258                 InsertTailList(&calc_jobs, &ccj->list_entry);
3259 
3260                 buf += rp->extents[i].ed_size;
3261                 rp->data = (uint8_t*)rp->data + rp->extents[i].ed_num_bytes - rp->extents[i].off;
3262                 rp->read -= (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off);
3263             }
3264         }
3265 
3266         le = le->Flink;
3267     }
3268 
3269     if (length > 0 && start + bytes_read < fcb->inode_item.st_size) {
3270         uint32_t read = (uint32_t)min(fcb->inode_item.st_size - start - bytes_read, length);
3271 
3272         RtlZeroMemory(data + bytes_read, read);
3273 
3274         bytes_read += read;
3275         length -= read;
3276     }
3277 
3278     Status = STATUS_SUCCESS;
3279 
3280     while (!IsListEmpty(&calc_jobs)) {
3281         comp_calc_job* ccj = CONTAINING_RECORD(RemoveTailList(&calc_jobs), comp_calc_job, list_entry);
3282 
3283         calc_thread_main(fcb->Vcb, ccj->cj);
3284 
3285         KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3286 
3287         if (!NT_SUCCESS(ccj->cj->Status))
3288             Status = ccj->cj->Status;
3289 
3290         RtlCopyMemory(ccj->data, (uint8_t*)ccj->decomp + ccj->offset, ccj->length);
3291         ExFreePool(ccj->decomp);
3292 
3293         ExFreePool(ccj);
3294     }
3295 
3296     if (pbr)
3297         *pbr = bytes_read;
3298 
3299 exit:
3300     while (!IsListEmpty(&read_parts)) {
3301         read_part* rp = CONTAINING_RECORD(RemoveHeadList(&read_parts), read_part, list_entry);
3302 
3303         if (rp->buf_free)
3304             ExFreePool(rp->buf);
3305 
3306         if (rp->csum_free)
3307             ExFreePool(rp->csum);
3308 
3309         ExFreePool(rp);
3310     }
3311 
3312     while (!IsListEmpty(&calc_jobs)) {
3313         comp_calc_job* ccj = CONTAINING_RECORD(RemoveHeadList(&calc_jobs), comp_calc_job, list_entry);
3314 
3315         KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3316 
3317         if (ccj->decomp)
3318             ExFreePool(ccj->decomp);
3319 
3320         ExFreePool(ccj->cj);
3321 
3322         ExFreePool(ccj);
3323     }
3324 
3325     return Status;
3326 }
3327 
3328 NTSTATUS do_read(PIRP Irp, bool wait, ULONG* bytes_read) {
3329     PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3330     PFILE_OBJECT FileObject = IrpSp->FileObject;
3331     fcb* fcb = FileObject->FsContext;
3332     uint8_t* data = NULL;
3333     ULONG length = IrpSp->Parameters.Read.Length, addon = 0;
3334     uint64_t start = IrpSp->Parameters.Read.ByteOffset.QuadPart;
3335 
3336     *bytes_read = 0;
3337 
3338     if (!fcb || !fcb->Vcb || !fcb->subvol)
3339         return STATUS_INTERNAL_ERROR;
3340 
3341     TRACE("fcb = %p\n", fcb);
3342     TRACE("offset = %I64x, length = %lx\n", start, length);
3343     TRACE("paging_io = %s, no cache = %s\n", Irp->Flags & IRP_PAGING_IO ? "true" : "false", Irp->Flags & IRP_NOCACHE ? "true" : "false");
3344 
3345     if (!fcb->ads && fcb->type == BTRFS_TYPE_DIRECTORY)
3346         return STATUS_INVALID_DEVICE_REQUEST;
3347 
3348     if (!(Irp->Flags & IRP_PAGING_IO) && !FsRtlCheckLockForReadAccess(&fcb->lock, Irp)) {
3349         WARN("tried to read locked region\n");
3350         return STATUS_FILE_LOCK_CONFLICT;
3351     }
3352 
3353     if (length == 0) {
3354         TRACE("tried to read zero bytes\n");
3355         return STATUS_SUCCESS;
3356     }
3357 
3358     if (start >= (uint64_t)fcb->Header.FileSize.QuadPart) {
3359         TRACE("tried to read with offset after file end (%I64x >= %I64x)\n", start, fcb->Header.FileSize.QuadPart);
3360         return STATUS_END_OF_FILE;
3361     }
3362 
3363     TRACE("FileObject %p fcb %p FileSize = %I64x st_size = %I64x (%p)\n", FileObject, fcb, fcb->Header.FileSize.QuadPart, fcb->inode_item.st_size, &fcb->inode_item.st_size);
3364 
3365     if (!(Irp->Flags & IRP_NOCACHE) && IrpSp->MinorFunction & IRP_MN_MDL) {
3366         NTSTATUS Status = STATUS_SUCCESS;
3367 
3368         _SEH2_TRY {
3369             if (!FileObject->PrivateCacheMap) {
3370                 CC_FILE_SIZES ccfs;
3371 
3372                 ccfs.AllocationSize = fcb->Header.AllocationSize;
3373                 ccfs.FileSize = fcb->Header.FileSize;
3374                 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
3375 
3376                 init_file_cache(FileObject, &ccfs);
3377             }
3378 
3379             CcMdlRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, &Irp->MdlAddress, &Irp->IoStatus);
3380         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
3381         Status = _SEH2_GetExceptionCode();
3382     } _SEH2_END;
3383 
3384         if (NT_SUCCESS(Status)) {
3385             Status = Irp->IoStatus.Status;
3386             Irp->IoStatus.Information += addon;
3387             *bytes_read = (ULONG)Irp->IoStatus.Information;
3388         } else
3389             ERR("EXCEPTION - %08lx\n", Status);
3390 
3391         return Status;
3392     }
3393 
3394     data = map_user_buffer(Irp, fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3395 
3396     if (Irp->MdlAddress && !data) {
3397         ERR("MmGetSystemAddressForMdlSafe returned NULL\n");
3398         return STATUS_INSUFFICIENT_RESOURCES;
3399     }
3400 
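         /* Reads beyond ValidDataLength must return zeroes rather than stale disk data:
          * handle a read entirely past VDL here, and trim/zero a partial overrun below. */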
3401     if (start >= (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3402         length = (ULONG)min(length, min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3403         RtlZeroMemory(data, length);
3404         Irp->IoStatus.Information = *bytes_read = length;
3405         return STATUS_SUCCESS;
3406     }
3407 
3408     if (length + start > (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3409         addon = (ULONG)(min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3410         RtlZeroMemory(data + (fcb->Header.ValidDataLength.QuadPart - start), addon);
3411         length = (ULONG)(fcb->Header.ValidDataLength.QuadPart - start);
3412     }
3413 
3414     if (!(Irp->Flags & IRP_NOCACHE)) {
3415         NTSTATUS Status = STATUS_SUCCESS;
3416 
3417         _SEH2_TRY {
3418             if (!FileObject->PrivateCacheMap) {
3419                 CC_FILE_SIZES ccfs;
3420 
3421                 ccfs.AllocationSize = fcb->Header.AllocationSize;
3422                 ccfs.FileSize = fcb->Header.FileSize;
3423                 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
3424 
3425                 init_file_cache(FileObject, &ccfs);
3426             }
3427 
3428             if (fCcCopyReadEx) {
3429                 TRACE("CcCopyReadEx(%p, %I64x, %lx, %u, %p, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart,
3430                         length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread);
3431                 TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3432                 if (!fCcCopyReadEx(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread)) {
3433                     TRACE("CcCopyReadEx could not wait\n");
3434 
3435                     IoMarkIrpPending(Irp);
3436                     return STATUS_PENDING;
3437                 }
3438                 TRACE("CcCopyReadEx finished\n");
3439             } else {
3440                 TRACE("CcCopyRead(%p, %I64x, %lx, %u, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart, length, wait, data, &Irp->IoStatus);
3441                 TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3442                 if (!CcCopyRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus)) {
3443                     TRACE("CcCopyRead could not wait\n");
3444 
3445                     IoMarkIrpPending(Irp);
3446                     return STATUS_PENDING;
3447                 }
3448                 TRACE("CcCopyRead finished\n");
3449             }
3450         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
3451             Status = _SEH2_GetExceptionCode();
3452         } _SEH2_END;
3453 
3454         if (NT_SUCCESS(Status)) {
3455             Status = Irp->IoStatus.Status;
3456             Irp->IoStatus.Information += addon;
3457             *bytes_read = (ULONG)Irp->IoStatus.Information;
3458         } else
3459             ERR("EXCEPTION - %08lx\n", Status);
3460 
3461         return Status;
3462     } else {
3463         NTSTATUS Status;
3464 
3465         if (!wait) {
3466             IoMarkIrpPending(Irp);
3467             return STATUS_PENDING;
3468         }
3469 
3470         if (fcb->ads) {
3471             Status = read_stream(fcb, data, start, length, bytes_read);
3472 
3473             if (!NT_SUCCESS(Status))
3474                 ERR("read_stream returned %08lx\n", Status);
3475         } else {
3476             Status = read_file(fcb, data, start, length, bytes_read, Irp);
3477 
3478             if (!NT_SUCCESS(Status))
3479                 ERR("read_file returned %08lx\n", Status);
3480         }
3481 
3482         *bytes_read += addon;
3483         TRACE("read %lu bytes\n", *bytes_read);
3484 
3485         Irp->IoStatus.Information = *bytes_read;
3486 
3487         if (diskacc && Status != STATUS_PENDING) {
3488             PETHREAD thread = NULL;
3489 
3490             if (Irp->Tail.Overlay.Thread && !IoIsSystemThread(Irp->Tail.Overlay.Thread))
3491                 thread = Irp->Tail.Overlay.Thread;
3492             else if (!IoIsSystemThread(PsGetCurrentThread()))
3493                 thread = PsGetCurrentThread();
3494             else if (IoIsSystemThread(PsGetCurrentThread()) && IoGetTopLevelIrp() == Irp)
3495                 thread = PsGetCurrentThread();
3496 
3497             if (thread)
3498                 fPsUpdateDiskCounters(PsGetThreadProcess(thread), *bytes_read, 0, 1, 0, 0);
3499         }
3500 
3501         return Status;
3502     }
3503 }
3504 
3505 _Dispatch_type_(IRP_MJ_READ)
3506 _Function_class_(DRIVER_DISPATCH)
3507 NTSTATUS __stdcall drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
3508     device_extension* Vcb = DeviceObject->DeviceExtension;
3509     PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3510     PFILE_OBJECT FileObject = IrpSp->FileObject;
3511     ULONG bytes_read = 0;
3512     NTSTATUS Status;
3513     bool top_level;
3514     fcb* fcb;
3515     ccb* ccb;
3516     bool acquired_fcb_lock = false, wait;
3517 
3518     FsRtlEnterFileSystem();
3519 
3520     top_level = is_top_level(Irp);
3521 
3522     TRACE("read\n");
3523 
3524     if (Vcb && Vcb->type == VCB_TYPE_VOLUME) {
3525         Status = vol_read(DeviceObject, Irp);
3526         goto exit2;
3527     } else if (!Vcb || Vcb->type != VCB_TYPE_FS) {
3528         Status = STATUS_INVALID_PARAMETER;
3529         goto end;
3530     }
3531 
3532     Irp->IoStatus.Information = 0;
3533 
3534     if (IrpSp->MinorFunction & IRP_MN_COMPLETE) {
3535         CcMdlReadComplete(IrpSp->FileObject, Irp->MdlAddress);
3536 
3537         Irp->MdlAddress = NULL;
3538         Status = STATUS_SUCCESS;
3539 
3540         goto exit;
3541     }
3542 
3543     fcb = FileObject->FsContext;
3544 
3545     if (!fcb) {
3546         ERR("fcb was NULL\n");
3547         Status = STATUS_INVALID_PARAMETER;
3548         goto exit;
3549     }
3550 
3551     ccb = FileObject->FsContext2;
3552 
3553     if (!ccb) {
3554         ERR("ccb was NULL\n");
3555         Status = STATUS_INVALID_PARAMETER;
3556         goto exit;
3557     }
3558 
3559     if (Irp->RequestorMode == UserMode && !(ccb->access & FILE_READ_DATA)) {
3560         WARN("insufficient privileges\n");
3561         Status = STATUS_ACCESS_DENIED;
3562         goto exit;
3563     }
3564 
3565     if (fcb == Vcb->volume_fcb) {
3566         TRACE("reading volume FCB\n");
3567 
3568         IoSkipCurrentIrpStackLocation(Irp);
3569 
3570         Status = IoCallDriver(Vcb->Vpb->RealDevice, Irp);
3571 
3572         goto exit2;
3573     }
3574 
3575     if (!(Irp->Flags & IRP_PAGING_IO))
3576         FsRtlCheckOplock(fcb_oplock(fcb), Irp, NULL, NULL, NULL);
3577 
3578     wait = IoIsOperationSynchronous(Irp);
3579 
3580     // Don't offload jobs when doing paging IO - otherwise this can lead to
3581     // deadlocks in CcCopyRead.
3582     if (Irp->Flags & IRP_PAGING_IO)
3583         wait = true;
3584 
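         /* If the file has a mapped data section, flush the cache for the requested range
          * first so the read below (particularly the non-cached path) sees data coherent
          * with any dirty cached pages. */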
3585     if (!(Irp->Flags & IRP_PAGING_IO) && FileObject->SectionObjectPointer && FileObject->SectionObjectPointer->DataSectionObject) {
3586         IO_STATUS_BLOCK iosb;
3587 
3588         CcFlushCache(FileObject->SectionObjectPointer, &IrpSp->Parameters.Read.ByteOffset, IrpSp->Parameters.Read.Length, &iosb);
3589         if (!NT_SUCCESS(iosb.Status)) {
3590             ERR("CcFlushCache returned %08lx\n", iosb.Status);
3591             Status = iosb.Status;
                 goto end;
3592         }
3593     }
3594 
3595     if (!ExIsResourceAcquiredSharedLite(fcb->Header.Resource)) {
3596         if (!ExAcquireResourceSharedLite(fcb->Header.Resource, wait)) {
3597             Status = STATUS_PENDING;
3598             IoMarkIrpPending(Irp);
3599             goto exit;
3600         }
3601 
3602         acquired_fcb_lock = true;
3603     }
3604 
3605     Status = do_read(Irp, wait, &bytes_read);
3606 
3607     if (acquired_fcb_lock)
3608         ExReleaseResourceLite(fcb->Header.Resource);
3609 
3610 exit:
3611     if (FileObject->Flags & FO_SYNCHRONOUS_IO && !(Irp->Flags & IRP_PAGING_IO))
3612         FileObject->CurrentByteOffset.QuadPart = IrpSp->Parameters.Read.ByteOffset.QuadPart + (NT_SUCCESS(Status) ? bytes_read : 0);
3613 
3614 end:
3615     Irp->IoStatus.Status = Status;
3616 
3617     TRACE("Irp->IoStatus.Status = %08lx\n", Irp->IoStatus.Status);
3618     TRACE("Irp->IoStatus.Information = %Iu\n", Irp->IoStatus.Information);
3619     TRACE("returning %08lx\n", Status);
3620 
3621     if (Status != STATUS_PENDING)
3622         IoCompleteRequest(Irp, IO_NO_INCREMENT);
3623     else {
3624         if (!add_thread_job(Vcb, Irp))
3625             Status = do_read_job(Irp);
3626     }
3627 
3628 exit2:
3629     if (top_level)
3630         IoSetTopLevelIrp(NULL);
3631 
3632     FsRtlExitFileSystem();
3633 
3634     return Status;
3635 }
3636