xref: /reactos/drivers/filesystems/btrfs/read.c (revision 682f85ad)
1 /* Copyright (c) Mark Harmstone 2016-17
2  *
3  * This file is part of WinBtrfs.
4  *
5  * WinBtrfs is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public Licence as published by
7  * the Free Software Foundation, either version 3 of the Licence, or
8  * (at your option) any later version.
9  *
10  * WinBtrfs is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public Licence for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public Licence
16  * along with WinBtrfs.  If not, see <http://www.gnu.org/licenses/>. */
17 
18 #include "btrfs_drv.h"
19 #include "xxhash.h"
20 #include "crc32c.h"
21 
22 enum read_data_status {
23     ReadDataStatus_Pending,
24     ReadDataStatus_Success,
25     ReadDataStatus_Error,
26     ReadDataStatus_MissingDevice,
27     ReadDataStatus_Skip
28 };
29 
30 struct read_data_context;
31 
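/* State shared between read_data() and its completion routine. One
 * read_data_stripe is allocated per chunk stripe that is read; the common
 * read_data_context counts the stripes still outstanding and carries the
 * expected checksums (or the tree flag) for the range being read. */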
32 typedef struct {
33     struct read_data_context* context;
34     uint16_t stripenum;
35     bool rewrite;
36     PIRP Irp;
37     IO_STATUS_BLOCK iosb;
38     enum read_data_status status;
39     PMDL mdl;
40     uint64_t stripestart;
41     uint64_t stripeend;
42 } read_data_stripe;
43 
44 typedef struct {
45     KEVENT Event;
46     NTSTATUS Status;
47     chunk* c;
48     uint64_t address;
49     uint32_t buflen;
50     LONG num_stripes, stripes_left;
51     uint64_t type;
52     uint32_t sector_size;
53     uint16_t firstoff, startoffstripe, sectors_per_stripe;
54     void* csum;
55     bool tree;
56     read_data_stripe* stripes;
57     uint8_t* va;
58 } read_data_context;
59 
60 extern bool diskacc;
61 extern tPsUpdateDiskCounters fPsUpdateDiskCounters;
62 extern tCcCopyReadEx fCcCopyReadEx;
63 extern tFsRtlUpdateDiskCounters fFsRtlUpdateDiskCounters;
64 
65 #define LZO_PAGE_SIZE 4096
66 
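/* Completion routine for the per-stripe read IRPs. It records each stripe's
 * status and signals the context's event once the last outstanding stripe has
 * finished; returning STATUS_MORE_PROCESSING_REQUIRED stops the I/O manager
 * from freeing the IRP, which the issuing code cleans up itself. */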
67 _Function_class_(IO_COMPLETION_ROUTINE)
68 static NTSTATUS __stdcall read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
69     read_data_stripe* stripe = conptr;
70     read_data_context* context = (read_data_context*)stripe->context;
71 
72     UNUSED(DeviceObject);
73 
74     stripe->iosb = Irp->IoStatus;
75 
76     if (NT_SUCCESS(Irp->IoStatus.Status))
77         stripe->status = ReadDataStatus_Success;
78     else
79         stripe->status = ReadDataStatus_Error;
80 
81     if (InterlockedDecrement(&context->stripes_left) == 0)
82         KeSetEvent(&context->Event, 0, false);
83 
84     return STATUS_MORE_PROCESSING_REQUIRED;
85 }
86 
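/* Verify 'sectors' consecutive sectors of 'data' against the expected
 * checksums in 'csum', using whichever hash the superblock specifies.
 * Returns STATUS_CRC_ERROR if any sector does not match. */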
87 NTSTATUS check_csum(device_extension* Vcb, uint8_t* data, uint32_t sectors, void* csum) {
88     void* csum2;
89 
90     csum2 = ExAllocatePoolWithTag(PagedPool, Vcb->csum_size * sectors, ALLOC_TAG);
91     if (!csum2) {
92         ERR("out of memory\n");
93         return STATUS_INSUFFICIENT_RESOURCES;
94     }
95 
96     do_calc_job(Vcb, data, sectors, csum2);
97 
98     if (RtlCompareMemory(csum2, csum, sectors * Vcb->csum_size) != sectors * Vcb->csum_size) {
99         ExFreePool(csum2);
100         return STATUS_CRC_ERROR;
101     }
102 
103     ExFreePool(csum2);
104 
105     return STATUS_SUCCESS;
106 }
107 
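/* Compute the checksum of a tree block into 'csum'. The hash covers the node
 * from fs_uuid onwards, i.e. everything except the csum field itself. */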
108 void get_tree_checksum(device_extension* Vcb, tree_header* th, void* csum) {
109     switch (Vcb->superblock.csum_type) {
110         case CSUM_TYPE_CRC32C:
111             *(uint32_t*)csum = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
112         break;
113 
114         case CSUM_TYPE_XXHASH:
115             *(uint64_t*)csum = XXH64((uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum), 0);
116         break;
117 
118         case CSUM_TYPE_SHA256:
119             calc_sha256(csum, &th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
120         break;
121 
122         case CSUM_TYPE_BLAKE2:
123             blake2b(csum, BLAKE2_HASH_SIZE, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
124         break;
125     }
126 }
127 
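/* Verify the checksum embedded at the start of a tree block. csum is the
 * first field of tree_header, which is why the SHA256 and Blake2 cases can
 * compare the computed hash against 'th' directly. */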
128 bool check_tree_checksum(device_extension* Vcb, tree_header* th) {
129     switch (Vcb->superblock.csum_type) {
130         case CSUM_TYPE_CRC32C: {
131             uint32_t crc32 = ~calc_crc32c(0xffffffff, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
132 
133             if (crc32 == *((uint32_t*)th->csum))
134                 return true;
135 
136             WARN("hash was %08x, expected %08x\n", crc32, *((uint32_t*)th->csum));
137 
138             break;
139         }
140 
141         case CSUM_TYPE_XXHASH: {
142             uint64_t hash = XXH64((uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum), 0);
143 
144             if (hash == *((uint64_t*)th->csum))
145                 return true;
146 
147             WARN("hash was %I64x, expected %I64x\n", hash, *((uint64_t*)th->csum));
148 
149             break;
150         }
151 
152         case CSUM_TYPE_SHA256: {
153             uint8_t hash[SHA256_HASH_SIZE];
154 
155             calc_sha256(hash, (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
156 
157             if (RtlCompareMemory(hash, th, SHA256_HASH_SIZE) == SHA256_HASH_SIZE)
158                 return true;
159 
160             WARN("hash was invalid\n");
161 
162             break;
163         }
164 
165         case CSUM_TYPE_BLAKE2: {
166             uint8_t hash[BLAKE2_HASH_SIZE];
167 
168             blake2b(hash, sizeof(hash), (uint8_t*)&th->fs_uuid, Vcb->superblock.node_size - sizeof(th->csum));
169 
170             if (RtlCompareMemory(hash, th, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE)
171                 return true;
172 
173             WARN("hash was invalid\n");
174 
175             break;
176         }
177     }
178 
179     return false;
180 }
181 
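/* Compute the checksum of a single data sector into 'csum'. */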
182 void get_sector_csum(device_extension* Vcb, void* buf, void* csum) {
183     switch (Vcb->superblock.csum_type) {
184         case CSUM_TYPE_CRC32C:
185             *(uint32_t*)csum = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);
186         break;
187 
188         case CSUM_TYPE_XXHASH:
189             *(uint64_t*)csum = XXH64(buf, Vcb->superblock.sector_size, 0);
190         break;
191 
192         case CSUM_TYPE_SHA256:
193             calc_sha256(csum, buf, Vcb->superblock.sector_size);
194         break;
195 
196         case CSUM_TYPE_BLAKE2:
197             blake2b(csum, BLAKE2_HASH_SIZE, buf, Vcb->superblock.sector_size);
198         break;
199     }
200 }
201 
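/* Return true if the sector in 'buf' matches the expected checksum 'csum'. */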
202 bool check_sector_csum(device_extension* Vcb, void* buf, void* csum) {
203     switch (Vcb->superblock.csum_type) {
204         case CSUM_TYPE_CRC32C: {
205             uint32_t crc32 = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);
206 
207             return *(uint32_t*)csum == crc32;
208         }
209 
210         case CSUM_TYPE_XXHASH: {
211             uint64_t hash = XXH64(buf, Vcb->superblock.sector_size, 0);
212 
213             return *(uint64_t*)csum == hash;
214         }
215 
216         case CSUM_TYPE_SHA256: {
217             uint8_t hash[SHA256_HASH_SIZE];
218 
219             calc_sha256(hash, buf, Vcb->superblock.sector_size);
220 
221             return RtlCompareMemory(hash, csum, SHA256_HASH_SIZE) == SHA256_HASH_SIZE;
222         }
223 
224         case CSUM_TYPE_BLAKE2: {
225             uint8_t hash[BLAKE2_HASH_SIZE];
226 
227             blake2b(hash, sizeof(hash), buf, Vcb->superblock.sector_size);
228 
229             return RtlCompareMemory(hash, csum, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE;
230         }
231     }
232 
233     return false;
234 }
235 
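/* Validate a read from a chunk with mirrored stripes (DUP/RAID1). The data was
 * read from one stripe; on a checksum, address or generation mismatch the
 * other mirrors are read synchronously one by one, and the first good copy is
 * used to repair the output buffer and, if the volume is writable, to rewrite
 * the bad stripe. */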
236 static NTSTATUS read_data_dup(device_extension* Vcb, uint8_t* buf, uint64_t addr, read_data_context* context, CHUNK_ITEM* ci,
237                               device** devices, uint64_t generation) {
238     ULONG i;
239     bool checksum_error = false;
240     uint16_t j, stripe = 0;
241     NTSTATUS Status;
242     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
243 
244     for (j = 0; j < ci->num_stripes; j++) {
245         if (context->stripes[j].status == ReadDataStatus_Error) {
246             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
247             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
248             return context->stripes[j].iosb.Status;
249         } else if (context->stripes[j].status == ReadDataStatus_Success) {
250             stripe = j;
251             break;
252         }
253     }
254 
255     if (context->stripes[stripe].status != ReadDataStatus_Success)
256         return STATUS_INTERNAL_ERROR;
257 
258     if (context->tree) {
259         tree_header* th = (tree_header*)buf;
260 
261         if (th->address != context->address || !check_tree_checksum(Vcb, th)) {
262             checksum_error = true;
263             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
264         } else if (generation != 0 && th->generation != generation) {
265             checksum_error = true;
266             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
267         }
268     } else if (context->csum) {
269         Status = check_csum(Vcb, buf, (ULONG)context->stripes[stripe].Irp->IoStatus.Information / context->sector_size, context->csum);
270 
271         if (Status == STATUS_CRC_ERROR) {
272             checksum_error = true;
273             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
274         } else if (!NT_SUCCESS(Status)) {
275             ERR("check_csum returned %08lx\n", Status);
276             return Status;
277         }
278     }
279 
280     if (!checksum_error)
281         return STATUS_SUCCESS;
282 
283     if (ci->num_stripes == 1)
284         return STATUS_CRC_ERROR;
285 
286     if (context->tree) {
287         tree_header* t2;
288         bool recovered = false;
289 
290         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
291         if (!t2) {
292             ERR("out of memory\n");
293             return STATUS_INSUFFICIENT_RESOURCES;
294         }
295 
296         for (j = 0; j < ci->num_stripes; j++) {
297             if (j != stripe && devices[j] && devices[j]->devobj) {
298                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + context->stripes[stripe].stripestart,
299                                         Vcb->superblock.node_size, (uint8_t*)t2, false);
300                 if (!NT_SUCCESS(Status)) {
301                     WARN("sync_read_phys returned %08lx\n", Status);
302                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
303                 } else {
304                     bool checksum_error = !check_tree_checksum(Vcb, t2);
305 
306                     if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
307                         RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
308                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
309                         recovered = true;
310 
311                         if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
312                             Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + context->stripes[stripe].stripestart,
313                                                      t2, Vcb->superblock.node_size);
314                             if (!NT_SUCCESS(Status)) {
315                                 WARN("write_data_phys returned %08lx\n", Status);
316                                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
317                             }
318                         }
319 
320                         break;
321                     } else if (t2->address != addr || checksum_error)
322                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
323                     else
324                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_GENERATION_ERRORS);
325                 }
326             }
327         }
328 
329         if (!recovered) {
330             ERR("unrecoverable checksum error at %I64x\n", addr);
331             ExFreePool(t2);
332             return STATUS_CRC_ERROR;
333         }
334 
335         ExFreePool(t2);
336     } else {
337         ULONG sectors = (ULONG)context->stripes[stripe].Irp->IoStatus.Information / Vcb->superblock.sector_size;
338         uint8_t* sector;
339         void* ptr = context->csum;
340 
341         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
342         if (!sector) {
343             ERR("out of memory\n");
344             return STATUS_INSUFFICIENT_RESOURCES;
345         }
346 
347         for (i = 0; i < sectors; i++) {
348             if (!check_sector_csum(Vcb, buf + (i * Vcb->superblock.sector_size), ptr)) {
349                 bool recovered = false;
350 
351                 for (j = 0; j < ci->num_stripes; j++) {
352                     if (j != stripe && devices[j] && devices[j]->devobj) {
353                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj,
354                                                 cis[j].offset + context->stripes[stripe].stripestart + UInt32x32To64(i, Vcb->superblock.sector_size),
355                                                 Vcb->superblock.sector_size, sector, false);
356                         if (!NT_SUCCESS(Status)) {
357                             WARN("sync_read_phys returned %08lx\n", Status);
358                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
359                         } else {
360                             if (check_sector_csum(Vcb, sector, ptr)) {
361                                 RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), sector, Vcb->superblock.sector_size);
362                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size), devices[stripe]->devitem.dev_id);
363                                 recovered = true;
364 
365                                 if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
366                                     Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj,
367                                                              cis[stripe].offset + context->stripes[stripe].stripestart + UInt32x32To64(i, Vcb->superblock.sector_size),
368                                                              sector, Vcb->superblock.sector_size);
369                                     if (!NT_SUCCESS(Status)) {
370                                         WARN("write_data_phys returned %08lx\n", Status);
371                                         log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
372                                     }
373                                 }
374 
375                                 break;
376                             } else
377                                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
378                         }
379                     }
380                 }
381 
382                 if (!recovered) {
383                     ERR("unrecoverable checksum error at %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size));
384                     ExFreePool(sector);
385                     return STATUS_CRC_ERROR;
386                 }
387             }
388 
389             ptr = (uint8_t*)ptr + Vcb->csum_size;
390         }
391 
392         ExFreePool(sector);
393     }
394 
395     return STATUS_SUCCESS;
396 }
397 
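/* Validate a read from a RAID0 chunk. With no redundancy available, a failed
 * checksum is only attributed to the device holding the bad sector and
 * reported back as STATUS_CRC_ERROR. */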
398 static NTSTATUS read_data_raid0(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
399                                 CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
400     uint64_t i;
401 
402     for (i = 0; i < ci->num_stripes; i++) {
403         if (context->stripes[i].status == ReadDataStatus_Error) {
404             WARN("stripe %I64u returned error %08lx\n", i, context->stripes[i].iosb.Status);
405             log_device_error(Vcb, devices[i], BTRFS_DEV_STAT_READ_ERRORS);
406             return context->stripes[i].iosb.Status;
407         }
408     }
409 
410     if (context->tree) { // shouldn't happen, as trees shouldn't cross stripe boundaries
411         tree_header* th = (tree_header*)buf;
412         bool checksum_error = !check_tree_checksum(Vcb, th);
413 
414         if (checksum_error || addr != th->address || (generation != 0 && generation != th->generation)) {
415             uint64_t off;
416             uint16_t stripe;
417 
418             get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &off, &stripe);
419 
420             ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
421 
422             if (checksum_error) {
423                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
424                 return STATUS_CRC_ERROR;
425             } else if (addr != th->address) {
426                 WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
427                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
428                 return STATUS_CRC_ERROR;
429             } else if (generation != 0 && generation != th->generation) {
430                 WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
431                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
432                 return STATUS_CRC_ERROR;
433             }
434         }
435     } else if (context->csum) {
436         NTSTATUS Status;
437 
438         Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
439 
440         if (Status == STATUS_CRC_ERROR) {
441             void* ptr = context->csum;
442 
443             for (i = 0; i < length / Vcb->superblock.sector_size; i++) {
444                 if (!check_sector_csum(Vcb, buf + (i * Vcb->superblock.sector_size), ptr)) {
445                     uint64_t off;
446                     uint16_t stripe;
447 
448                     get_raid0_offset(addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size), ci->stripe_length, ci->num_stripes, &off, &stripe);
449 
450                     ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
451 
452                     log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
453 
454                     return Status;
455                 }
456 
457                 ptr = (uint8_t*)ptr + Vcb->csum_size;
458             }
459 
460             return Status;
461         } else if (!NT_SUCCESS(Status)) {
462             ERR("check_csum returned %08lx\n", Status);
463             return Status;
464         }
465     }
466 
467     return STATUS_SUCCESS;
468 }
469 
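/* Validate a read from a RAID10 chunk. On a checksum failure the offset is
 * mapped back to its stripe group, the mirror substripes that were not used
 * for the original read are tried synchronously, and a verified copy is used
 * to repair the buffer and overwrite the bad substripe where possible. */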
470 static NTSTATUS read_data_raid10(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
471                                  CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
472     uint64_t i;
473     uint16_t j, stripe = 0;
474     NTSTATUS Status;
475     bool checksum_error = false;
476     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
477 
478     for (j = 0; j < ci->num_stripes; j++) {
479         if (context->stripes[j].status == ReadDataStatus_Error) {
480             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
481             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
482             return context->stripes[j].iosb.Status;
483         } else if (context->stripes[j].status == ReadDataStatus_Success)
484             stripe = j;
485     }
486 
487     if (context->tree) {
488         tree_header* th = (tree_header*)buf;
489 
490         if (!check_tree_checksum(Vcb, th)) {
491             checksum_error = true;
492             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
493         } else if (addr != th->address) {
494             WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
495             checksum_error = true;
496             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
497         } else if (generation != 0 && generation != th->generation) {
498             WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
499             checksum_error = true;
500             log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
501         }
502     } else if (context->csum) {
503         Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
504 
505         if (Status == STATUS_CRC_ERROR)
506             checksum_error = true;
507         else if (!NT_SUCCESS(Status)) {
508             ERR("check_csum returned %08lx\n", Status);
509             return Status;
510         }
511     }
512 
513     if (!checksum_error)
514         return STATUS_SUCCESS;
515 
516     if (context->tree) {
517         tree_header* t2;
518         uint64_t off;
519         uint16_t badsubstripe = 0;
520         bool recovered = false;
521 
522         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
523         if (!t2) {
524             ERR("out of memory\n");
525             return STATUS_INSUFFICIENT_RESOURCES;
526         }
527 
528         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &off, &stripe);
529 
530         stripe *= ci->sub_stripes;
531 
532         for (j = 0; j < ci->sub_stripes; j++) {
533             if (context->stripes[stripe + j].status == ReadDataStatus_Success) {
534                 badsubstripe = j;
535                 break;
536             }
537         }
538 
539         for (j = 0; j < ci->sub_stripes; j++) {
540             if (context->stripes[stripe + j].status != ReadDataStatus_Success && devices[stripe + j] && devices[stripe + j]->devobj) {
541                 Status = sync_read_phys(devices[stripe + j]->devobj, devices[stripe + j]->fileobj, cis[stripe + j].offset + off,
542                                         Vcb->superblock.node_size, (uint8_t*)t2, false);
543                 if (!NT_SUCCESS(Status)) {
544                     WARN("sync_read_phys returned %08lx\n", Status);
545                     log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_READ_ERRORS);
546                 } else {
547                     bool checksum_error = !check_tree_checksum(Vcb, t2);
548 
549                     if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
550                         RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
551                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe + j]->devitem.dev_id);
552                         recovered = true;
553 
554                         if (!Vcb->readonly && !devices[stripe + badsubstripe]->readonly && devices[stripe + badsubstripe]->devobj) { // write good data over bad
555                             Status = write_data_phys(devices[stripe + badsubstripe]->devobj, devices[stripe + badsubstripe]->fileobj,
556                                                      cis[stripe + badsubstripe].offset + off, t2, Vcb->superblock.node_size);
557                             if (!NT_SUCCESS(Status)) {
558                                 WARN("write_data_phys returned %08lx\n", Status);
559                                 log_device_error(Vcb, devices[stripe + badsubstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
560                             }
561                         }
562 
563                         break;
564                     } else if (t2->address != addr || checksum_error)
565                         log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
566                     else
567                         log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_GENERATION_ERRORS);
568                 }
569             }
570         }
571 
572         if (!recovered) {
573             ERR("unrecoverable checksum error at %I64x\n", addr);
574             ExFreePool(t2);
575             return STATUS_CRC_ERROR;
576         }
577 
578         ExFreePool(t2);
579     } else {
580         ULONG sectors = length / Vcb->superblock.sector_size;
581         uint8_t* sector;
582         void* ptr = context->csum;
583 
584         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
585         if (!sector) {
586             ERR("out of memory\n");
587             return STATUS_INSUFFICIENT_RESOURCES;
588         }
589 
590         for (i = 0; i < sectors; i++) {
591             if (!check_sector_csum(Vcb, buf + (i * Vcb->superblock.sector_size), ptr)) {
592                 uint64_t off;
593                 uint16_t stripe2, badsubstripe = 0;
594                 bool recovered = false;
595 
596                 get_raid0_offset(addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size), ci->stripe_length,
597                                  ci->num_stripes / ci->sub_stripes, &off, &stripe2);
598 
599                 stripe2 *= ci->sub_stripes;
600 
601                 for (j = 0; j < ci->sub_stripes; j++) {
602                     if (context->stripes[stripe2 + j].status == ReadDataStatus_Success) {
603                         badsubstripe = j;
604                         break;
605                     }
606                 }
607 
608                 log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
609 
610                 for (j = 0; j < ci->sub_stripes; j++) {
611                     if (context->stripes[stripe2 + j].status != ReadDataStatus_Success && devices[stripe2 + j] && devices[stripe2 + j]->devobj) {
612                         Status = sync_read_phys(devices[stripe2 + j]->devobj, devices[stripe2 + j]->fileobj, cis[stripe2 + j].offset + off,
613                                                 Vcb->superblock.sector_size, sector, false);
614                         if (!NT_SUCCESS(Status)) {
615                             WARN("sync_read_phys returned %08lx\n", Status);
616                             log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_READ_ERRORS);
617                         } else {
618                             if (check_sector_csum(Vcb, sector, ptr)) {
619                                 RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), sector, Vcb->superblock.sector_size);
620                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size), devices[stripe2 + j]->devitem.dev_id);
621                                 recovered = true;
622 
623                                 if (!Vcb->readonly && !devices[stripe2 + badsubstripe]->readonly && devices[stripe2 + badsubstripe]->devobj) { // write good data over bad
624                                     Status = write_data_phys(devices[stripe2 + badsubstripe]->devobj, devices[stripe2 + badsubstripe]->fileobj,
625                                                              cis[stripe2 + badsubstripe].offset + off, sector, Vcb->superblock.sector_size);
626                                     if (!NT_SUCCESS(Status)) {
627                                         WARN("write_data_phys returned %08lx\n", Status);
628                                         log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
629                                     }
630                                 }
631 
632                                 break;
633                             } else
634                                 log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
635                         }
636                     }
637                 }
638 
639                 if (!recovered) {
640                     ERR("unrecoverable checksum error at %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size));
641                     ExFreePool(sector);
642                     return STATUS_CRC_ERROR;
643                 }
644             }
645 
646             ptr = (uint8_t*)ptr + Vcb->csum_size;
647         }
648 
649         ExFreePool(sector);
650     }
651 
652     return STATUS_SUCCESS;
653 }
654 
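/* Validate a read from a RAID5 chunk. Sectors still pending in the chunk's
 * partial-stripes list are copied over the buffer first. On a checksum failure
 * (or when the array is degraded) the affected sector is rebuilt by XORing the
 * corresponding sectors of all the other stripes, parity included.
 *
 * The parity stripe rotates by one position per row of (num_stripes - 1)
 * data stripes, so for the row containing (addr - offset):
 *     parity   = (row + num_stripes - 1) % num_stripes
 *     physical = (parity + data_stripe + 1) % num_stripes
 * which is the mapping applied after each get_raid0_offset() call below. */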
655 static NTSTATUS read_data_raid5(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
656                                 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
657     ULONG i;
658     NTSTATUS Status;
659     bool checksum_error = false;
660     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
661     uint16_t j, stripe;
662     bool no_success = true;
663 
664     for (j = 0; j < ci->num_stripes; j++) {
665         if (context->stripes[j].status == ReadDataStatus_Error) {
666             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
667             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
668             return context->stripes[j].iosb.Status;
669         } else if (context->stripes[j].status == ReadDataStatus_Success) {
670             stripe = j;
671             no_success = false;
672         }
673     }
674 
675     if (c) {    // check partial stripes
676         LIST_ENTRY* le;
677         uint64_t ps_length = (ci->num_stripes - 1) * ci->stripe_length;
678 
679         ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
680 
681         le = c->partial_stripes.Flink;
682         while (le != &c->partial_stripes) {
683             partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
684 
685             if (ps->address + ps_length > addr && ps->address < addr + length) {
686                 ULONG runlength, index;
687 
688                 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
689 
690                 while (runlength != 0) {
691 #ifdef __REACTOS__
692                     uint64_t runstart, runend, start, end;
693 #endif
694                     if (index >= ps->bmplen)
695                         break;
696 
697                     if (index + runlength >= ps->bmplen) {
698                         runlength = ps->bmplen - index;
699 
700                         if (runlength == 0)
701                             break;
702                     }
703 
704 #ifndef __REACTOS__
705                     uint64_t runstart = ps->address + (index * Vcb->superblock.sector_size);
706                     uint64_t runend = runstart + (runlength * Vcb->superblock.sector_size);
707                     uint64_t start = max(runstart, addr);
708                     uint64_t end = min(runend, addr + length);
709 #else
710                     runstart = ps->address + (index * Vcb->superblock.sector_size);
711                     runend = runstart + (runlength * Vcb->superblock.sector_size);
712                     start = max(runstart, addr);
713                     end = min(runend, addr + length);
714 #endif
715 
716                     if (end > start)
717                         RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
718 
719                     runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
720                 }
721             } else if (ps->address >= addr + length)
722                 break;
723 
724             le = le->Flink;
725         }
726 
727         ExReleaseResourceLite(&c->partial_stripes_lock);
728     }
729 
730     if (context->tree) {
731         tree_header* th = (tree_header*)buf;
732 
733         if (addr != th->address || !check_tree_checksum(Vcb, th)) {
734             checksum_error = true;
735             if (!no_success && !degraded)
736                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
737         } else if (generation != 0 && generation != th->generation) {
738             checksum_error = true;
739             if (!no_success && !degraded)
740                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
741         }
742     } else if (context->csum) {
743         Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
744 
745         if (Status == STATUS_CRC_ERROR) {
746             if (!degraded)
747                 WARN("checksum error\n");
748             checksum_error = true;
749         } else if (!NT_SUCCESS(Status)) {
750             ERR("check_csum returned %08lx\n", Status);
751             return Status;
752         }
753     } else if (degraded)
754         checksum_error = true;
755 
756     if (!checksum_error)
757         return STATUS_SUCCESS;
758 
759     if (context->tree) {
760         uint16_t parity;
761         uint64_t off;
762         bool recovered = false, first = true, failed = false;
763         uint8_t* t2;
764 
765         t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * 2, ALLOC_TAG);
766         if (!t2) {
767             ERR("out of memory\n");
768             return STATUS_INSUFFICIENT_RESOURCES;
769         }
770 
771         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &off, &stripe);
772 
773         parity = (((addr - offset) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
774 
775         stripe = (parity + stripe + 1) % ci->num_stripes;
776 
777         for (j = 0; j < ci->num_stripes; j++) {
778             if (j != stripe) {
779                 if (devices[j] && devices[j]->devobj) {
780                     if (first) {
781                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2, false);
782                         if (!NT_SUCCESS(Status)) {
783                             ERR("sync_read_phys returned %08lx\n", Status);
784                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
785                             failed = true;
786                             break;
787                         }
788 
789                         first = false;
790                     } else {
791                         Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2 + Vcb->superblock.node_size, false);
792                         if (!NT_SUCCESS(Status)) {
793                             ERR("sync_read_phys returned %08lx\n", Status);
794                             log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
795                             failed = true;
796                             break;
797                         }
798 
799                         do_xor(t2, t2 + Vcb->superblock.node_size, Vcb->superblock.node_size);
800                     }
801                 } else {
802                     failed = true;
803                     break;
804                 }
805             }
806         }
807 
808         if (!failed) {
809             tree_header* t3 = (tree_header*)t2;
810 
811             if (t3->address == addr && check_tree_checksum(Vcb, t3) && (generation == 0 || t3->generation == generation)) {
812                 RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
813 
814                 if (!degraded)
815                     ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
816 
817                 recovered = true;
818 
819                 if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
820                     Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off, t2, Vcb->superblock.node_size);
821                     if (!NT_SUCCESS(Status)) {
822                         WARN("write_data_phys returned %08lx\n", Status);
823                         log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
824                     }
825                 }
826             }
827         }
828 
829         if (!recovered) {
830             ERR("unrecoverable checksum error at %I64x\n", addr);
831             ExFreePool(t2);
832             return STATUS_CRC_ERROR;
833         }
834 
835         ExFreePool(t2);
836     } else {
837         ULONG sectors = length / Vcb->superblock.sector_size;
838         uint8_t* sector;
839         void* ptr = context->csum;
840 
841         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size * 2, ALLOC_TAG);
842         if (!sector) {
843             ERR("out of memory\n");
844             return STATUS_INSUFFICIENT_RESOURCES;
845         }
846 
847         for (i = 0; i < sectors; i++) {
848             uint16_t parity;
849             uint64_t off;
850 
851             get_raid0_offset(addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size), ci->stripe_length,
852                              ci->num_stripes - 1, &off, &stripe);
853 
854             parity = (((addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size)) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
855 
856             stripe = (parity + stripe + 1) % ci->num_stripes;
857 
858             if (!devices[stripe] || !devices[stripe]->devobj || (ptr && !check_sector_csum(Vcb, buf + (i * Vcb->superblock.sector_size), ptr))) {
859                 bool recovered = false, first = true, failed = false;
860 
861                 if (devices[stripe] && devices[stripe]->devobj)
862                     log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_READ_ERRORS);
863 
864                 for (j = 0; j < ci->num_stripes; j++) {
865                     if (j != stripe) {
866                         if (devices[j] && devices[j]->devobj) {
867                             if (first) {
868                                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size, sector, false);
869                                 if (!NT_SUCCESS(Status)) {
870                                     ERR("sync_read_phys returned %08lx\n", Status);
871                                     failed = true;
872                                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
873                                     break;
874                                 }
875 
876                                 first = false;
877                             } else {
878                                 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
879                                                         sector + Vcb->superblock.sector_size, false);
880                                 if (!NT_SUCCESS(Status)) {
881                                     ERR("sync_read_phys returned %08lx\n", Status);
882                                     failed = true;
883                                     log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
884                                     break;
885                                 }
886 
887                                 do_xor(sector, sector + Vcb->superblock.sector_size, Vcb->superblock.sector_size);
888                             }
889                         } else {
890                             failed = true;
891                             break;
892                         }
893                     }
894                 }
895 
896                 if (!failed) {
897                     if (!ptr || check_sector_csum(Vcb, sector, ptr)) {
898                         RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), sector, Vcb->superblock.sector_size);
899 
900                         if (!degraded)
901                             ERR("recovering from checksum error at %I64x, device %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size), devices[stripe]->devitem.dev_id);
902 
903                         recovered = true;
904 
905                         if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
906                             Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off,
907                                                      sector, Vcb->superblock.sector_size);
908                             if (!NT_SUCCESS(Status)) {
909                                 WARN("write_data_phys returned %08lx\n", Status);
910                                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
911                             }
912                         }
913                     }
914                 }
915 
916                 if (!recovered) {
917                     ERR("unrecoverable checksum error at %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size));
918                     ExFreePool(sector);
919                     return STATUS_CRC_ERROR;
920                 }
921             }
922 
923             if (ptr)
924                 ptr = (uint8_t*)ptr + Vcb->csum_size;
925         }
926 
927         ExFreePool(sector);
928     }
929 
930     return STATUS_SUCCESS;
931 }
932 
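/* Rebuild up to two missing stripes of a RAID6 row. 'sectors' holds the row's
 * stripes contiguously, with P at index num_stripes - 2 and Q at index
 * num_stripes - 1; the reconstructed data is written to 'out', which must have
 * room for two sectors. If one of the missing stripes is P, the missing data
 * stripe is recovered from Q and the surviving data alone. Otherwise two data
 * stripes x and y are solved from P and Q using the usual GF(2^8) identities
 * (addition is XOR, g is the field generator):
 *     Dx = A*(P ^ Pxy) ^ B*(Q ^ Qxy),  Dy = (P ^ Pxy) ^ Dx
 * with A = g^(y-x) / (g^(y-x) ^ 1) and B = g^(255-x) / (g^(y-x) ^ 1),
 * where Pxy and Qxy are P and Q recomputed over the surviving data stripes. */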
933 void raid6_recover2(uint8_t* sectors, uint16_t num_stripes, ULONG sector_size, uint16_t missing1, uint16_t missing2, uint8_t* out) {
934     if (missing1 == num_stripes - 2 || missing2 == num_stripes - 2) { // reconstruct from q and data
935         uint16_t missing = missing1 == (num_stripes - 2) ? missing2 : missing1;
936         uint16_t stripe;
937 
938         stripe = num_stripes - 3;
939 
940         if (stripe == missing)
941             RtlZeroMemory(out, sector_size);
942         else
943             RtlCopyMemory(out, sectors + (stripe * sector_size), sector_size);
944 
945         do {
946             stripe--;
947 
948             galois_double(out, sector_size);
949 
950             if (stripe != missing)
951                 do_xor(out, sectors + (stripe * sector_size), sector_size);
952         } while (stripe > 0);
953 
954         do_xor(out, sectors + ((num_stripes - 1) * sector_size), sector_size);
955 
956         if (missing != 0)
957             galois_divpower(out, (uint8_t)missing, sector_size);
958     } else { // reconstruct from p and q
959         uint16_t x, y, stripe;
960         uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
961         uint32_t j;
962 
963         stripe = num_stripes - 3;
964 
965         pxy = out + sector_size;
966         qxy = out;
967 
968         if (stripe == missing1 || stripe == missing2) {
969             RtlZeroMemory(qxy, sector_size);
970             RtlZeroMemory(pxy, sector_size);
971 
972             if (stripe == missing1)
973                 x = stripe;
974             else
975                 y = stripe;
976         } else {
977             RtlCopyMemory(qxy, sectors + (stripe * sector_size), sector_size);
978             RtlCopyMemory(pxy, sectors + (stripe * sector_size), sector_size);
979         }
980 
981         do {
982             stripe--;
983 
984             galois_double(qxy, sector_size);
985 
986             if (stripe != missing1 && stripe != missing2) {
987                 do_xor(qxy, sectors + (stripe * sector_size), sector_size);
988                 do_xor(pxy, sectors + (stripe * sector_size), sector_size);
989             } else if (stripe == missing1)
990                 x = stripe;
991             else if (stripe == missing2)
992                 y = stripe;
993         } while (stripe > 0);
994 
995         gyx = gpow2(y > x ? (y-x) : (255-x+y));
996         gx = gpow2(255-x);
997 
998         denom = gdiv(1, gyx ^ 1);
999         a = gmul(gyx, denom);
1000         b = gmul(gx, denom);
1001 
1002         p = sectors + ((num_stripes - 2) * sector_size);
1003         q = sectors + ((num_stripes - 1) * sector_size);
1004 
1005         for (j = 0; j < sector_size; j++) {
1006             *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);
1007 
1008             p++;
1009             q++;
1010             pxy++;
1011             qxy++;
1012         }
1013 
1014         do_xor(out + sector_size, out, sector_size);
1015         do_xor(out + sector_size, sectors + ((num_stripes - 2) * sector_size), sector_size);
1016     }
1017 }
1018 
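/* Validate a read from a RAID6 chunk. As with RAID5, pending partial stripes
 * are merged into the buffer first. A bad or unreadable sector is first
 * rebuilt from P and the remaining data stripes; if the result still fails to
 * verify, Q is read as well and raid6_recover2() is used to solve for the bad
 * stripe (and, if needed, a second failed stripe), after which the recovered
 * data is written back over the bad copies where the devices allow it. */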
1019 static NTSTATUS read_data_raid6(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
1020                                 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
1021     NTSTATUS Status;
1022     ULONG i;
1023     bool checksum_error = false;
1024     CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1025     uint16_t stripe, j;
1026     bool no_success = true;
1027 
1028     for (j = 0; j < ci->num_stripes; j++) {
1029         if (context->stripes[j].status == ReadDataStatus_Error) {
1030             WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
1031 
1032             if (devices[j])
1033                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1034             return context->stripes[j].iosb.Status;
1035         } else if (context->stripes[j].status == ReadDataStatus_Success) {
1036             stripe = j;
1037             no_success = false;
1038         }
1039     }
1040 
1041     if (c) {    // check partial stripes
1042         LIST_ENTRY* le;
1043         uint64_t ps_length = (ci->num_stripes - 2) * ci->stripe_length;
1044 
1045         ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
1046 
1047         le = c->partial_stripes.Flink;
1048         while (le != &c->partial_stripes) {
1049             partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
1050 
1051             if (ps->address + ps_length > addr && ps->address < addr + length) {
1052                 ULONG runlength, index;
1053 
1054                 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
1055 
1056                 while (runlength != 0) {
1057 #ifdef __REACTOS__
1058                     uint64_t runstart, runend, start, end;
1059 #endif
1060                     if (index >= ps->bmplen)
1061                         break;
1062 
1063                     if (index + runlength >= ps->bmplen) {
1064                         runlength = ps->bmplen - index;
1065 
1066                         if (runlength == 0)
1067                             break;
1068                     }
1069 
1070 #ifndef __REACTOS__
1071                     uint64_t runstart = ps->address + (index * Vcb->superblock.sector_size);
1072                     uint64_t runend = runstart + (runlength * Vcb->superblock.sector_size);
1073                     uint64_t start = max(runstart, addr);
1074                     uint64_t end = min(runend, addr + length);
1075 #else
1076                     runstart = ps->address + (index * Vcb->superblock.sector_size);
1077                     runend = runstart + (runlength * Vcb->superblock.sector_size);
1078                     start = max(runstart, addr);
1079                     end = min(runend, addr + length);
1080 #endif
1081 
1082                     if (end > start)
1083                         RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
1084 
1085                     runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
1086                 }
1087             } else if (ps->address >= addr + length)
1088                 break;
1089 
1090             le = le->Flink;
1091         }
1092 
1093         ExReleaseResourceLite(&c->partial_stripes_lock);
1094     }
1095 
1096     if (context->tree) {
1097         tree_header* th = (tree_header*)buf;
1098 
1099         if (addr != th->address || !check_tree_checksum(Vcb, th)) {
1100             checksum_error = true;
1101             if (!no_success && !degraded && devices[stripe])
1102                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1103         } else if (generation != 0 && generation != th->generation) {
1104             checksum_error = true;
1105             if (!no_success && !degraded && devices[stripe])
1106                 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
1107         }
1108     } else if (context->csum) {
1109         Status = check_csum(Vcb, buf, length / Vcb->superblock.sector_size, context->csum);
1110 
1111         if (Status == STATUS_CRC_ERROR) {
1112             if (!degraded)
1113                 WARN("checksum error\n");
1114             checksum_error = true;
1115         } else if (!NT_SUCCESS(Status)) {
1116             ERR("check_csum returned %08lx\n", Status);
1117             return Status;
1118         }
1119     } else if (degraded)
1120         checksum_error = true;
1121 
1122     if (!checksum_error)
1123         return STATUS_SUCCESS;
1124 
1125     if (context->tree) {
1126         uint8_t* sector;
1127         uint16_t k, physstripe, parity1, parity2, error_stripe;
1128         uint64_t off;
1129         bool recovered = false, failed = false;
1130         ULONG num_errors = 0;
1131 
1132         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * (ci->num_stripes + 2), ALLOC_TAG);
1133         if (!sector) {
1134             ERR("out of memory\n");
1135             return STATUS_INSUFFICIENT_RESOURCES;
1136         }
1137 
1138         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &off, &stripe);
1139 
1140         parity1 = (((addr - offset) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
1141         parity2 = (parity1 + 1) % ci->num_stripes;
1142 
1143         physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1144 
1145         j = (parity2 + 1) % ci->num_stripes;
1146 
1147         for (k = 0; k < ci->num_stripes - 1; k++) {
1148             if (j != physstripe) {
1149                 if (devices[j] && devices[j]->devobj) {
1150                     Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size,
1151                                             sector + (k * Vcb->superblock.node_size), false);
1152                     if (!NT_SUCCESS(Status)) {
1153                         ERR("sync_read_phys returned %08lx\n", Status);
1154                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1155                         num_errors++;
1156                         error_stripe = k;
1157 
1158                         if (num_errors > 1) {
1159                             failed = true;
1160                             break;
1161                         }
1162                     }
1163                 } else {
1164                     num_errors++;
1165                     error_stripe = k;
1166 
1167                     if (num_errors > 1) {
1168                         failed = true;
1169                         break;
1170                     }
1171                 }
1172             }
1173 
1174             j = (j + 1) % ci->num_stripes;
1175         }
1176 
1177         if (!failed) {
1178             if (num_errors == 0) {
1179                 tree_header* th = (tree_header*)(sector + (stripe * Vcb->superblock.node_size));
1180 
1181                 RtlCopyMemory(sector + (stripe * Vcb->superblock.node_size), sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size),
1182                               Vcb->superblock.node_size);
1183 
1184                 for (j = 0; j < ci->num_stripes - 2; j++) {
1185                     if (j != stripe)
1186                         do_xor(sector + (stripe * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size), Vcb->superblock.node_size);
1187                 }
1188 
1189                 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1190                     RtlCopyMemory(buf, sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1191 
1192                     if (devices[physstripe] && devices[physstripe]->devobj)
1193                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1194 
1195                     recovered = true;
1196 
1197                     if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1198                         Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1199                                                  sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1200                         if (!NT_SUCCESS(Status)) {
1201                             WARN("write_data_phys returned %08lx\n", Status);
1202                             log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1203                         }
1204                     }
1205                 }
1206             }
1207 
1208             if (!recovered) {
1209                 tree_header* th = (tree_header*)(sector + (ci->num_stripes * Vcb->superblock.node_size));
1210                 bool read_q = false;
1211 
1212                 if (devices[parity2] && devices[parity2]->devobj) {
1213                     Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1214                                             Vcb->superblock.node_size, sector + ((ci->num_stripes - 1) * Vcb->superblock.node_size), false);
1215                     if (!NT_SUCCESS(Status)) {
1216                         ERR("sync_read_phys returned %08lx\n", Status);
1217                         log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1218                     } else
1219                         read_q = true;
1220                 }
1221 
1222                 if (read_q) {
1223                     if (num_errors == 1) {
1224                         raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, error_stripe, sector + (ci->num_stripes * Vcb->superblock.node_size));
1225 
1226                         if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation))
1227                             recovered = true;
1228                     } else {
1229                         for (j = 0; j < ci->num_stripes - 1; j++) {
1230                             if (j != stripe) {
1231                                 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, j, sector + (ci->num_stripes * Vcb->superblock.node_size));
1232 
1233                                 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1234                                     recovered = true;
1235                                     error_stripe = j;
1236                                     break;
1237                                 }
1238                             }
1239                         }
1240                     }
1241                 }
1242 
1243                 if (recovered) {
1244                     uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1245 
1246                     if (devices[physstripe] && devices[physstripe]->devobj)
1247                         ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1248 
1249                     RtlCopyMemory(buf, sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1250 
1251                     if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1252                         Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1253                                                  sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1254                         if (!NT_SUCCESS(Status)) {
1255                             WARN("write_data_phys returned %08lx\n", Status);
1256                             log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1257                         }
1258                     }
1259 
1260                     if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1261                         if (error_stripe == ci->num_stripes - 2) {
1262                             ERR("recovering from parity error at %I64x, device %I64x\n", addr, devices[error_stripe_phys]->devitem.dev_id);
1263 
1264                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1265 
1266                             RtlZeroMemory(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1267 
1268                             for (j = 0; j < ci->num_stripes - 2; j++) {
1269                                 if (j == stripe) {
1270                                     do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (ci->num_stripes * Vcb->superblock.node_size),
1271                                            Vcb->superblock.node_size);
1272                                 } else {
1273                                     do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size),
1274                                             Vcb->superblock.node_size);
1275                                 }
1276                             }
1277                         } else {
1278                             ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((error_stripe - stripe) * ci->stripe_length),
1279                                 devices[error_stripe_phys]->devitem.dev_id);
1280 
1281                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1282 
1283                             RtlCopyMemory(sector + (error_stripe * Vcb->superblock.node_size),
1284                                           sector + ((ci->num_stripes + 1) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1285                         }
1286                     }
1287 
1288                     if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1289                         Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1290                                                  sector + (error_stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1291                         if (!NT_SUCCESS(Status)) {
1292                             WARN("write_data_phys returned %08lx\n", Status);
1293                             log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1294                         }
1295                     }
1296                 }
1297             }
1298         }
1299 
1300         if (!recovered) {
1301             ERR("unrecoverable checksum error at %I64x\n", addr);
1302             ExFreePool(sector);
1303             return STATUS_CRC_ERROR;
1304         }
1305 
1306         ExFreePool(sector);
1307     } else {
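        // Non-tree (data) reads: the extent is checked one sector at a time against the
        // caller-supplied checksums (if any), and each failing sector is recovered individually.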
1308         ULONG sectors = length / Vcb->superblock.sector_size;
1309         uint8_t* sector;
1310         void* ptr = context->csum;
1311 
1312         sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size * (ci->num_stripes + 2), ALLOC_TAG);
1313         if (!sector) {
1314             ERR("out of memory\n");
1315             return STATUS_INSUFFICIENT_RESOURCES;
1316         }
1317 
1318         for (i = 0; i < sectors; i++) {
1319             uint64_t off;
1320             uint16_t physstripe, parity1, parity2;
1321 
1322             get_raid0_offset(addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size), ci->stripe_length,
1323                              ci->num_stripes - 2, &off, &stripe);
1324 
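            // Work out where the parities live for this row. btrfs RAID6 rotates the parity pair:
            // after every (num_stripes - 2) data stripes it moves on by one device. parity1 is P,
            // parity2 (the next device along) is Q, and physstripe is the physical position of the
            // logical data stripe this sector lives on. E.g. (illustrative only) with 5 devices,
            // row 0 gives parity1 = 3, parity2 = 4, so logical data stripes 0..2 sit on devices 0..2.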
1325             parity1 = (((addr - offset + UInt32x32To64(i, Vcb->superblock.sector_size)) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
1326             parity2 = (parity1 + 1) % ci->num_stripes;
1327 
1328             physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1329 
1330             if (!devices[physstripe] || !devices[physstripe]->devobj || (context->csum && !check_sector_csum(Vcb, buf + (i * Vcb->superblock.sector_size), ptr))) {
1331                 uint16_t k, error_stripe;
1332                 bool recovered = false, failed = false;
1333                 ULONG num_errors = 0;
1334 
1335                 if (devices[physstripe] && devices[physstripe]->devobj)
1336                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_READ_ERRORS);
1337 
1338                 j = (parity2 + 1) % ci->num_stripes;
1339 
1340                 for (k = 0; k < ci->num_stripes - 1; k++) {
1341                     if (j != physstripe) {
1342                         if (devices[j] && devices[j]->devobj) {
1343                             Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
1344                                                     sector + (k * Vcb->superblock.sector_size), false);
1345                             if (!NT_SUCCESS(Status)) {
1346                                 ERR("sync_read_phys returned %08lx\n", Status);
1347                                 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1348                                 num_errors++;
1349                                 error_stripe = k;
1350 
1351                                 if (num_errors > 1) {
1352                                     failed = true;
1353                                     break;
1354                                 }
1355                             }
1356                         } else {
1357                             num_errors++;
1358                             error_stripe = k;
1359 
1360                             if (num_errors > 1) {
1361                                 failed = true;
1362                                 break;
1363                             }
1364                         }
1365                     }
1366 
1367                     j = (j + 1) % ci->num_stripes;
1368                 }
1369 
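                // Recovery is only attempted if at most one further stripe was unreadable.
                // With no additional failures, rebuild the bad sector by XORing the P stripe with
                // every other data stripe; if the result passes its checksum (or there is no
                // checksum to check), copy it out and rewrite the bad copy on disk.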
1370                 if (!failed) {
1371                     if (num_errors == 0) {
1372                         RtlCopyMemory(sector + (stripe * Vcb->superblock.sector_size), sector + ((ci->num_stripes - 2) * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1373 
1374                         for (j = 0; j < ci->num_stripes - 2; j++) {
1375                             if (j != stripe)
1376                                 do_xor(sector + (stripe * Vcb->superblock.sector_size), sector + (j * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1377                         }
1378 
1379                         if (!ptr || check_sector_csum(Vcb, sector + (stripe * Vcb->superblock.sector_size), ptr)) {
1380                             RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), sector + (stripe * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1381 
1382                             if (devices[physstripe] && devices[physstripe]->devobj)
1383                                 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size),
1384                                     devices[physstripe]->devitem.dev_id);
1385 
1386                             recovered = true;
1387 
1388                             if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1389                                 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1390                                                          sector + (stripe * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1391                                 if (!NT_SUCCESS(Status)) {
1392                                     WARN("write_data_phys returned %08lx\n", Status);
1393                                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1394                                 }
1395                             }
1396                         }
1397                     }
1398 
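                    // XOR recovery wasn't possible or didn't verify: fall back to the Q parity,
                    // as in the metadata path above, except candidates are validated against the
                    // sector checksum (or accepted as-is if the target device is missing entirely).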
1399                     if (!recovered) {
1400                         bool read_q = false;
1401 
1402                         if (devices[parity2] && devices[parity2]->devobj) {
1403                             Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1404                                                     Vcb->superblock.sector_size, sector + ((ci->num_stripes - 1) * Vcb->superblock.sector_size), false);
1405                             if (!NT_SUCCESS(Status)) {
1406                                 ERR("sync_read_phys returned %08lx\n", Status);
1407                                 log_device_error(Vcb, devices[parity2], BTRFS_DEV_STAT_READ_ERRORS);
1408                             } else
1409                                 read_q = true;
1410                         }
1411 
1412                         if (read_q) {
1413                             if (num_errors == 1) {
1414                                 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, error_stripe, sector + (ci->num_stripes * Vcb->superblock.sector_size));
1415 
1416                                 if (!devices[physstripe] || !devices[physstripe]->devobj)
1417                                     recovered = true;
1418                                 else
1419                                     recovered = check_sector_csum(Vcb, sector + (ci->num_stripes * Vcb->superblock.sector_size), ptr);
1420                             } else {
1421                                 for (j = 0; j < ci->num_stripes - 1; j++) {
1422                                     if (j != stripe) {
1423                                         raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, j, sector + (ci->num_stripes * Vcb->superblock.sector_size));
1424 
1425                                         if (check_sector_csum(Vcb, sector + (ci->num_stripes * Vcb->superblock.sector_size), ptr)) {
1426                                             recovered = true;
1427                                             error_stripe = j;
1428                                             break;
1429                                         }
1430                                     }
1431                                 }
1432                             }
1433                         }
1434 
1435                         if (recovered) {
1436                             uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1437 
1438                             if (devices[physstripe] && devices[physstripe]->devobj)
1439                                 ERR("recovering from checksum error at %I64x, device %I64x\n",
1440                                     addr + UInt32x32To64(i, Vcb->superblock.sector_size), devices[physstripe]->devitem.dev_id);
1441 
1442                             RtlCopyMemory(buf + (i * Vcb->superblock.sector_size), sector + (ci->num_stripes * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1443 
1444                             if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1445                                 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1446                                                          sector + (ci->num_stripes * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1447                                 if (!NT_SUCCESS(Status)) {
1448                                     WARN("write_data_phys returned %08lx\n", Status);
1449                                     log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1450                                 }
1451                             }
1452 
1453                             if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1454                                 if (error_stripe == ci->num_stripes - 2) {
1455                                     ERR("recovering from parity error at %I64x, device %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size),
1456                                         devices[error_stripe_phys]->devitem.dev_id);
1457 
1458                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1459 
1460                                     RtlZeroMemory(sector + ((ci->num_stripes - 2) * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1461 
1462                                     for (j = 0; j < ci->num_stripes - 2; j++) {
1463                                         if (j == stripe) {
1464                                             do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.sector_size), sector + (ci->num_stripes * Vcb->superblock.sector_size),
1465                                                    Vcb->superblock.sector_size);
1466                                         } else {
1467                                             do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.sector_size), sector + (j * Vcb->superblock.sector_size),
1468                                                    Vcb->superblock.sector_size);
1469                                         }
1470                                     }
1471                                 } else {
1472                                     ERR("recovering from checksum error at %I64x, device %I64x\n",
1473                                         addr + UInt32x32To64(i, Vcb->superblock.sector_size) + ((error_stripe - stripe) * ci->stripe_length),
1474                                         devices[error_stripe_phys]->devitem.dev_id);
1475 
1476                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1477 
1478                                     RtlCopyMemory(sector + (error_stripe * Vcb->superblock.sector_size),
1479                                                   sector + ((ci->num_stripes + 1) * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1480                                 }
1481                             }
1482 
1483                             if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1484                                 Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1485                                                          sector + (error_stripe * Vcb->superblock.sector_size), Vcb->superblock.sector_size);
1486                                 if (!NT_SUCCESS(Status)) {
1487                                     WARN("write_data_phys returned %08lx\n", Status);
1488                                     log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1489                                 }
1490                             }
1491                         }
1492                     }
1493                 }
1494 
1495                 if (!recovered) {
1496                     ERR("unrecoverable checksum error at %I64x\n", addr + UInt32x32To64(i, Vcb->superblock.sector_size));
1497                     ExFreePool(sector);
1498                     return STATUS_CRC_ERROR;
1499                 }
1500             }
1501 
1502             if (ptr)
1503                 ptr = (uint8_t*)ptr + Vcb->csum_size;
1504         }
1505 
1506         ExFreePool(sector);
1507     }
1508 
1509     return STATUS_SUCCESS;
1510 }
1511 
1512 NTSTATUS read_data(_In_ device_extension* Vcb, _In_ uint64_t addr, _In_ uint32_t length, _In_reads_bytes_opt_(length*Vcb->csum_size/Vcb->superblock.sector_size) void* csum,
1513                    _In_ bool is_tree, _Out_writes_bytes_(length) uint8_t* buf, _In_opt_ chunk* c, _Out_opt_ chunk** pc, _In_opt_ PIRP Irp, _In_ uint64_t generation, _In_ bool file_read,
1514                    _In_ ULONG priority) {
1515     CHUNK_ITEM* ci;
1516     CHUNK_ITEM_STRIPE* cis;
1517     read_data_context context;
1518     uint64_t type, offset, total_reading = 0;
1519     NTSTATUS Status;
1520     device** devices = NULL;
1521     uint16_t i, startoffstripe, allowed_missing, missing_devices = 0;
1522     uint8_t* dummypage = NULL;
1523     PMDL dummy_mdl = NULL;
1524     bool need_to_wait;
1525     uint64_t lockaddr, locklen;
1526 
1527     if (Vcb->log_to_phys_loaded) {
1528         if (!c) {
1529             c = get_chunk_from_address(Vcb, addr);
1530 
1531             if (!c) {
1532                 ERR("get_chunk_from_address failed\n");
1533                 return STATUS_INTERNAL_ERROR;
1534             }
1535         }
1536 
1537         ci = c->chunk_item;
1538         offset = c->offset;
1539         devices = c->devices;
1540 
1541         if (pc)
1542             *pc = c;
1543     } else {
1544         LIST_ENTRY* le = Vcb->sys_chunks.Flink;
1545 
1546         ci = NULL;
1547 
1548         c = NULL;
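        // The chunk tree hasn't been loaded yet (early mount), so resolve the logical address via
        // the bootstrapped system chunk list; btrfs keeps a copy of the system chunks in the
        // superblock's sys_chunk_array for exactly this purpose.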
1549         while (le != &Vcb->sys_chunks) {
1550             sys_chunk* sc = CONTAINING_RECORD(le, sys_chunk, list_entry);
1551 
1552             if (sc->key.obj_id == 0x100 && sc->key.obj_type == TYPE_CHUNK_ITEM && sc->key.offset <= addr) {
1553                 CHUNK_ITEM* chunk_item = sc->data;
1554 
1555                 if ((addr - sc->key.offset) < chunk_item->size && chunk_item->num_stripes > 0) {
1556                     ci = chunk_item;
1557                     offset = sc->key.offset;
1558                     cis = (CHUNK_ITEM_STRIPE*)&chunk_item[1];
1559 
1560                     devices = ExAllocatePoolWithTag(NonPagedPool, sizeof(device*) * ci->num_stripes, ALLOC_TAG);
1561                     if (!devices) {
1562                         ERR("out of memory\n");
1563                         return STATUS_INSUFFICIENT_RESOURCES;
1564                     }
1565 
1566                     for (i = 0; i < ci->num_stripes; i++) {
1567                         devices[i] = find_device_from_uuid(Vcb, &cis[i].dev_uuid);
1568                     }
1569 
1570                     break;
1571                 }
1572             }
1573 
1574             le = le->Flink;
1575         }
1576 
1577         if (!ci) {
1578             ERR("could not find chunk for %I64x in bootstrap\n", addr);
1579             return STATUS_INTERNAL_ERROR;
1580         }
1581 
1582         if (pc)
1583             *pc = NULL;
1584     }
1585 
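    // Collapse the chunk's profile into one of the read strategies below. Everything that is just
    // whole copies of the data (SINGLE, DUP, RAID1, RAID1C3, RAID1C4) uses the DUPLICATE path;
    // allowed_missing is how many of the chunk's devices may be absent with the data still readable.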
1586     if (ci->type & BLOCK_FLAG_DUPLICATE) {
1587         type = BLOCK_FLAG_DUPLICATE;
1588         allowed_missing = ci->num_stripes - 1;
1589     } else if (ci->type & BLOCK_FLAG_RAID0) {
1590         type = BLOCK_FLAG_RAID0;
1591         allowed_missing = 0;
1592     } else if (ci->type & BLOCK_FLAG_RAID1) {
1593         type = BLOCK_FLAG_DUPLICATE;
1594         allowed_missing = 1;
1595     } else if (ci->type & BLOCK_FLAG_RAID10) {
1596         type = BLOCK_FLAG_RAID10;
1597         allowed_missing = 1;
1598     } else if (ci->type & BLOCK_FLAG_RAID5) {
1599         type = BLOCK_FLAG_RAID5;
1600         allowed_missing = 1;
1601     } else if (ci->type & BLOCK_FLAG_RAID6) {
1602         type = BLOCK_FLAG_RAID6;
1603         allowed_missing = 2;
1604     } else if (ci->type & BLOCK_FLAG_RAID1C3) {
1605         type = BLOCK_FLAG_DUPLICATE;
1606         allowed_missing = 2;
1607     } else if (ci->type & BLOCK_FLAG_RAID1C4) {
1608         type = BLOCK_FLAG_DUPLICATE;
1609         allowed_missing = 3;
1610     } else { // SINGLE
1611         type = BLOCK_FLAG_DUPLICATE;
1612         allowed_missing = 0;
1613     }
1614 
1615     cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1616 
1617     RtlZeroMemory(&context, sizeof(read_data_context));
1618     KeInitializeEvent(&context.Event, NotificationEvent, false);
1619 
1620     context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe) * ci->num_stripes, ALLOC_TAG);
1621     if (!context.stripes) {
1622         ERR("out of memory\n");
1623         return STATUS_INSUFFICIENT_RESOURCES;
1624     }
1625 
1626     if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6)) {
1627         get_raid56_lock_range(c, addr, length, &lockaddr, &locklen);
1628         chunk_lock_range(Vcb, c, lockaddr, locklen);
1629     }
1630 
1631     RtlZeroMemory(context.stripes, sizeof(read_data_stripe) * ci->num_stripes);
1632 
1633     context.buflen = length;
1634     context.num_stripes = ci->num_stripes;
1635     context.stripes_left = context.num_stripes;
1636     context.sector_size = Vcb->superblock.sector_size;
1637     context.csum = csum;
1638     context.tree = is_tree;
1639     context.type = type;
1640 
1641     if (type == BLOCK_FLAG_RAID0) {
1642         uint64_t startoff, endoff;
1643         uint16_t endoffstripe, stripe;
1644         uint32_t *stripeoff, pos;
1645         PMDL master_mdl;
1646         PFN_NUMBER* pfns;
1647 
1648         // FIXME - test this still works if page size isn't the same as sector size
1649 
1650         // This relies on the fact that MDLs are followed in memory by their page frame numbers (PFNs),
1651         // so with a bit of jiggery-pokery you can trick your disks into deinterlacing your RAID0
1652         // data for you without doing a memcpy yourself.
1653         // MDLs are officially opaque, so this might very well break in future versions of Windows.
1654 
1655         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &startoff, &startoffstripe);
1656         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes, &endoff, &endoffstripe);
1657 
1658         if (file_read) {
1659             // Unfortunately we can't avoid doing at least one memcpy, as Windows can give us an MDL
1660             // with duplicated dummy PFNs, which confuse check_csum. Ah well.
1661             // See https://msdn.microsoft.com/en-us/library/windows/hardware/Dn614012.aspx if you're interested.
1662 
1663             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1664 
1665             if (!context.va) {
1666                 ERR("out of memory\n");
1667                 Status = STATUS_INSUFFICIENT_RESOURCES;
1668                 goto exit;
1669             }
1670         } else
1671             context.va = buf;
1672 
1673         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1674         if (!master_mdl) {
1675             ERR("out of memory\n");
1676             Status = STATUS_INSUFFICIENT_RESOURCES;
1677             goto exit;
1678         }
1679 
1680         Status = STATUS_SUCCESS;
1681 
1682         _SEH2_TRY {
1683             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
1684         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1685             Status = _SEH2_GetExceptionCode();
1686         } _SEH2_END;
1687 
1688         if (!NT_SUCCESS(Status)) {
1689             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1690             IoFreeMdl(master_mdl);
1691             goto exit;
1692         }
1693 
1694         pfns = (PFN_NUMBER*)(master_mdl + 1);
1695 
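        // Compute, for each device, the byte range [stripestart, stripeend) touched by this read.
        // Devices earlier in the rotation than the starting stripe only join in on the next row,
        // so they round up to the following stripe boundary; later devices start at the current
        // boundary. The end offsets are handled symmetrically around the final stripe.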
1696         for (i = 0; i < ci->num_stripes; i++) {
1697             if (startoffstripe > i)
1698                 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1699             else if (startoffstripe == i)
1700                 context.stripes[i].stripestart = startoff;
1701             else
1702                 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length);
1703 
1704             if (endoffstripe > i)
1705                 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1706             else if (endoffstripe == i)
1707                 context.stripes[i].stripeend = endoff + 1;
1708             else
1709                 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length);
1710 
1711             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
1712                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
1713 
1714                 if (!context.stripes[i].mdl) {
1715                     ERR("IoAllocateMdl failed\n");
1716                     MmUnlockPages(master_mdl);
1717                     IoFreeMdl(master_mdl);
1718                     Status = STATUS_INSUFFICIENT_RESOURCES;
1719                     goto exit;
1720                 }
1721             }
1722         }
1723 
1724         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
1725         if (!stripeoff) {
1726             ERR("out of memory\n");
1727             MmUnlockPages(master_mdl);
1728             IoFreeMdl(master_mdl);
1729             Status = STATUS_INSUFFICIENT_RESOURCES;
1730             goto exit;
1731         }
1732 
1733         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
1734 
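        // Hand out PFNs from the master MDL round-robin, a stripe length at a time (with a shorter
        // first chunk if the read doesn't start on a stripe boundary), so each per-stripe MDL
        // describes exactly the pages of the caller's buffer that device should fill.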
1735         pos = 0;
1736         stripe = startoffstripe;
1737         while (pos < length) {
1738             PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
1739 
1740             if (pos == 0) {
1741                 uint32_t readlen = (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length));
1742 
1743                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1744 
1745                 stripeoff[stripe] += readlen;
1746                 pos += readlen;
1747             } else if (length - pos < ci->stripe_length) {
1748                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1749 
1750                 pos = length;
1751             } else {
1752                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1753 
1754                 stripeoff[stripe] += (uint32_t)ci->stripe_length;
1755                 pos += (uint32_t)ci->stripe_length;
1756             }
1757 
1758             stripe = (stripe + 1) % ci->num_stripes;
1759         }
1760 
1761         MmUnlockPages(master_mdl);
1762         IoFreeMdl(master_mdl);
1763 
1764         ExFreePool(stripeoff);
1765     } else if (type == BLOCK_FLAG_RAID10) {
1766         uint64_t startoff, endoff;
1767         uint16_t endoffstripe, j, stripe;
1768         ULONG orig_ls;
1769         PMDL master_mdl;
1770         PFN_NUMBER* pfns;
1771         uint32_t* stripeoff, pos;
1772         read_data_stripe** stripes;
1773 
1774         if (c)
1775             orig_ls = c->last_stripe;
1776         else
1777             orig_ls = 0;
1778 
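        // RAID10: each group of sub_stripes devices holds identical data. Pick one mirror per
        // group, preferring the one indicated by the chunk's last_stripe so successive reads
        // rotate across the mirrors, and fall back to any present device in the group.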
1779         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &startoff, &startoffstripe);
1780         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &endoff, &endoffstripe);
1781 
1782         if ((ci->num_stripes % ci->sub_stripes) != 0) {
1783             ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", offset, ci->num_stripes, ci->sub_stripes);
1784             Status = STATUS_INTERNAL_ERROR;
1785             goto exit;
1786         }
1787 
1788         if (file_read) {
1789             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1790 
1791             if (!context.va) {
1792                 ERR("out of memory\n");
1793                 Status = STATUS_INSUFFICIENT_RESOURCES;
1794                 goto exit;
1795             }
1796         } else
1797             context.va = buf;
1798 
1799         context.firstoff = (uint16_t)((startoff % ci->stripe_length) / Vcb->superblock.sector_size);
1800         context.startoffstripe = startoffstripe;
1801         context.sectors_per_stripe = (uint16_t)(ci->stripe_length / Vcb->superblock.sector_size);
1802 
1803         startoffstripe *= ci->sub_stripes;
1804         endoffstripe *= ci->sub_stripes;
1805 
1806         if (c)
1807             c->last_stripe = (orig_ls + 1) % ci->sub_stripes;
1808 
1809         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1810         if (!master_mdl) {
1811             ERR("out of memory\n");
1812             Status = STATUS_INSUFFICIENT_RESOURCES;
1813             goto exit;
1814         }
1815 
1816         Status = STATUS_SUCCESS;
1817 
1818         _SEH2_TRY {
1819             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
1820         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1821             Status = _SEH2_GetExceptionCode();
1822         } _SEH2_END;
1823 
1824         if (!NT_SUCCESS(Status)) {
1825             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1826             IoFreeMdl(master_mdl);
1827             goto exit;
1828         }
1829 
1830         pfns = (PFN_NUMBER*)(master_mdl + 1);
1831 
1832         stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1833         if (!stripes) {
1834             ERR("out of memory\n");
1835             MmUnlockPages(master_mdl);
1836             IoFreeMdl(master_mdl);
1837             Status = STATUS_INSUFFICIENT_RESOURCES;
1838             goto exit;
1839         }
1840 
1841         RtlZeroMemory(stripes, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes);
1842 
1843         for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
1844             uint64_t sstart, send;
1845             bool stripeset = false;
1846 
1847             if (startoffstripe > i)
1848                 sstart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1849             else if (startoffstripe == i)
1850                 sstart = startoff;
1851             else
1852                 sstart = startoff - (startoff % ci->stripe_length);
1853 
1854             if (endoffstripe > i)
1855                 send = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1856             else if (endoffstripe == i)
1857                 send = endoff + 1;
1858             else
1859                 send = endoff - (endoff % ci->stripe_length);
1860 
1861             for (j = 0; j < ci->sub_stripes; j++) {
1862                 if (j == orig_ls && devices[i+j] && devices[i+j]->devobj) {
1863                     context.stripes[i+j].stripestart = sstart;
1864                     context.stripes[i+j].stripeend = send;
1865                     stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1866 
1867                     if (sstart != send) {
1868                         context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1869 
1870                         if (!context.stripes[i+j].mdl) {
1871                             ERR("IoAllocateMdl failed\n");
1872                             MmUnlockPages(master_mdl);
1873                             IoFreeMdl(master_mdl);
1874                             Status = STATUS_INSUFFICIENT_RESOURCES;
1875                             goto exit;
1876                         }
1877                     }
1878 
1879                     stripeset = true;
1880                 } else
1881                     context.stripes[i+j].status = ReadDataStatus_Skip;
1882             }
1883 
1884             if (!stripeset) {
1885                 for (j = 0; j < ci->sub_stripes; j++) {
1886                     if (devices[i+j] && devices[i+j]->devobj) {
1887                         context.stripes[i+j].stripestart = sstart;
1888                         context.stripes[i+j].stripeend = send;
1889                         context.stripes[i+j].status = ReadDataStatus_Pending;
1890                         stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1891 
1892                         if (sstart != send) {
1893                             context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1894 
1895                             if (!context.stripes[i+j].mdl) {
1896                                 ERR("IoAllocateMdl failed\n");
1897                                 MmUnlockPages(master_mdl);
1898                                 IoFreeMdl(master_mdl);
1899                                 Status = STATUS_INSUFFICIENT_RESOURCES;
1900                                 goto exit;
1901                             }
1902                         }
1903 
1904                         stripeset = true;
1905                         break;
1906                     }
1907                 }
1908 
1909                 if (!stripeset) {
1910                     ERR("could not find stripe to read\n");
1911                     Status = STATUS_DEVICE_NOT_READY;
1912                     goto exit;
1913                 }
1914             }
1915         }
1916 
1917         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1918         if (!stripeoff) {
1919             ERR("out of memory\n");
1920             MmUnlockPages(master_mdl);
1921             IoFreeMdl(master_mdl);
1922             Status = STATUS_INSUFFICIENT_RESOURCES;
1923             goto exit;
1924         }
1925 
1926         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes);
1927 
1928         pos = 0;
1929         stripe = startoffstripe / ci->sub_stripes;
1930         while (pos < length) {
1931             PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(stripes[stripe]->mdl + 1);
1932 
1933             if (pos == 0) {
1934                 uint32_t readlen = (uint32_t)min(stripes[stripe]->stripeend - stripes[stripe]->stripestart,
1935                                              ci->stripe_length - (stripes[stripe]->stripestart % ci->stripe_length));
1936 
1937                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1938 
1939                 stripeoff[stripe] += readlen;
1940                 pos += readlen;
1941             } else if (length - pos < ci->stripe_length) {
1942                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1943 
1944                 pos = length;
1945             } else {
1946                 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1947 
1948                 stripeoff[stripe] += (ULONG)ci->stripe_length;
1949                 pos += (ULONG)ci->stripe_length;
1950             }
1951 
1952             stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
1953         }
1954 
1955         MmUnlockPages(master_mdl);
1956         IoFreeMdl(master_mdl);
1957 
1958         ExFreePool(stripeoff);
1959         ExFreePool(stripes);
1960     } else if (type == BLOCK_FLAG_DUPLICATE) {
1961         uint64_t orig_ls;
1962 
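        // SINGLE/DUP/RAID1-style chunks: every stripe holds a complete copy, so pick one device
        // that is actually present (again rotating via last_stripe) and read the whole range from
        // it in a single request.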
1963         if (c)
1964             orig_ls = i = c->last_stripe;
1965         else
1966             orig_ls = i = 0;
1967 
1968         while (!devices[i] || !devices[i]->devobj) {
1969             i = (i + 1) % ci->num_stripes;
1970 
1971             if (i == orig_ls) {
1972                 ERR("no devices available to service request\n");
1973                 Status = STATUS_DEVICE_NOT_READY;
1974                 goto exit;
1975             }
1976         }
1977 
1978         if (c)
1979             c->last_stripe = (i + 1) % ci->num_stripes;
1980 
1981         context.stripes[i].stripestart = addr - offset;
1982         context.stripes[i].stripeend = context.stripes[i].stripestart + length;
1983 
1984         if (file_read) {
1985             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1986 
1987             if (!context.va) {
1988                 ERR("out of memory\n");
1989                 Status = STATUS_INSUFFICIENT_RESOURCES;
1990                 goto exit;
1991             }
1992 
1993             context.stripes[i].mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1994             if (!context.stripes[i].mdl) {
1995                 ERR("IoAllocateMdl failed\n");
1996                 Status = STATUS_INSUFFICIENT_RESOURCES;
1997                 goto exit;
1998             }
1999 
2000             MmBuildMdlForNonPagedPool(context.stripes[i].mdl);
2001         } else {
2002             context.stripes[i].mdl = IoAllocateMdl(buf, length, false, false, NULL);
2003 
2004             if (!context.stripes[i].mdl) {
2005                 ERR("IoAllocateMdl failed\n");
2006                 Status = STATUS_INSUFFICIENT_RESOURCES;
2007                 goto exit;
2008             }
2009 
2010             Status = STATUS_SUCCESS;
2011 
2012             _SEH2_TRY {
2013                 MmProbeAndLockPages(context.stripes[i].mdl, KernelMode, IoWriteAccess);
2014             } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2015                 Status = _SEH2_GetExceptionCode();
2016             } _SEH2_END;
2017 
2018             if (!NT_SUCCESS(Status)) {
2019                 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2020                 goto exit;
2021             }
2022         }
2023     } else if (type == BLOCK_FLAG_RAID5) {
2024         uint64_t startoff, endoff;
2025         uint16_t endoffstripe, parity;
2026         uint32_t *stripeoff, pos;
2027         PMDL master_mdl;
2028         PFN_NUMBER *pfns, dummy;
2029         bool need_dummy = false;
2030 
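        // RAID5 reads are set up in two passes. The first works out each device's
        // [stripestart, stripeend) byte range; the second distributes PFNs from the master MDL
        // into the per-stripe MDLs, substituting a dummy page wherever a parity stripe crosses
        // the range so that each device still gets one contiguous request.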
2031         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &startoff, &startoffstripe);
2032         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 1, &endoff, &endoffstripe);
2033 
2034         if (file_read) {
2035             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2036 
2037             if (!context.va) {
2038                 ERR("out of memory\n");
2039                 Status = STATUS_INSUFFICIENT_RESOURCES;
2040                 goto exit;
2041             }
2042         } else
2043             context.va = buf;
2044 
2045         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2046         if (!master_mdl) {
2047             ERR("out of memory\n");
2048             Status = STATUS_INSUFFICIENT_RESOURCES;
2049             goto exit;
2050         }
2051 
2052         Status = STATUS_SUCCESS;
2053 
2054         _SEH2_TRY {
2055             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
2056         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2057             Status = _SEH2_GetExceptionCode();
2058         } _SEH2_END;
2059 
2060         if (!NT_SUCCESS(Status)) {
2061             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2062             IoFreeMdl(master_mdl);
2063             goto exit;
2064         }
2065 
2066         pfns = (PFN_NUMBER*)(master_mdl + 1);
2067 
2068         pos = 0;
2069         while (pos < length) {
2070             parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2071 
2072             if (pos == 0) {
2073                 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2074                 ULONG skip, readlen;
2075 
2076                 i = startoffstripe;
2077                 while (stripe != parity) {
2078                     if (i == startoffstripe) {
2079                         readlen = min(length, (ULONG)(ci->stripe_length - (startoff % ci->stripe_length)));
2080 
2081                         context.stripes[stripe].stripestart = startoff;
2082                         context.stripes[stripe].stripeend = startoff + readlen;
2083 
2084                         pos += readlen;
2085 
2086                         if (pos == length)
2087                             break;
2088                     } else {
2089                         readlen = min(length - pos, (ULONG)ci->stripe_length);
2090 
2091                         context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2092                         context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2093 
2094                         pos += readlen;
2095 
2096                         if (pos == length)
2097                             break;
2098                     }
2099 
2100                     i++;
2101                     stripe = (stripe + 1) % ci->num_stripes;
2102                 }
2103 
2104                 if (pos == length)
2105                     break;
2106 
2107                 for (i = 0; i < startoffstripe; i++) {
2108                     uint16_t stripe2 = (parity + i + 1) % ci->num_stripes;
2109 
2110                     context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2111                 }
2112 
2113                 context.stripes[parity].stripestart = context.stripes[parity].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2114 
2115                 if (length - pos > ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length) {
2116                     skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length)) - 1);
2117 
2118                     for (i = 0; i < ci->num_stripes; i++) {
2119                         context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2120                     }
2121 
2122                     pos += (uint32_t)(skip * (ci->num_stripes - 1) * ci->num_stripes * ci->stripe_length);
2123                     need_dummy = true;
2124                 }
2125             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2126                 for (i = 0; i < ci->num_stripes; i++) {
2127                     context.stripes[i].stripeend += ci->stripe_length;
2128                 }
2129 
2130                 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 1));
2131                 need_dummy = true;
2132             } else {
2133                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2134 
2135                 i = 0;
2136                 while (stripe != parity) {
2137                     if (endoffstripe == i) {
2138                         context.stripes[stripe].stripeend = endoff + 1;
2139                         break;
2140                     } else if (endoffstripe > i)
2141                         context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2142 
2143                     i++;
2144                     stripe = (stripe + 1) % ci->num_stripes;
2145                 }
2146 
2147                 break;
2148             }
2149         }
2150 
2151         for (i = 0; i < ci->num_stripes; i++) {
2152             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2153                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart),
2154                                                        false, false, NULL);
2155 
2156                 if (!context.stripes[i].mdl) {
2157                     ERR("IoAllocateMdl failed\n");
2158                     MmUnlockPages(master_mdl);
2159                     IoFreeMdl(master_mdl);
2160                     Status = STATUS_INSUFFICIENT_RESOURCES;
2161                     goto exit;
2162                 }
2163             }
2164         }
2165 
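        // Parity stripes inside the read range have no destination in the caller's buffer, so
        // they are pointed at a single throwaway page whose PFN is reused for every parity-sized
        // hole in the loop below.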
2166         if (need_dummy) {
2167             dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2168             if (!dummypage) {
2169                 ERR("out of memory\n");
2170                 MmUnlockPages(master_mdl);
2171                 IoFreeMdl(master_mdl);
2172                 Status = STATUS_INSUFFICIENT_RESOURCES;
2173                 goto exit;
2174             }
2175 
2176             dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2177             if (!dummy_mdl) {
2178                 ERR("IoAllocateMdl failed\n");
2179                 MmUnlockPages(master_mdl);
2180                 IoFreeMdl(master_mdl);
2181                 Status = STATUS_INSUFFICIENT_RESOURCES;
2182                 goto exit;
2183             }
2184 
2185             MmBuildMdlForNonPagedPool(dummy_mdl);
2186 
2187             dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2188         }
2189 
2190         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2191         if (!stripeoff) {
2192             ERR("out of memory\n");
2193             MmUnlockPages(master_mdl);
2194             IoFreeMdl(master_mdl);
2195             Status = STATUS_INSUFFICIENT_RESOURCES;
2196             goto exit;
2197         }
2198 
2199         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2200 
2201         pos = 0;
2202 
2203         while (pos < length) {
2204             PFN_NUMBER* stripe_pfns;
2205 
2206             parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2207 
2208             if (pos == 0) {
2209                 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2210                 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2211                                                        ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2212 
2213                 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2214 
2215                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2216 
2217                 stripeoff[stripe] = readlen;
2218                 pos += readlen;
2219 
2220                 stripe = (stripe + 1) % ci->num_stripes;
2221 
2222                 while (stripe != parity) {
2223                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2224                     readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2225 
2226                     if (readlen == 0)
2227                         break;
2228 
2229                     RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2230 
2231                     stripeoff[stripe] = readlen;
2232                     pos += readlen;
2233 
2234                     stripe = (stripe + 1) % ci->num_stripes;
2235                 }
2236             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2237                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2238                 ULONG k;
2239 
2240                 while (stripe != parity) {
2241                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2242 
2243                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2244 
2245                     stripeoff[stripe] += (uint32_t)ci->stripe_length;
2246                     pos += (uint32_t)ci->stripe_length;
2247 
2248                     stripe = (stripe + 1) % ci->num_stripes;
2249                 }
2250 
2251                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity].mdl + 1);
2252 
2253                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2254                     stripe_pfns[stripeoff[parity] >> PAGE_SHIFT] = dummy;
2255                     stripeoff[parity] += PAGE_SIZE;
2256                 }
2257             } else {
2258                 uint16_t stripe = (parity + 1) % ci->num_stripes;
2259                 uint32_t readlen;
2260 
2261                 while (pos < length) {
2262                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2263                     readlen = min(length - pos, (ULONG)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2264 
2265                     if (readlen == 0)
2266                         break;
2267 
2268                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2269 
2270                     stripeoff[stripe] += readlen;
2271                     pos += readlen;
2272 
2273                     stripe = (stripe + 1) % ci->num_stripes;
2274                 }
2275             }
2276         }
2277 
2278         MmUnlockPages(master_mdl);
2279         IoFreeMdl(master_mdl);
2280 
2281         ExFreePool(stripeoff);
2282     } else if (type == BLOCK_FLAG_RAID6) {
2283         uint64_t startoff, endoff;
2284         uint16_t endoffstripe, parity1;
2285         uint32_t *stripeoff, pos;
2286         PMDL master_mdl;
2287         PFN_NUMBER *pfns, dummy;
2288         bool need_dummy = false;
2289 
2290         get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &startoff, &startoffstripe);
2291         get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 2, &endoff, &endoffstripe);
2292 
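        // RAID6 follows the same two-pass scheme as RAID5, except each row carries two parity
        // stripes (P at parity1, Q on the following device), so logical data stripes start at
        // parity1 + 2 and only num_stripes - 2 of the devices hold data for any given row.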
2293         if (file_read) {
2294             context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2295 
2296             if (!context.va) {
2297                 ERR("out of memory\n");
2298                 Status = STATUS_INSUFFICIENT_RESOURCES;
2299                 goto exit;
2300             }
2301         } else
2302             context.va = buf;
2303 
2304         master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2305         if (!master_mdl) {
2306             ERR("out of memory\n");
2307             Status = STATUS_INSUFFICIENT_RESOURCES;
2308             goto exit;
2309         }
2310 
2311         Status = STATUS_SUCCESS;
2312 
2313         _SEH2_TRY {
2314             MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
2315         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2316             Status = _SEH2_GetExceptionCode();
2317         } _SEH2_END;
2318 
2319         if (!NT_SUCCESS(Status)) {
2320             ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2321             IoFreeMdl(master_mdl);
2322             goto exit;
2323         }
2324 
2325         pfns = (PFN_NUMBER*)(master_mdl + 1);
2326 
2327         pos = 0;
2328         while (pos < length) {
2329             parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
2330 
2331             if (pos == 0) {
2332                 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes, parity2;
2333                 ULONG skip, readlen;
2334 
2335                 i = startoffstripe;
2336                 while (stripe != parity1) {
2337                     if (i == startoffstripe) {
2338                         readlen = (ULONG)min(length, ci->stripe_length - (startoff % ci->stripe_length));
2339 
2340                         context.stripes[stripe].stripestart = startoff;
2341                         context.stripes[stripe].stripeend = startoff + readlen;
2342 
2343                         pos += readlen;
2344 
2345                         if (pos == length)
2346                             break;
2347                     } else {
2348                         readlen = min(length - pos, (ULONG)ci->stripe_length);
2349 
2350                         context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2351                         context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2352 
2353                         pos += readlen;
2354 
2355                         if (pos == length)
2356                             break;
2357                     }
2358 
2359                     i++;
2360                     stripe = (stripe + 1) % ci->num_stripes;
2361                 }
2362 
2363                 if (pos == length)
2364                     break;
2365 
2366                 for (i = 0; i < startoffstripe; i++) {
2367                     uint16_t stripe2 = (parity1 + i + 2) % ci->num_stripes;
2368 
2369                     context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2370                 }
2371 
2372                 context.stripes[parity1].stripestart = context.stripes[parity1].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2373 
2374                 parity2 = (parity1 + 1) % ci->num_stripes;
2375                 context.stripes[parity2].stripestart = context.stripes[parity2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2376 
2377                 if (length - pos > ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length) {
2378                     skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length)) - 1);
2379 
2380                     for (i = 0; i < ci->num_stripes; i++) {
2381                         context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2382                     }
2383 
2384                     pos += (uint32_t)(skip * (ci->num_stripes - 2) * ci->num_stripes * ci->stripe_length);
2385                     need_dummy = true;
2386                 }
2387             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2388                 for (i = 0; i < ci->num_stripes; i++) {
2389                     context.stripes[i].stripeend += ci->stripe_length;
2390                 }
2391 
2392                 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 2));
2393                 need_dummy = true;
2394             } else {
2395                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2396 
2397                 i = 0;
2398                 while (stripe != parity1) {
2399                     if (endoffstripe == i) {
2400                         context.stripes[stripe].stripeend = endoff + 1;
2401                         break;
2402                     } else if (endoffstripe > i)
2403                         context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2404 
2405                     i++;
2406                     stripe = (stripe + 1) % ci->num_stripes;
2407                 }
2408 
2409                 break;
2410             }
2411         }
2412 
2413         for (i = 0; i < ci->num_stripes; i++) {
2414             if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2415                 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
2416 
2417                 if (!context.stripes[i].mdl) {
2418                     ERR("IoAllocateMdl failed\n");
2419                     MmUnlockPages(master_mdl);
2420                     IoFreeMdl(master_mdl);
2421                     Status = STATUS_INSUFFICIENT_RESOURCES;
2422                     goto exit;
2423                 }
2424             }
2425         }
2426 
2427         if (need_dummy) {
2428             dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2429             if (!dummypage) {
2430                 ERR("out of memory\n");
2431                 MmUnlockPages(master_mdl);
2432                 IoFreeMdl(master_mdl);
2433                 Status = STATUS_INSUFFICIENT_RESOURCES;
2434                 goto exit;
2435             }
2436 
2437             dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2438             if (!dummy_mdl) {
2439                 ERR("IoAllocateMdl failed\n");
2440                 MmUnlockPages(master_mdl);
2441                 IoFreeMdl(master_mdl);
2442                 Status = STATUS_INSUFFICIENT_RESOURCES;
2443                 goto exit;
2444             }
2445 
2446             MmBuildMdlForNonPagedPool(dummy_mdl);
2447 
2448             dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2449         }
2450 
2451         stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2452         if (!stripeoff) {
2453             ERR("out of memory\n");
2454             MmUnlockPages(master_mdl);
2455             IoFreeMdl(master_mdl);
2456             Status = STATUS_INSUFFICIENT_RESOURCES;
2457             goto exit;
2458         }
2459 
2460         RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2461 
2462         pos = 0;
2463 
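        // Scatter the caller's locked PFNs into the per-stripe MDLs: the first pass
        // handles the partial leading row, full rows copy one stripe_length per data
        // stripe and point both parity slots at the dummy page, and the final pass
        // copies whatever remains.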
2464         while (pos < length) {
2465             PFN_NUMBER* stripe_pfns;
2466 
2467             parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
2468 
2469             if (pos == 0) {
2470                 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes;
2471                 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2472                                                        ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2473 
2474                 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2475 
2476                 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2477 
2478                 stripeoff[stripe] = readlen;
2479                 pos += readlen;
2480 
2481                 stripe = (stripe + 1) % ci->num_stripes;
2482 
2483                 while (stripe != parity1) {
2484                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2485                     readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2486 
2487                     if (readlen == 0)
2488                         break;
2489 
2490                     RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2491 
2492                     stripeoff[stripe] = readlen;
2493                     pos += readlen;
2494 
2495                     stripe = (stripe + 1) % ci->num_stripes;
2496                 }
2497             } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2498                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2499                 uint16_t parity2 = (parity1 + 1) % ci->num_stripes;
2500                 ULONG k;
2501 
2502                 while (stripe != parity1) {
2503                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2504 
2505                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2506 
2507                     stripeoff[stripe] += (uint32_t)ci->stripe_length;
2508                     pos += (uint32_t)ci->stripe_length;
2509 
2510                     stripe = (stripe + 1) % ci->num_stripes;
2511                 }
2512 
2513                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity1].mdl + 1);
2514 
2515                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2516                     stripe_pfns[stripeoff[parity1] >> PAGE_SHIFT] = dummy;
2517                     stripeoff[parity1] += PAGE_SIZE;
2518                 }
2519 
2520                 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity2].mdl + 1);
2521 
2522                 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2523                     stripe_pfns[stripeoff[parity2] >> PAGE_SHIFT] = dummy;
2524                     stripeoff[parity2] += PAGE_SIZE;
2525                 }
2526             } else {
2527                 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2528                 uint32_t readlen;
2529 
2530                 while (pos < length) {
2531                     stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2532                     readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2533 
2534                     if (readlen == 0)
2535                         break;
2536 
2537                     RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2538 
2539                     stripeoff[stripe] += readlen;
2540                     pos += readlen;
2541 
2542                     stripe = (stripe + 1) % ci->num_stripes;
2543                 }
2544             }
2545         }
2546 
2547         MmUnlockPages(master_mdl);
2548         IoFreeMdl(master_mdl);
2549 
2550         ExFreePool(stripeoff);
2551     }
2552 
2553     context.address = addr;
2554 
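    // Mark stripes that cannot be read (device missing) or need not be (empty range);
    // only genuinely absent devices count towards missing_devices, which is compared
    // against allowed_missing below.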
2555     for (i = 0; i < ci->num_stripes; i++) {
2556         if (!devices[i] || !devices[i]->devobj || context.stripes[i].stripestart == context.stripes[i].stripeend) {
2557             context.stripes[i].status = ReadDataStatus_MissingDevice;
2558             context.stripes_left--;
2559 
2560             if (!devices[i] || !devices[i]->devobj)
2561                 missing_devices++;
2562         }
2563     }
2564 
2565     if (missing_devices > allowed_missing) {
2566         ERR("not enough devices to service request (%u missing)\n", missing_devices);
2567         Status = STATUS_UNEXPECTED_IO_ERROR;
2568         goto exit;
2569     }
2570 
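    // Build one read IRP per remaining stripe: standalone IRPs when no master Irp was
    // supplied, associated IRPs otherwise. The transfer is described by the stripe's
    // MDL for direct-I/O devices, or by a system buffer for buffered-I/O devices.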
2571     for (i = 0; i < ci->num_stripes; i++) {
2572         PIO_STACK_LOCATION IrpSp;
2573 
2574         if (devices[i] && devices[i]->devobj && context.stripes[i].stripestart != context.stripes[i].stripeend && context.stripes[i].status != ReadDataStatus_Skip) {
2575             context.stripes[i].context = (struct read_data_context*)&context;
2576 
2577             if (type == BLOCK_FLAG_RAID10) {
2578                 context.stripes[i].stripenum = i / ci->sub_stripes;
2579             }
2580 
2581             if (!Irp) {
2582                 context.stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, false);
2583 
2584                 if (!context.stripes[i].Irp) {
2585                     ERR("IoAllocateIrp failed\n");
2586                     Status = STATUS_INSUFFICIENT_RESOURCES;
2587                     goto exit;
2588                 }
2589             } else {
2590                 context.stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
2591 
2592                 if (!context.stripes[i].Irp) {
2593                     ERR("IoMakeAssociatedIrp failed\n");
2594                     Status = STATUS_INSUFFICIENT_RESOURCES;
2595                     goto exit;
2596                 }
2597             }
2598 
2599             IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2600             IrpSp->MajorFunction = IRP_MJ_READ;
2601             IrpSp->MinorFunction = IRP_MN_NORMAL;
2602             IrpSp->FileObject = devices[i]->fileobj;
2603 
2604             if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2605                 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), ALLOC_TAG);
2606                 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2607                     ERR("out of memory\n");
2608                     Status = STATUS_INSUFFICIENT_RESOURCES;
2609                     goto exit;
2610                 }
2611 
2612                 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2613 
2614                 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2615             } else if (devices[i]->devobj->Flags & DO_DIRECT_IO)
2616                 context.stripes[i].Irp->MdlAddress = context.stripes[i].mdl;
2617             else
2618                 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2619 
2620             IrpSp->Parameters.Read.Length = (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart);
2621             IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].stripestart + cis[i].offset;
2622 
2623             total_reading += IrpSp->Parameters.Read.Length;
2624 
2625             context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2626 
2627             IoSetCompletionRoutine(context.stripes[i].Irp, read_data_completion, &context.stripes[i], true, true, true);
2628 
2629             context.stripes[i].status = ReadDataStatus_Pending;
2630         }
2631     }
2632 
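    // Send the IRPs only once all of them have been built, then wait on context.Event
    // for the outstanding stripes to complete.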
2633     need_to_wait = false;
2634     for (i = 0; i < ci->num_stripes; i++) {
2635         if (context.stripes[i].status != ReadDataStatus_MissingDevice && context.stripes[i].status != ReadDataStatus_Skip) {
2636             IoCallDriver(devices[i]->devobj, context.stripes[i].Irp);
2637             need_to_wait = true;
2638         }
2639     }
2640 
2641     if (need_to_wait)
2642         KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2643 
2644     if (diskacc)
2645         fFsRtlUpdateDiskCounters(total_reading, 0);
2646 
2647     // check if any of the devices returned a "user-induced" error
2648 
2649     for (i = 0; i < ci->num_stripes; i++) {
2650         if (context.stripes[i].status == ReadDataStatus_Error && IoIsErrorUserInduced(context.stripes[i].iosb.Status)) {
2651             Status = context.stripes[i].iosb.Status;
2652             goto exit;
2653         }
2654     }
2655 
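    // Hand the raw stripe data to the profile-specific handler. For file reads the data
    // was gathered into context.va, so on success it is copied back into the caller's
    // buffer and the temporary allocation freed.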
2656     if (type == BLOCK_FLAG_RAID0) {
2657         Status = read_data_raid0(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2658         if (!NT_SUCCESS(Status)) {
2659             ERR("read_data_raid0 returned %08lx\n", Status);
2660 
2661             if (file_read)
2662                 ExFreePool(context.va);
2663 
2664             goto exit;
2665         }
2666 
2667         if (file_read) {
2668             RtlCopyMemory(buf, context.va, length);
2669             ExFreePool(context.va);
2670         }
2671     } else if (type == BLOCK_FLAG_RAID10) {
2672         Status = read_data_raid10(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2673 
2674         if (!NT_SUCCESS(Status)) {
2675             ERR("read_data_raid10 returned %08lx\n", Status);
2676 
2677             if (file_read)
2678                 ExFreePool(context.va);
2679 
2680             goto exit;
2681         }
2682 
2683         if (file_read) {
2684             RtlCopyMemory(buf, context.va, length);
2685             ExFreePool(context.va);
2686         }
2687     } else if (type == BLOCK_FLAG_DUPLICATE) {
2688         Status = read_data_dup(Vcb, file_read ? context.va : buf, addr, &context, ci, devices, generation);
2689         if (!NT_SUCCESS(Status)) {
2690             ERR("read_data_dup returned %08lx\n", Status);
2691 
2692             if (file_read)
2693                 ExFreePool(context.va);
2694 
2695             goto exit;
2696         }
2697 
2698         if (file_read) {
2699             RtlCopyMemory(buf, context.va, length);
2700             ExFreePool(context.va);
2701         }
2702     } else if (type == BLOCK_FLAG_RAID5) {
2703         Status = read_data_raid5(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2704         if (!NT_SUCCESS(Status)) {
2705             ERR("read_data_raid5 returned %08lx\n", Status);
2706 
2707             if (file_read)
2708                 ExFreePool(context.va);
2709 
2710             goto exit;
2711         }
2712 
2713         if (file_read) {
2714             RtlCopyMemory(buf, context.va, length);
2715             ExFreePool(context.va);
2716         }
2717     } else if (type == BLOCK_FLAG_RAID6) {
2718         Status = read_data_raid6(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2719         if (!NT_SUCCESS(Status)) {
2720             ERR("read_data_raid6 returned %08lx\n", Status);
2721 
2722             if (file_read)
2723                 ExFreePool(context.va);
2724 
2725             goto exit;
2726         }
2727 
2728         if (file_read) {
2729             RtlCopyMemory(buf, context.va, length);
2730             ExFreePool(context.va);
2731         }
2732     }
2733 
2734 exit:
2735     if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6))
2736         chunk_unlock_range(Vcb, c, lockaddr, locklen);
2737 
2738     if (dummy_mdl)
2739         IoFreeMdl(dummy_mdl);
2740 
2741     if (dummypage)
2742         ExFreePool(dummypage);
2743 
2744     for (i = 0; i < ci->num_stripes; i++) {
2745         if (context.stripes[i].mdl) {
2746             if (context.stripes[i].mdl->MdlFlags & MDL_PAGES_LOCKED)
2747                 MmUnlockPages(context.stripes[i].mdl);
2748 
2749             IoFreeMdl(context.stripes[i].mdl);
2750         }
2751 
2752         if (context.stripes[i].Irp)
2753             IoFreeIrp(context.stripes[i].Irp);
2754     }
2755 
2756     ExFreePool(context.stripes);
2757 
2758     if (!Vcb->log_to_phys_loaded)
2759         ExFreePool(devices);
2760 
2761     return Status;
2762 }
2763 
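// Reads of alternate data streams are serviced entirely from the in-memory
// fcb->adsdata buffer.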
2764 NTSTATUS read_stream(fcb* fcb, uint8_t* data, uint64_t start, ULONG length, ULONG* pbr) {
2765     ULONG readlen;
2766 
2767     TRACE("(%p, %p, %I64x, %lx, %p)\n", fcb, data, start, length, pbr);
2768 
2769     if (pbr) *pbr = 0;
2770 
2771     if (start >= fcb->adsdata.Length) {
2772         TRACE("tried to read beyond end of stream\n");
2773         return STATUS_END_OF_FILE;
2774     }
2775 
2776     if (length == 0) {
2777         WARN("tried to read zero bytes\n");
2778         return STATUS_SUCCESS;
2779     }
2780 
2781     if (start + length < fcb->adsdata.Length)
2782         readlen = length;
2783     else
2784         readlen = fcb->adsdata.Length - (ULONG)start;
2785 
2786     if (readlen > 0)
2787         RtlCopyMemory(data, fcb->adsdata.Buffer + start, readlen);
2788 
2789     if (pbr) *pbr = readlen;
2790 
2791     return STATUS_SUCCESS;
2792 }
2793 
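// A read_part describes one contiguous run of disk addresses fetched with a single
// read_data call; after merging, a run may cover several adjacent file extents
// (hence the extents[] array).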
2794 typedef struct {
2795     uint64_t off;
2796     uint64_t ed_size;
2797     uint64_t ed_offset;
2798     uint64_t ed_num_bytes;
2799 } read_part_extent;
2800 
2801 typedef struct {
2802     LIST_ENTRY list_entry;
2803     uint64_t addr;
2804     chunk* c;
2805     uint32_t read;
2806     uint32_t to_read;
2807     void* csum;
2808     bool csum_free;
2809     uint8_t* buf;
2810     bool buf_free;
2811     uint32_t bumpoff;
2812     bool mdl;
2813     void* data;
2814     uint8_t compression;
2815     unsigned int num_extents;
2816     read_part_extent extents[1];
2817 } read_part;
2818 
2819 typedef struct {
2820     LIST_ENTRY list_entry;
2821     calc_job* cj;
2822     void* decomp;
2823     void* data;
2824     unsigned int offset;
2825     size_t length;
2826 } comp_calc_job;
2827 
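// read_file walks the fcb's extent list in offset order: holes are zero-filled, inline
// and prealloc extents are satisfied immediately, and regular extents are queued as
// read_parts (merged where contiguous) whose decompression is deferred to calc jobs
// drained once all reads have completed.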
2828 NTSTATUS read_file(fcb* fcb, uint8_t* data, uint64_t start, uint64_t length, ULONG* pbr, PIRP Irp) {
2829     NTSTATUS Status;
2830     uint32_t bytes_read = 0;
2831     uint64_t last_end;
2832     LIST_ENTRY* le;
2833     POOL_TYPE pool_type;
2834     LIST_ENTRY read_parts, calc_jobs;
2835 
2836     TRACE("(%p, %p, %I64x, %I64x, %p)\n", fcb, data, start, length, pbr);
2837 
2838     if (pbr)
2839         *pbr = 0;
2840 
2841     if (start >= fcb->inode_item.st_size) {
2842         WARN("Tried to read beyond end of file\n");
2843         return STATUS_END_OF_FILE;
2844     }
2845 
2846     InitializeListHead(&read_parts);
2847     InitializeListHead(&calc_jobs);
2848 
2849     pool_type = fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? NonPagedPool : PagedPool;
2850 
2851     le = fcb->extents.Flink;
2852 
2853     last_end = start;
2854 
2855     while (le != &fcb->extents) {
2856         uint64_t len;
2857         extent* ext = CONTAINING_RECORD(le, extent, list_entry);
2858 
2859         if (!ext->ignore) {
2860             EXTENT_DATA* ed = &ext->extent_data;
2861             EXTENT_DATA2* ed2 = (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC) ? (EXTENT_DATA2*)ed->data : NULL;
2862 
2863             len = ed2 ? ed2->num_bytes : ed->decoded_size;
2864 
2865             if (ext->offset + len <= start) {
2866                 last_end = ext->offset + len;
2867                 goto nextitem;
2868             }
2869 
2870             if (ext->offset > last_end && ext->offset > start + bytes_read) {
2871                 uint32_t read = (uint32_t)min(length, ext->offset - max(start, last_end));
2872 
2873                 RtlZeroMemory(data + bytes_read, read);
2874                 bytes_read += read;
2875                 length -= read;
2876             }
2877 
2878             if (length == 0 || ext->offset > start + bytes_read + length)
2879                 break;
2880 
2881             if (ed->encryption != BTRFS_ENCRYPTION_NONE) {
2882                 WARN("Encryption not supported\n");
2883                 Status = STATUS_NOT_IMPLEMENTED;
2884                 goto exit;
2885             }
2886 
2887             if (ed->encoding != BTRFS_ENCODING_NONE) {
2888                 WARN("Other encodings not supported\n");
2889                 Status = STATUS_NOT_IMPLEMENTED;
2890                 goto exit;
2891             }
2892 
2893             switch (ed->type) {
2894                 case EXTENT_TYPE_INLINE:
2895                 {
2896                     uint64_t off = start + bytes_read - ext->offset;
2897                     uint32_t read;
2898 
2899                     if (ed->compression == BTRFS_COMPRESSION_NONE) {
2900                         read = (uint32_t)min(min(len, ext->datalen) - off, length);
2901 
2902                         RtlCopyMemory(data + bytes_read, &ed->data[off], read);
2903                     } else if (ed->compression == BTRFS_COMPRESSION_ZLIB || ed->compression == BTRFS_COMPRESSION_LZO || ed->compression == BTRFS_COMPRESSION_ZSTD) {
2904                         uint8_t* decomp;
2905                         bool decomp_alloc;
2906                         uint16_t inlen = ext->datalen - (uint16_t)offsetof(EXTENT_DATA, data[0]);
2907 
2908                         if (ed->decoded_size == 0 || ed->decoded_size > 0xffffffff) {
2909                             ERR("ed->decoded_size was invalid (%I64x)\n", ed->decoded_size);
2910                             Status = STATUS_INTERNAL_ERROR;
2911                             goto exit;
2912                         }
2913 
2914                         read = (uint32_t)min(ed->decoded_size - off, length);
2915 
2916                         if (off > 0) {
2917                             decomp = ExAllocatePoolWithTag(NonPagedPool, (uint32_t)ed->decoded_size, ALLOC_TAG);
2918                             if (!decomp) {
2919                                 ERR("out of memory\n");
2920                                 Status = STATUS_INSUFFICIENT_RESOURCES;
2921                                 goto exit;
2922                             }
2923 
2924                             decomp_alloc = true;
2925                         } else {
2926                             decomp = data + bytes_read;
2927                             decomp_alloc = false;
2928                         }
2929 
2930                         if (ed->compression == BTRFS_COMPRESSION_ZLIB) {
2931                             Status = zlib_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2932                             if (!NT_SUCCESS(Status)) {
2933                                 ERR("zlib_decompress returned %08lx\n", Status);
2934                                 if (decomp_alloc) ExFreePool(decomp);
2935                                 goto exit;
2936                             }
2937                         } else if (ed->compression == BTRFS_COMPRESSION_LZO) {
2938                             if (inlen < sizeof(uint32_t)) {
2939                                 ERR("extent data was truncated\n");
2940                                 Status = STATUS_INTERNAL_ERROR;
2941                                 if (decomp_alloc) ExFreePool(decomp);
2942                                 goto exit;
2943                             } else
2944                                 inlen -= sizeof(uint32_t);
2945 
2946                             Status = lzo_decompress(ed->data + sizeof(uint32_t), inlen, decomp, (uint32_t)(read + off), sizeof(uint32_t));
2947                             if (!NT_SUCCESS(Status)) {
2948                                 ERR("lzo_decompress returned %08lx\n", Status);
2949                                 if (decomp_alloc) ExFreePool(decomp);
2950                                 goto exit;
2951                             }
2952                         } else if (ed->compression == BTRFS_COMPRESSION_ZSTD) {
2953                             Status = zstd_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2954                             if (!NT_SUCCESS(Status)) {
2955                                 ERR("zstd_decompress returned %08lx\n", Status);
2956                                 if (decomp_alloc) ExFreePool(decomp);
2957                                 goto exit;
2958                             }
2959                         }
2960 
2961                         if (decomp_alloc) {
2962                             RtlCopyMemory(data + bytes_read, decomp + off, read);
2963                             ExFreePool(decomp);
2964                         }
2965                     } else {
2966                         ERR("unhandled compression type %x\n", ed->compression);
2967                         Status = STATUS_NOT_IMPLEMENTED;
2968                         goto exit;
2969                     }
2970 
2971                     bytes_read += read;
2972                     length -= read;
2973 
2974                     break;
2975                 }
2976 
2977                 case EXTENT_TYPE_REGULAR:
2978                 {
2979                     read_part* rp;
2980 
2981                     rp = ExAllocatePoolWithTag(pool_type, sizeof(read_part), ALLOC_TAG);
2982                     if (!rp) {
2983                         ERR("out of memory\n");
2984                         Status = STATUS_INSUFFICIENT_RESOURCES;
2985                         goto exit;
2986                     }
2987 
2988                     rp->mdl = (Irp && Irp->MdlAddress) ? true : false;
2989                     rp->extents[0].off = start + bytes_read - ext->offset;
2990                     rp->bumpoff = 0;
2991                     rp->num_extents = 1;
2992                     rp->csum_free = false;
2993 
2994                     rp->read = (uint32_t)(len - rp->extents[0].off);
2995                     if (rp->read > length) rp->read = (uint32_t)length;
2996 
2997                     if (ed->compression == BTRFS_COMPRESSION_NONE) {
2998                         rp->addr = ed2->address + ed2->offset + rp->extents[0].off;
2999                         rp->to_read = (uint32_t)sector_align(rp->read, fcb->Vcb->superblock.sector_size);
3000 
3001                         if (rp->addr % fcb->Vcb->superblock.sector_size > 0) {
3002                             rp->bumpoff = rp->addr % fcb->Vcb->superblock.sector_size;
3003                             rp->addr -= rp->bumpoff;
3004                             rp->to_read = (uint32_t)sector_align(rp->read + rp->bumpoff, fcb->Vcb->superblock.sector_size);
3005                         }
3006                     } else {
3007                         rp->addr = ed2->address;
3008                         rp->to_read = (uint32_t)sector_align(ed2->size, fcb->Vcb->superblock.sector_size);
3009                     }
3010 
3011                     if (ed->compression == BTRFS_COMPRESSION_NONE && start % fcb->Vcb->superblock.sector_size == 0 &&
3012                         length % fcb->Vcb->superblock.sector_size == 0) {
3013                         rp->buf = data + bytes_read;
3014                         rp->buf_free = false;
3015                     } else {
3016                         rp->buf = ExAllocatePoolWithTag(pool_type, rp->to_read, ALLOC_TAG);
3017                         rp->buf_free = true;
3018 
3019                         if (!rp->buf) {
3020                             ERR("out of memory\n");
3021                             Status = STATUS_INSUFFICIENT_RESOURCES;
3022                             ExFreePool(rp);
3023                             goto exit;
3024                         }
3025 
3026                         rp->mdl = false;
3027                     }
3028 
3029                     rp->c = get_chunk_from_address(fcb->Vcb, rp->addr);
3030 
3031                     if (!rp->c) {
3032                         ERR("get_chunk_from_address(%I64x) failed\n", rp->addr);
3033 
3034                         if (rp->buf_free)
3035                             ExFreePool(rp->buf);
3036 
3037                         ExFreePool(rp);
3038 
                        Status = STATUS_INTERNAL_ERROR;
3039                         goto exit;
3040                     }
3041 
3042                     if (ext->csum) {
3043                         if (ed->compression == BTRFS_COMPRESSION_NONE) {
3044                             rp->csum = (uint8_t*)ext->csum + (fcb->Vcb->csum_size * (rp->extents[0].off / fcb->Vcb->superblock.sector_size));
3045                         } else
3046                             rp->csum = ext->csum;
3047                     } else
3048                         rp->csum = NULL;
3049 
3050                     rp->data = data + bytes_read;
3051                     rp->compression = ed->compression;
3052                     rp->extents[0].ed_offset = ed2->offset;
3053                     rp->extents[0].ed_size = ed2->size;
3054                     rp->extents[0].ed_num_bytes = ed2->num_bytes;
3055 
3056                     InsertTailList(&read_parts, &rp->list_entry);
3057 
3058                     bytes_read += rp->read;
3059                     length -= rp->read;
3060 
3061                     break;
3062                 }
3063 
3064                 case EXTENT_TYPE_PREALLOC:
3065                 {
3066                     uint64_t off = start + bytes_read - ext->offset;
3067                     uint32_t read = (uint32_t)(len - off);
3068 
3069                     if (read > length) read = (uint32_t)length;
3070 
3071                     RtlZeroMemory(data + bytes_read, read);
3072 
3073                     bytes_read += read;
3074                     length -= read;
3075 
3076                     break;
3077                 }
3078 
3079                 default:
3080                     WARN("Unsupported extent data type %u\n", ed->type);
3081                     Status = STATUS_NOT_IMPLEMENTED;
3082                     goto exit;
3083             }
3084 
3085             last_end = ext->offset + len;
3086 
3087             if (length == 0)
3088                 break;
3089         }
3090 
3091 nextitem:
3092         le = le->Flink;
3093     }
3094 
3095     if (!IsListEmpty(&read_parts) && read_parts.Flink->Flink != &read_parts) { // at least two entries in list
3096         read_part* last_rp = CONTAINING_RECORD(read_parts.Flink, read_part, list_entry);
3097 
3098         le = read_parts.Flink->Flink;
3099         while (le != &read_parts) {
3100             LIST_ENTRY* le2 = le->Flink;
3101             read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3102 
3103                 // merge together contiguous compressed runs: same compression and chunk, adjacent both on disk and in the output buffer
3104             if (rp->compression != BTRFS_COMPRESSION_NONE && rp->compression == last_rp->compression && rp->addr == last_rp->addr + last_rp->to_read &&
3105                 rp->data == (uint8_t*)last_rp->data + last_rp->read && rp->c == last_rp->c && ((rp->csum && last_rp->csum) || (!rp->csum && !last_rp->csum))) {
3106                 read_part* rp2;
3107 
3108                 rp2 = ExAllocatePoolWithTag(pool_type, offsetof(read_part, extents) + (sizeof(read_part_extent) * (last_rp->num_extents + 1)), ALLOC_TAG);
                if (!rp2) {
                    ERR("out of memory\n");
                    Status = STATUS_INSUFFICIENT_RESOURCES;
                    goto exit;
                }
3109 
3110                 rp2->addr = last_rp->addr;
3111                 rp2->c = last_rp->c;
3112                 rp2->read = last_rp->read + rp->read;
3113                 rp2->to_read = last_rp->to_read + rp->to_read;
3114                 rp2->csum_free = false;
3115 
3116                 if (last_rp->csum) {
3117                     uint32_t sectors = (last_rp->to_read + rp->to_read) / fcb->Vcb->superblock.sector_size;
3118 
3119                     rp2->csum = ExAllocatePoolWithTag(pool_type, sectors * fcb->Vcb->csum_size, ALLOC_TAG);
3120                     if (!rp2->csum) {
3121                         ERR("out of memory\n");
3122                         ExFreePool(rp2);
3123                         Status = STATUS_INSUFFICIENT_RESOURCES;
3124                         goto exit;
3125                     }
3126 
3127                     RtlCopyMemory(rp2->csum, last_rp->csum, last_rp->to_read * fcb->Vcb->csum_size / fcb->Vcb->superblock.sector_size);
3128                     RtlCopyMemory((uint8_t*)rp2->csum + (last_rp->to_read * fcb->Vcb->csum_size / fcb->Vcb->superblock.sector_size), rp->csum,
3129                                   rp->to_read * fcb->Vcb->csum_size / fcb->Vcb->superblock.sector_size);
3130 
3131                     rp2->csum_free = true;
3132                 } else
3133                     rp2->csum = NULL;
3134 
3135                 rp2->buf = ExAllocatePoolWithTag(pool_type, rp2->to_read, ALLOC_TAG);
3136                 if (!rp2->buf) {
3137                     ERR("out of memory\n");
3138 
3139                     if (rp2->csum)
3140                         ExFreePool(rp2->csum);
3141 
3142                     ExFreePool(rp2);
3143                     Status = STATUS_INSUFFICIENT_RESOURCES;
3144                     goto exit;
3145                 }
3146 
3147                 rp2->buf_free = true;
3148                 rp2->bumpoff = 0;
3149                 rp2->mdl = false;
3150                 rp2->data = last_rp->data;
3151                 rp2->compression = last_rp->compression;
3152                 rp2->num_extents = last_rp->num_extents + 1;
3153 
3154                 RtlCopyMemory(rp2->extents, last_rp->extents, last_rp->num_extents * sizeof(read_part_extent));
3155                 RtlCopyMemory(&rp2->extents[last_rp->num_extents], rp->extents, sizeof(read_part_extent));
3156 
3157                 InsertHeadList(le->Blink, &rp2->list_entry);
3158 
3159                 if (rp->buf_free)
3160                     ExFreePool(rp->buf);
3161 
3162                 if (rp->csum_free)
3163                     ExFreePool(rp->csum);
3164 
3165                 RemoveEntryList(&rp->list_entry);
3166 
3167                 ExFreePool(rp);
3168 
3169                 if (last_rp->buf_free)
3170                     ExFreePool(last_rp->buf);
3171 
3172                 if (last_rp->csum_free)
3173                     ExFreePool(last_rp->csum);
3174 
3175                 RemoveEntryList(&last_rp->list_entry);
3176 
3177                 ExFreePool(last_rp);
3178 
3179                 last_rp = rp2;
3180             } else
3181                 last_rp = rp;
3182 
3183             le = le2;
3184         }
3185     }
3186 
3187     le = read_parts.Flink;
3188     while (le != &read_parts) {
3189         read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3190 
3191         Status = read_data(fcb->Vcb, rp->addr, rp->to_read, rp->csum, false, rp->buf, rp->c, NULL, Irp, 0, rp->mdl,
3192                            fcb && fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3193         if (!NT_SUCCESS(Status)) {
3194             ERR("read_data returned %08lx\n", Status);
3195             goto exit;
3196         }
3197 
3198         if (rp->compression == BTRFS_COMPRESSION_NONE) {
3199             if (rp->buf_free)
3200                 RtlCopyMemory(rp->data, rp->buf + rp->bumpoff, rp->read);
3201         } else {
3202             uint8_t* buf = rp->buf;
3203 #ifdef __REACTOS__
3204             unsigned int i;
3205             for (i = 0; i < rp->num_extents; i++) {
3206 #else
3207             for (unsigned int i = 0; i < rp->num_extents; i++) {
3208 #endif // __REACTOS__
3209                 uint8_t *decomp = NULL, *buf2;
3210                 ULONG outlen, inlen, off2;
3211                 uint32_t inpageoff = 0;
3212                 comp_calc_job* ccj;
3213 
3214                 off2 = (ULONG)(rp->extents[i].ed_offset + rp->extents[i].off);
3215                 buf2 = buf;
3216                 inlen = (ULONG)rp->extents[i].ed_size;
3217 
3218                 if (rp->compression == BTRFS_COMPRESSION_LZO) {
3219                     ULONG inoff = sizeof(uint32_t);
3220 
3221                     inlen -= sizeof(uint32_t);
3222 
3223                     // If reading a few sectors in, skip to the interesting bit
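                    // btrfs LZO data is framed in 4KiB segments, each preceded by a
                    // little-endian uint32 length, and a length word never straddles a
                    // page boundary - hence the rounding-up of inoff below.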
3224                     while (off2 > LZO_PAGE_SIZE) {
3225                         uint32_t partlen;
3226 
3227                         if (inlen < sizeof(uint32_t))
3228                             break;
3229 
3230                         partlen = *(uint32_t*)(buf2 + inoff);
3231 
3232                         if (partlen < inlen) {
3233                             off2 -= LZO_PAGE_SIZE;
3234                             inoff += partlen + sizeof(uint32_t);
3235                             inlen -= partlen + sizeof(uint32_t);
3236 
3237                             if (LZO_PAGE_SIZE - (inoff % LZO_PAGE_SIZE) < sizeof(uint32_t))
3238                                 inoff = ((inoff / LZO_PAGE_SIZE) + 1) * LZO_PAGE_SIZE;
3239                         } else
3240                             break;
3241                     }
3242 
3243                     buf2 = &buf2[inoff];
3244                     inpageoff = inoff % LZO_PAGE_SIZE;
3245                 }
3246 
3247                 if (off2 != 0) {
3248                     outlen = off2 + min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3249 
3250                     decomp = ExAllocatePoolWithTag(pool_type, outlen, ALLOC_TAG);
3251                     if (!decomp) {
3252                         ERR("out of memory\n");
3253                         Status = STATUS_INSUFFICIENT_RESOURCES;
3254                         goto exit;
3255                     }
3256                 } else
3257                     outlen = min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3258 
3259                 ccj = (comp_calc_job*)ExAllocatePoolWithTag(pool_type, sizeof(comp_calc_job), ALLOC_TAG);
3260                 if (!ccj) {
3261                     ERR("out of memory\n");
3262 
3263                     if (decomp)
3264                         ExFreePool(decomp);
3265 
3266                     Status = STATUS_INSUFFICIENT_RESOURCES;
3267                     goto exit;
3268                 }
3269 
3270                 Status = add_calc_job_decomp(fcb->Vcb, rp->compression, buf2, inlen, decomp ? decomp : rp->data, outlen,
3271                                              inpageoff, &ccj->cj);
3272                 if (!NT_SUCCESS(Status)) {
3273                     ERR("add_calc_job_decomp returned %08lx\n", Status);
3274 
3275                     if (decomp)
3276                         ExFreePool(decomp);
3277 
3278                     ExFreePool(ccj);
3279 
3280                     goto exit;
3281                 }
3282 
3283                 ccj->data = rp->data;
3284                 ccj->decomp = decomp;
3285 
3286                 ccj->offset = off2;
3287                 ccj->length = (size_t)min(rp->read, rp->extents[i].ed_num_bytes - rp->extents[i].off);
3288 
3289                 InsertTailList(&calc_jobs, &ccj->list_entry);
3290 
3291                 buf += rp->extents[i].ed_size;
3292                 rp->data = (uint8_t*)rp->data + rp->extents[i].ed_num_bytes - rp->extents[i].off;
3293                 rp->read -= (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off);
3294             }
3295         }
3296 
3297         le = le->Flink;
3298     }
3299 
3300     if (length > 0 && start + bytes_read < fcb->inode_item.st_size) {
3301         uint32_t read = (uint32_t)min(fcb->inode_item.st_size - start - bytes_read, length);
3302 
3303         RtlZeroMemory(data + bytes_read, read);
3304 
3305         bytes_read += read;
3306         length -= read;
3307     }
3308 
3309     Status = STATUS_SUCCESS;
3310 
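    // Run the queued decompression jobs and wait for each; where a bounce buffer was
    // needed (off2 != 0 above) the requested window is copied back into the caller's
    // buffer.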
3311     while (!IsListEmpty(&calc_jobs)) {
3312         comp_calc_job* ccj = CONTAINING_RECORD(RemoveTailList(&calc_jobs), comp_calc_job, list_entry);
3313 
3314         calc_thread_main(fcb->Vcb, ccj->cj);
3315 
3316         KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3317 
3318         if (!NT_SUCCESS(ccj->cj->Status))
3319             Status = ccj->cj->Status;
3320 
3321         if (ccj->decomp) {
3322             RtlCopyMemory(ccj->data, (uint8_t*)ccj->decomp + ccj->offset, ccj->length);
3323             ExFreePool(ccj->decomp);
3324         }
3325 
3326         ExFreePool(ccj);
3327     }
3328 
3329     if (pbr)
3330         *pbr = bytes_read;
3331 
3332 exit:
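    // Success and failure paths converge here: free any read_parts still queued, and
    // wait for any outstanding calc jobs before freeing their buffers.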
3333     while (!IsListEmpty(&read_parts)) {
3334         read_part* rp = CONTAINING_RECORD(RemoveHeadList(&read_parts), read_part, list_entry);
3335 
3336         if (rp->buf_free)
3337             ExFreePool(rp->buf);
3338 
3339         if (rp->csum_free)
3340             ExFreePool(rp->csum);
3341 
3342         ExFreePool(rp);
3343     }
3344 
3345     while (!IsListEmpty(&calc_jobs)) {
3346         comp_calc_job* ccj = CONTAINING_RECORD(RemoveHeadList(&calc_jobs), comp_calc_job, list_entry);
3347 
3348         KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3349 
3350         if (ccj->decomp)
3351             ExFreePool(ccj->decomp);
3352 
3353         ExFreePool(ccj->cj);
3354 
3355         ExFreePool(ccj);
3356     }
3357 
3358     return Status;
3359 }
3360 
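// do_read either hands the request to the cache manager (CcCopyRead/CcCopyReadEx or
// CcMdlRead) or, for non-cached I/O, calls read_stream/read_file directly; the part of
// the range beyond ValidDataLength is zero-filled rather than read from disk.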
3361 NTSTATUS do_read(PIRP Irp, bool wait, ULONG* bytes_read) {
3362     PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3363     PFILE_OBJECT FileObject = IrpSp->FileObject;
3364     fcb* fcb = FileObject->FsContext;
3365     uint8_t* data = NULL;
3366     ULONG length = IrpSp->Parameters.Read.Length, addon = 0;
3367     uint64_t start = IrpSp->Parameters.Read.ByteOffset.QuadPart;
3368 
3369     *bytes_read = 0;
3370 
3371     if (!fcb || !fcb->Vcb || !fcb->subvol)
3372         return STATUS_INTERNAL_ERROR;
3373 
3374     TRACE("fcb = %p\n", fcb);
3375     TRACE("offset = %I64x, length = %lx\n", start, length);
3376     TRACE("paging_io = %s, no cache = %s\n", Irp->Flags & IRP_PAGING_IO ? "true" : "false", Irp->Flags & IRP_NOCACHE ? "true" : "false");
3377 
3378     if (!fcb->ads && fcb->type == BTRFS_TYPE_DIRECTORY)
3379         return STATUS_INVALID_DEVICE_REQUEST;
3380 
3381     if (!(Irp->Flags & IRP_PAGING_IO) && !FsRtlCheckLockForReadAccess(&fcb->lock, Irp)) {
3382         WARN("tried to read locked region\n");
3383         return STATUS_FILE_LOCK_CONFLICT;
3384     }
3385 
3386     if (length == 0) {
3387         TRACE("tried to read zero bytes\n");
3388         return STATUS_SUCCESS;
3389     }
3390 
3391     if (start >= (uint64_t)fcb->Header.FileSize.QuadPart) {
3392         TRACE("tried to read with offset after file end (%I64x >= %I64x)\n", start, fcb->Header.FileSize.QuadPart);
3393         return STATUS_END_OF_FILE;
3394     }
3395 
3396     TRACE("FileObject %p fcb %p FileSize = %I64x st_size = %I64x (%p)\n", FileObject, fcb, fcb->Header.FileSize.QuadPart, fcb->inode_item.st_size, &fcb->inode_item.st_size);
3397 
3398     if (Irp->Flags & IRP_NOCACHE || !(IrpSp->MinorFunction & IRP_MN_MDL)) {
3399         data = map_user_buffer(Irp, fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3400 
3401         if (Irp->MdlAddress && !data) {
3402             ERR("MmGetSystemAddressForMdlSafe returned NULL\n");
3403             return STATUS_INSUFFICIENT_RESOURCES;
3404         }
3405 
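        // Anything at or beyond ValidDataLength is zero-filled instead of being read
        // from disk; addon records how many zeroed bytes to add back to the byte count
        // once the shortened read completes.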
3406         if (start >= (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3407             length = (ULONG)min(length, min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3408             RtlZeroMemory(data, length);
3409             Irp->IoStatus.Information = *bytes_read = length;
3410             return STATUS_SUCCESS;
3411         }
3412 
3413         if (length + start > (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3414             addon = (ULONG)(min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3415             RtlZeroMemory(data + (fcb->Header.ValidDataLength.QuadPart - start), addon);
3416             length = (ULONG)(fcb->Header.ValidDataLength.QuadPart - start);
3417         }
3418     }
3419 
3420     if (!(Irp->Flags & IRP_NOCACHE)) {
3421         NTSTATUS Status = STATUS_SUCCESS;
3422 
3423         _SEH2_TRY {
3424             if (!FileObject->PrivateCacheMap) {
3425                 CC_FILE_SIZES ccfs;
3426 
3427                 ccfs.AllocationSize = fcb->Header.AllocationSize;
3428                 ccfs.FileSize = fcb->Header.FileSize;
3429                 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
3430 
3431                 init_file_cache(FileObject, &ccfs);
3432             }
3433 
3434             if (IrpSp->MinorFunction & IRP_MN_MDL) {
3435                 CcMdlRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, &Irp->MdlAddress, &Irp->IoStatus);
3436             } else {
3437                 if (fCcCopyReadEx) {
3438                     TRACE("CcCopyReadEx(%p, %I64x, %lx, %u, %p, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart,
3439                           length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread);
3440                     TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3441                     if (!fCcCopyReadEx(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread)) {
3442                         TRACE("CcCopyReadEx could not wait\n");
3443 
3444                         IoMarkIrpPending(Irp);
3445                         return STATUS_PENDING;
3446                     }
3447                     TRACE("CcCopyReadEx finished\n");
3448                 } else {
3449                     TRACE("CcCopyRead(%p, %I64x, %lx, %u, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart, length, wait, data, &Irp->IoStatus);
3450                     TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3451                     if (!CcCopyRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus)) {
3452                         TRACE("CcCopyRead could not wait\n");
3453 
3454                         IoMarkIrpPending(Irp);
3455                         return STATUS_PENDING;
3456                     }
3457                     TRACE("CcCopyRead finished\n");
3458                 }
3459             }
3460         } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
3461             Status = _SEH2_GetExceptionCode();
3462         } _SEH2_END;
3463 
3464         if (NT_SUCCESS(Status)) {
3465             Status = Irp->IoStatus.Status;
3466             Irp->IoStatus.Information += addon;
3467             *bytes_read = (ULONG)Irp->IoStatus.Information;
3468         } else
3469             ERR("EXCEPTION - %08lx\n", Status);
3470 
3471         return Status;
3472     } else {
3473         NTSTATUS Status;
3474 
3475         if (!wait) {
3476             IoMarkIrpPending(Irp);
3477             return STATUS_PENDING;
3478         }
3479 
3480         if (fcb->ads) {
3481             Status = read_stream(fcb, data, start, length, bytes_read);
3482 
3483             if (!NT_SUCCESS(Status))
3484                 ERR("read_stream returned %08lx\n", Status);
3485         } else {
3486             Status = read_file(fcb, data, start, length, bytes_read, Irp);
3487 
3488             if (!NT_SUCCESS(Status))
3489                 ERR("read_file returned %08lx\n", Status);
3490         }
3491 
3492         *bytes_read += addon;
3493         TRACE("read %lu bytes\n", *bytes_read);
3494 
3495         Irp->IoStatus.Information = *bytes_read;
3496 
3497         if (diskacc && Status != STATUS_PENDING) {
3498             PETHREAD thread = NULL;
3499 
3500             if (Irp->Tail.Overlay.Thread && !IoIsSystemThread(Irp->Tail.Overlay.Thread))
3501                 thread = Irp->Tail.Overlay.Thread;
3502             else if (!IoIsSystemThread(PsGetCurrentThread()))
3503                 thread = PsGetCurrentThread();
3504             else if (IoIsSystemThread(PsGetCurrentThread()) && IoGetTopLevelIrp() == Irp)
3505                 thread = PsGetCurrentThread();
3506 
3507             if (thread)
3508                 fPsUpdateDiskCounters(PsGetThreadProcess(thread), *bytes_read, 0, 1, 0, 0);
3509         }
3510 
3511         return Status;
3512     }
3513 }
3514 
3515 _Dispatch_type_(IRP_MJ_READ)
3516 _Function_class_(DRIVER_DISPATCH)
3517 NTSTATUS __stdcall drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
3518     device_extension* Vcb = DeviceObject->DeviceExtension;
3519     PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3520     PFILE_OBJECT FileObject = IrpSp->FileObject;
3521     ULONG bytes_read = 0;
3522     NTSTATUS Status;
3523     bool top_level;
3524     fcb* fcb;
3525     ccb* ccb;
3526     bool acquired_fcb_lock = false, wait;
3527 
3528     FsRtlEnterFileSystem();
3529 
3530     top_level = is_top_level(Irp);
3531 
3532     TRACE("read\n");
3533 
3534     if (Vcb && Vcb->type == VCB_TYPE_VOLUME) {
3535         Status = vol_read(DeviceObject, Irp);
3536         goto exit2;
3537     } else if (!Vcb || Vcb->type != VCB_TYPE_FS) {
3538         Status = STATUS_INVALID_PARAMETER;
3539         goto end;
3540     }
3541 
3542     Irp->IoStatus.Information = 0;
3543 
3544     if (IrpSp->MinorFunction & IRP_MN_COMPLETE) {
3545         CcMdlReadComplete(IrpSp->FileObject, Irp->MdlAddress);
3546 
3547         Irp->MdlAddress = NULL;
3548         Status = STATUS_SUCCESS;
3549 
3550         goto exit;
3551     }
3552 
3553     fcb = FileObject->FsContext;
3554 
3555     if (!fcb) {
3556         ERR("fcb was NULL\n");
3557         Status = STATUS_INVALID_PARAMETER;
3558         goto exit;
3559     }
3560 
3561     ccb = FileObject->FsContext2;
3562 
3563     if (!ccb) {
3564         ERR("ccb was NULL\n");
3565         Status = STATUS_INVALID_PARAMETER;
3566         goto exit;
3567     }
3568 
3569     if (Irp->RequestorMode == UserMode && !(ccb->access & FILE_READ_DATA)) {
3570         WARN("insufficient privileges\n");
3571         Status = STATUS_ACCESS_DENIED;
3572         goto exit;
3573     }
3574 
3575     if (fcb == Vcb->volume_fcb) {
3576         TRACE("reading volume FCB\n");
3577 
3578         IoSkipCurrentIrpStackLocation(Irp);
3579 
3580         Status = IoCallDriver(Vcb->Vpb->RealDevice, Irp);
3581 
3582         goto exit2;
3583     }
3584 
3585     if (!(Irp->Flags & IRP_PAGING_IO))
3586         FsRtlCheckOplock(fcb_oplock(fcb), Irp, NULL, NULL, NULL);
3587 
3588     wait = IoIsOperationSynchronous(Irp);
3589 
3590     // Don't offload jobs when doing paging IO - otherwise this can lead to
3591     // deadlocks in CcCopyRead.
3592     if (Irp->Flags & IRP_PAGING_IO)
3593         wait = true;
3594 
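    // Flush the data section first so the read below stays coherent with data written
    // through the cache or a file mapping.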
3595     if (!(Irp->Flags & IRP_PAGING_IO) && FileObject->SectionObjectPointer && FileObject->SectionObjectPointer->DataSectionObject) {
3596         IO_STATUS_BLOCK iosb;
3597 
3598         CcFlushCache(FileObject->SectionObjectPointer, &IrpSp->Parameters.Read.ByteOffset, IrpSp->Parameters.Read.Length, &iosb);
3599         if (!NT_SUCCESS(iosb.Status)) {
3600             ERR("CcFlushCache returned %08lx\n", iosb.Status);
3601             Status = iosb.Status;
            goto end;
3602         }
3603     }
3604 
3605     if (!ExIsResourceAcquiredSharedLite(fcb->Header.Resource)) {
3606         if (!ExAcquireResourceSharedLite(fcb->Header.Resource, wait)) {
3607             Status = STATUS_PENDING;
3608             IoMarkIrpPending(Irp);
3609             goto exit;
3610         }
3611 
3612         acquired_fcb_lock = true;
3613     }
3614 
3615     Status = do_read(Irp, wait, &bytes_read);
3616 
3617     if (acquired_fcb_lock)
3618         ExReleaseResourceLite(fcb->Header.Resource);
3619 
3620 exit:
3621     if (FileObject->Flags & FO_SYNCHRONOUS_IO && !(Irp->Flags & IRP_PAGING_IO))
3622         FileObject->CurrentByteOffset.QuadPart = IrpSp->Parameters.Read.ByteOffset.QuadPart + (NT_SUCCESS(Status) ? bytes_read : 0);
3623 
3624 end:
3625     Irp->IoStatus.Status = Status;
3626 
3627     TRACE("Irp->IoStatus.Status = %08lx\n", Irp->IoStatus.Status);
3628     TRACE("Irp->IoStatus.Information = %Iu\n", Irp->IoStatus.Information);
3629     TRACE("returning %08lx\n", Status);
3630 
3631     if (Status != STATUS_PENDING)
3632         IoCompleteRequest(Irp, IO_NO_INCREMENT);
3633     else {
3634         if (!add_thread_job(Vcb, Irp))
3635             Status = do_read_job(Irp);
3636     }
3637 
3638 exit2:
3639     if (top_level)
3640         IoSetTopLevelIrp(NULL);
3641 
3642     FsRtlExitFileSystem();
3643 
3644     return Status;
3645 }
3646