1 /* Copyright (c) Mark Harmstone 2016-17
2 *
3 * This file is part of WinBtrfs.
4 *
5 * WinBtrfs is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public Licence as published by
7 * the Free Software Foundation, either version 3 of the Licence, or
8 * (at your option) any later version.
9 *
10 * WinBtrfs is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public Licence for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public Licence
16 * along with WinBtrfs. If not, see <http://www.gnu.org/licenses/>. */
17
18 #include "btrfs_drv.h"
19 #include "xxhash.h"
20 #include "crc32c.h"
21
/* Per-stripe state of a read request. read_data_completion() moves a stripe
 * to Success or Error depending on the status of the completed IRP. */
enum read_data_status {
    ReadDataStatus_Pending = 0,     // first enumerator; presumably the state before completion - confirm
    ReadDataStatus_Success,         // IRP completed with an NT_SUCCESS status
    ReadDataStatus_Error,           // IRP completed with a failure status
    ReadDataStatus_MissingDevice,   // NOTE(review): not set in the visible code; presumably no device backs the stripe
    ReadDataStatus_Skip             // NOTE(review): not set in the visible code; presumably the stripe was not read
};
29
30 struct read_data_context;
31
32 typedef struct {
33 struct read_data_context* context;
34 uint16_t stripenum;
35 bool rewrite;
36 PIRP Irp;
37 IO_STATUS_BLOCK iosb;
38 enum read_data_status status;
39 PMDL mdl;
40 uint64_t stripestart;
41 uint64_t stripeend;
42 } read_data_stripe;
43
44 typedef struct {
45 KEVENT Event;
46 NTSTATUS Status;
47 chunk* c;
48 uint64_t address;
49 uint32_t buflen;
50 LONG num_stripes, stripes_left;
51 uint64_t type;
52 uint32_t sector_size;
53 uint16_t firstoff, startoffstripe, sectors_per_stripe;
54 void* csum;
55 bool tree;
56 read_data_stripe* stripes;
57 uint8_t* va;
58 } read_data_context;
59
// Symbols defined in another translation unit.
// NOTE(review): judging by their t* typedef names the f* symbols are function pointers - confirm.
extern bool diskacc;
extern tPsUpdateDiskCounters fPsUpdateDiskCounters;
extern tCcCopyReadEx fCcCopyReadEx;
extern tFsRtlUpdateDiskCounters fFsRtlUpdateDiskCounters;

// NOTE(review): not referenced in the visible part of the file; presumably the block size used for LZO-compressed data - confirm.
#define LZO_PAGE_SIZE 4096
66
_Function_class_(IO_COMPLETION_ROUTINE)67 _Function_class_(IO_COMPLETION_ROUTINE)
68 static NTSTATUS __stdcall read_data_completion(PDEVICE_OBJECT DeviceObject, PIRP Irp, PVOID conptr) {
69 read_data_stripe* stripe = conptr;
70 read_data_context* context = (read_data_context*)stripe->context;
71
72 UNUSED(DeviceObject);
73
74 stripe->iosb = Irp->IoStatus;
75
76 if (NT_SUCCESS(Irp->IoStatus.Status))
77 stripe->status = ReadDataStatus_Success;
78 else
79 stripe->status = ReadDataStatus_Error;
80
81 if (InterlockedDecrement(&context->stripes_left) == 0)
82 KeSetEvent(&context->Event, 0, false);
83
84 return STATUS_MORE_PROCESSING_REQUIRED;
85 }
86
check_csum(device_extension * Vcb,uint8_t * data,uint32_t sectors,void * csum)87 NTSTATUS check_csum(device_extension* Vcb, uint8_t* data, uint32_t sectors, void* csum) {
88 void* csum2;
89
90 csum2 = ExAllocatePoolWithTag(PagedPool, Vcb->csum_size * sectors, ALLOC_TAG);
91 if (!csum2) {
92 ERR("out of memory\n");
93 return STATUS_INSUFFICIENT_RESOURCES;
94 }
95
96 do_calc_job(Vcb, data, sectors, csum2);
97
98 if (RtlCompareMemory(csum2, csum, sectors * Vcb->csum_size) != sectors * Vcb->csum_size) {
99 ExFreePool(csum2);
100 return STATUS_CRC_ERROR;
101 }
102
103 ExFreePool(csum2);
104
105 return STATUS_SUCCESS;
106 }
107
/* Computes the checksum of a tree node and stores it in `csum`.
 *
 * The hashed region starts at th->fs_uuid and is node_size minus the size of
 * the header's csum field long. The algorithm is chosen by the superblock's
 * csum_type; `csum` must be large enough for that algorithm's digest.
 * Nothing is written for an unrecognised csum_type. */
void get_tree_checksum(device_extension* Vcb, tree_header* th, void* csum) {
    uint8_t* payload = (uint8_t*)&th->fs_uuid;
    const size_t payload_len = Vcb->superblock.node_size - sizeof(th->csum);

    switch (Vcb->superblock.csum_type) {
        case CSUM_TYPE_CRC32C:
            *(uint32_t*)csum = ~calc_crc32c(0xffffffff, payload, payload_len);
            break;

        case CSUM_TYPE_XXHASH:
            *(uint64_t*)csum = XXH64(payload, payload_len, 0);
            break;

        case CSUM_TYPE_SHA256:
            calc_sha256(csum, payload, payload_len);
            break;

        case CSUM_TYPE_BLAKE2:
            blake2b(csum, BLAKE2_HASH_SIZE, payload, payload_len);
            break;
    }
}
127
/* Checks the checksum stored in a tree node's header.
 *
 * The checksum is recomputed over the region that starts at th->fs_uuid and is
 * node_size minus the size of the csum field long, using the algorithm named
 * by the superblock's csum_type, and compared with th->csum.
 *
 * Returns true when the stored checksum matches; false on a mismatch (after
 * logging a warning) or when csum_type is not recognised. */
bool check_tree_checksum(device_extension* Vcb, tree_header* th) {
    uint8_t* payload = (uint8_t*)&th->fs_uuid;
    const size_t payload_len = Vcb->superblock.node_size - sizeof(th->csum);

    switch (Vcb->superblock.csum_type) {
        case CSUM_TYPE_CRC32C: {
            uint32_t crc32 = ~calc_crc32c(0xffffffff, payload, payload_len);

            if (crc32 == *((uint32_t*)th->csum))
                return true;

            WARN("hash was %08x, expected %08x\n", crc32, *((uint32_t*)th->csum));

            break;
        }

        case CSUM_TYPE_XXHASH: {
            uint64_t hash = XXH64(payload, payload_len, 0);

            if (hash == *((uint64_t*)th->csum))
                return true;

            WARN("hash was %I64x, expected %I64x\n", hash, *((uint64_t*)th->csum));

            break;
        }

        case CSUM_TYPE_SHA256: {
            uint8_t hash[SHA256_HASH_SIZE];

            calc_sha256(hash, payload, payload_len);

            // Compare with the csum field itself rather than relying on it being the first member of the header.
            if (RtlCompareMemory(hash, th->csum, SHA256_HASH_SIZE) == SHA256_HASH_SIZE)
                return true;

            WARN("hash was invalid\n");

            break;
        }

        case CSUM_TYPE_BLAKE2: {
            uint8_t hash[BLAKE2_HASH_SIZE];

            blake2b(hash, sizeof(hash), payload, payload_len);

            // Same as above: address the csum field explicitly.
            if (RtlCompareMemory(hash, th->csum, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE)
                return true;

            WARN("hash was invalid\n");

            break;
        }
    }

    return false;
}
181
/* Computes the checksum of one sector (superblock.sector_size bytes at `buf`)
 * and stores it in `csum`, using the algorithm named by the superblock's
 * csum_type. `csum` must be large enough for that algorithm's digest.
 * Nothing is written for an unrecognised csum_type. */
void get_sector_csum(device_extension* Vcb, void* buf, void* csum) {
    switch (Vcb->superblock.csum_type) {
        case CSUM_TYPE_CRC32C: {
            uint32_t* out32 = (uint32_t*)csum;

            *out32 = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);
            break;
        }

        case CSUM_TYPE_XXHASH: {
            uint64_t* out64 = (uint64_t*)csum;

            *out64 = XXH64(buf, Vcb->superblock.sector_size, 0);
            break;
        }

        case CSUM_TYPE_SHA256:
            calc_sha256(csum, buf, Vcb->superblock.sector_size);
            break;

        case CSUM_TYPE_BLAKE2:
            blake2b(csum, BLAKE2_HASH_SIZE, buf, Vcb->superblock.sector_size);
            break;
    }
}
201
/* Checks one sector (superblock.sector_size bytes at `buf`) against the
 * expected checksum at `csum`, using the algorithm named by the superblock's
 * csum_type.
 *
 * Returns true on a match; false on a mismatch or when csum_type is not
 * recognised. */
bool check_sector_csum(device_extension* Vcb, void* buf, void* csum) {
    bool matches = false;

    switch (Vcb->superblock.csum_type) {
        case CSUM_TYPE_CRC32C: {
            uint32_t crc32 = ~calc_crc32c(0xffffffff, buf, Vcb->superblock.sector_size);

            matches = *(uint32_t*)csum == crc32;
            break;
        }

        case CSUM_TYPE_XXHASH: {
            uint64_t hash = XXH64(buf, Vcb->superblock.sector_size, 0);

            matches = *(uint64_t*)csum == hash;
            break;
        }

        case CSUM_TYPE_SHA256: {
            uint8_t hash[SHA256_HASH_SIZE];

            calc_sha256(hash, buf, Vcb->superblock.sector_size);

            matches = RtlCompareMemory(hash, csum, SHA256_HASH_SIZE) == SHA256_HASH_SIZE;
            break;
        }

        case CSUM_TYPE_BLAKE2: {
            uint8_t hash[BLAKE2_HASH_SIZE];

            blake2b(hash, sizeof(hash), buf, Vcb->superblock.sector_size);

            matches = RtlCompareMemory(hash, csum, BLAKE2_HASH_SIZE) == BLAKE2_HASH_SIZE;
            break;
        }
    }

    return matches;
}
235
read_data_dup(device_extension * Vcb,uint8_t * buf,uint64_t addr,read_data_context * context,CHUNK_ITEM * ci,device ** devices,uint64_t generation)236 static NTSTATUS read_data_dup(device_extension* Vcb, uint8_t* buf, uint64_t addr, read_data_context* context, CHUNK_ITEM* ci,
237 device** devices, uint64_t generation) {
238 bool checksum_error = false;
239 uint16_t j, stripe = 0;
240 NTSTATUS Status;
241 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
242
243 for (j = 0; j < ci->num_stripes; j++) {
244 if (context->stripes[j].status == ReadDataStatus_Error) {
245 WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
246 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
247 return context->stripes[j].iosb.Status;
248 } else if (context->stripes[j].status == ReadDataStatus_Success) {
249 stripe = j;
250 break;
251 }
252 }
253
254 if (context->stripes[stripe].status != ReadDataStatus_Success)
255 return STATUS_INTERNAL_ERROR;
256
257 if (context->tree) {
258 tree_header* th = (tree_header*)buf;
259
260 if (th->address != context->address || !check_tree_checksum(Vcb, th)) {
261 checksum_error = true;
262 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
263 } else if (generation != 0 && th->generation != generation) {
264 checksum_error = true;
265 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
266 }
267 } else if (context->csum) {
268 Status = check_csum(Vcb, buf, (ULONG)context->stripes[stripe].Irp->IoStatus.Information / context->sector_size, context->csum);
269
270 if (Status == STATUS_CRC_ERROR) {
271 checksum_error = true;
272 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
273 } else if (!NT_SUCCESS(Status)) {
274 ERR("check_csum returned %08lx\n", Status);
275 return Status;
276 }
277 }
278
279 if (!checksum_error)
280 return STATUS_SUCCESS;
281
282 if (ci->num_stripes == 1)
283 return STATUS_CRC_ERROR;
284
285 if (context->tree) {
286 tree_header* t2;
287 bool recovered = false;
288
289 t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
290 if (!t2) {
291 ERR("out of memory\n");
292 return STATUS_INSUFFICIENT_RESOURCES;
293 }
294
295 for (j = 0; j < ci->num_stripes; j++) {
296 if (j != stripe && devices[j] && devices[j]->devobj) {
297 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + context->stripes[stripe].stripestart,
298 Vcb->superblock.node_size, (uint8_t*)t2, false);
299 if (!NT_SUCCESS(Status)) {
300 WARN("sync_read_phys returned %08lx\n", Status);
301 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
302 } else {
303 bool checksum_error = !check_tree_checksum(Vcb, t2);
304
305 if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
306 RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
307 ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
308 recovered = true;
309
310 if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
311 Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + context->stripes[stripe].stripestart,
312 t2, Vcb->superblock.node_size);
313 if (!NT_SUCCESS(Status)) {
314 WARN("write_data_phys returned %08lx\n", Status);
315 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
316 }
317 }
318
319 break;
320 } else if (t2->address != addr || checksum_error)
321 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
322 else
323 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_GENERATION_ERRORS);
324 }
325 }
326 }
327
328 if (!recovered) {
329 ERR("unrecoverable checksum error at %I64x\n", addr);
330 ExFreePool(t2);
331 return STATUS_CRC_ERROR;
332 }
333
334 ExFreePool(t2);
335 } else {
336 ULONG sectors = (ULONG)context->stripes[stripe].Irp->IoStatus.Information >> Vcb->sector_shift;
337 uint8_t* sector;
338 void* ptr = context->csum;
339
340 sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
341 if (!sector) {
342 ERR("out of memory\n");
343 return STATUS_INSUFFICIENT_RESOURCES;
344 }
345
346 for (ULONG i = 0; i < sectors; i++) {
347 if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
348 bool recovered = false;
349
350 for (j = 0; j < ci->num_stripes; j++) {
351 if (j != stripe && devices[j] && devices[j]->devobj) {
352 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj,
353 cis[j].offset + context->stripes[stripe].stripestart + ((uint64_t)i << Vcb->sector_shift),
354 Vcb->superblock.sector_size, sector, false);
355 if (!NT_SUCCESS(Status)) {
356 WARN("sync_read_phys returned %08lx\n", Status);
357 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
358 } else {
359 if (check_sector_csum(Vcb, sector, ptr)) {
360 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
361 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe]->devitem.dev_id);
362 recovered = true;
363
364 if (!Vcb->readonly && !devices[stripe]->readonly) { // write good data over bad
365 Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj,
366 cis[stripe].offset + context->stripes[stripe].stripestart + ((uint64_t)i << Vcb->sector_shift),
367 sector, Vcb->superblock.sector_size);
368 if (!NT_SUCCESS(Status)) {
369 WARN("write_data_phys returned %08lx\n", Status);
370 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
371 }
372 }
373
374 break;
375 } else
376 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
377 }
378 }
379 }
380
381 if (!recovered) {
382 ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
383 ExFreePool(sector);
384 return STATUS_CRC_ERROR;
385 }
386 }
387
388 ptr = (uint8_t*)ptr + Vcb->csum_size;
389 }
390
391 ExFreePool(sector);
392 }
393
394 return STATUS_SUCCESS;
395 }
396
read_data_raid0(device_extension * Vcb,uint8_t * buf,uint64_t addr,uint32_t length,read_data_context * context,CHUNK_ITEM * ci,device ** devices,uint64_t generation,uint64_t offset)397 static NTSTATUS read_data_raid0(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
398 CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
399 for (uint16_t i = 0; i < ci->num_stripes; i++) {
400 if (context->stripes[i].status == ReadDataStatus_Error) {
401 WARN("stripe %u returned error %08lx\n", i, context->stripes[i].iosb.Status);
402 log_device_error(Vcb, devices[i], BTRFS_DEV_STAT_READ_ERRORS);
403 return context->stripes[i].iosb.Status;
404 }
405 }
406
407 if (context->tree) { // shouldn't happen, as trees shouldn't cross stripe boundaries
408 tree_header* th = (tree_header*)buf;
409 bool checksum_error = !check_tree_checksum(Vcb, th);
410
411 if (checksum_error || addr != th->address || (generation != 0 && generation != th->generation)) {
412 uint64_t off;
413 uint16_t stripe;
414
415 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &off, &stripe);
416
417 ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
418
419 if (checksum_error) {
420 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
421 return STATUS_CRC_ERROR;
422 } else if (addr != th->address) {
423 WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
424 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
425 return STATUS_CRC_ERROR;
426 } else if (generation != 0 && generation != th->generation) {
427 WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
428 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
429 return STATUS_CRC_ERROR;
430 }
431 }
432 } else if (context->csum) {
433 NTSTATUS Status;
434
435 Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
436
437 if (Status == STATUS_CRC_ERROR) {
438 void* ptr = context->csum;
439
440 for (uint32_t i = 0; i < length >> Vcb->sector_shift; i++) {
441 if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
442 uint64_t off;
443 uint16_t stripe;
444
445 get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length, ci->num_stripes, &off, &stripe);
446
447 ERR("unrecoverable checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
448
449 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
450
451 return Status;
452 }
453
454 ptr = (uint8_t*)ptr + Vcb->csum_size;
455 }
456
457 return Status;
458 } else if (!NT_SUCCESS(Status)) {
459 ERR("check_csum returned %08lx\n", Status);
460 return Status;
461 }
462 }
463
464 return STATUS_SUCCESS;
465 }
466
read_data_raid10(device_extension * Vcb,uint8_t * buf,uint64_t addr,uint32_t length,read_data_context * context,CHUNK_ITEM * ci,device ** devices,uint64_t generation,uint64_t offset)467 static NTSTATUS read_data_raid10(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context,
468 CHUNK_ITEM* ci, device** devices, uint64_t generation, uint64_t offset) {
469 uint16_t stripe = 0;
470 NTSTATUS Status;
471 bool checksum_error = false;
472 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
473
474 for (uint16_t j = 0; j < ci->num_stripes; j++) {
475 if (context->stripes[j].status == ReadDataStatus_Error) {
476 WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
477 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
478 return context->stripes[j].iosb.Status;
479 } else if (context->stripes[j].status == ReadDataStatus_Success)
480 stripe = j;
481 }
482
483 if (context->tree) {
484 tree_header* th = (tree_header*)buf;
485
486 if (!check_tree_checksum(Vcb, th)) {
487 checksum_error = true;
488 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
489 } else if (addr != th->address) {
490 WARN("address of tree was %I64x, not %I64x as expected\n", th->address, addr);
491 checksum_error = true;
492 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
493 } else if (generation != 0 && generation != th->generation) {
494 WARN("generation of tree was %I64x, not %I64x as expected\n", th->generation, generation);
495 checksum_error = true;
496 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
497 }
498 } else if (context->csum) {
499 Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
500
501 if (Status == STATUS_CRC_ERROR)
502 checksum_error = true;
503 else if (!NT_SUCCESS(Status)) {
504 ERR("check_csum returned %08lx\n", Status);
505 return Status;
506 }
507 }
508
509 if (!checksum_error)
510 return STATUS_SUCCESS;
511
512 if (context->tree) {
513 tree_header* t2;
514 uint64_t off;
515 uint16_t badsubstripe = 0;
516 bool recovered = false;
517
518 t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size, ALLOC_TAG);
519 if (!t2) {
520 ERR("out of memory\n");
521 return STATUS_INSUFFICIENT_RESOURCES;
522 }
523
524 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &off, &stripe);
525
526 stripe *= ci->sub_stripes;
527
528 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
529 if (context->stripes[stripe + j].status == ReadDataStatus_Success) {
530 badsubstripe = j;
531 break;
532 }
533 }
534
535 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
536 if (context->stripes[stripe + j].status != ReadDataStatus_Success && devices[stripe + j] && devices[stripe + j]->devobj) {
537 Status = sync_read_phys(devices[stripe + j]->devobj, devices[stripe + j]->fileobj, cis[stripe + j].offset + off,
538 Vcb->superblock.node_size, (uint8_t*)t2, false);
539 if (!NT_SUCCESS(Status)) {
540 WARN("sync_read_phys returned %08lx\n", Status);
541 log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_READ_ERRORS);
542 } else {
543 bool checksum_error = !check_tree_checksum(Vcb, t2);
544
545 if (t2->address == addr && !checksum_error && (generation == 0 || t2->generation == generation)) {
546 RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
547 ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe + j]->devitem.dev_id);
548 recovered = true;
549
550 if (!Vcb->readonly && !devices[stripe + badsubstripe]->readonly && devices[stripe + badsubstripe]->devobj) { // write good data over bad
551 Status = write_data_phys(devices[stripe + badsubstripe]->devobj, devices[stripe + badsubstripe]->fileobj,
552 cis[stripe + badsubstripe].offset + off, t2, Vcb->superblock.node_size);
553 if (!NT_SUCCESS(Status)) {
554 WARN("write_data_phys returned %08lx\n", Status);
555 log_device_error(Vcb, devices[stripe + badsubstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
556 }
557 }
558
559 break;
560 } else if (t2->address != addr || checksum_error)
561 log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
562 else
563 log_device_error(Vcb, devices[stripe + j], BTRFS_DEV_STAT_GENERATION_ERRORS);
564 }
565 }
566 }
567
568 if (!recovered) {
569 ERR("unrecoverable checksum error at %I64x\n", addr);
570 ExFreePool(t2);
571 return STATUS_CRC_ERROR;
572 }
573
574 ExFreePool(t2);
575 } else {
576 ULONG sectors = length >> Vcb->sector_shift;
577 uint8_t* sector;
578 void* ptr = context->csum;
579
580 sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size, ALLOC_TAG);
581 if (!sector) {
582 ERR("out of memory\n");
583 return STATUS_INSUFFICIENT_RESOURCES;
584 }
585
586 for (ULONG i = 0; i < sectors; i++) {
587 if (!check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr)) {
588 uint64_t off;
589 uint16_t stripe2, badsubstripe = 0;
590 bool recovered = false;
591
592 get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
593 ci->num_stripes / ci->sub_stripes, &off, &stripe2);
594
595 stripe2 *= ci->sub_stripes;
596
597 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
598 if (context->stripes[stripe2 + j].status == ReadDataStatus_Success) {
599 badsubstripe = j;
600 break;
601 }
602 }
603
604 log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
605
606 for (uint16_t j = 0; j < ci->sub_stripes; j++) {
607 if (context->stripes[stripe2 + j].status != ReadDataStatus_Success && devices[stripe2 + j] && devices[stripe2 + j]->devobj) {
608 Status = sync_read_phys(devices[stripe2 + j]->devobj, devices[stripe2 + j]->fileobj, cis[stripe2 + j].offset + off,
609 Vcb->superblock.sector_size, sector, false);
610 if (!NT_SUCCESS(Status)) {
611 WARN("sync_read_phys returned %08lx\n", Status);
612 log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_READ_ERRORS);
613 } else {
614 if (check_sector_csum(Vcb, sector, ptr)) {
615 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
616 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe2 + j]->devitem.dev_id);
617 recovered = true;
618
619 if (!Vcb->readonly && !devices[stripe2 + badsubstripe]->readonly && devices[stripe2 + badsubstripe]->devobj) { // write good data over bad
620 Status = write_data_phys(devices[stripe2 + badsubstripe]->devobj, devices[stripe2 + badsubstripe]->fileobj,
621 cis[stripe2 + badsubstripe].offset + off, sector, Vcb->superblock.sector_size);
622 if (!NT_SUCCESS(Status)) {
623 WARN("write_data_phys returned %08lx\n", Status);
624 log_device_error(Vcb, devices[stripe2 + badsubstripe], BTRFS_DEV_STAT_READ_ERRORS);
625 }
626 }
627
628 break;
629 } else
630 log_device_error(Vcb, devices[stripe2 + j], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
631 }
632 }
633 }
634
635 if (!recovered) {
636 ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
637 ExFreePool(sector);
638 return STATUS_CRC_ERROR;
639 }
640 }
641
642 ptr = (uint8_t*)ptr + Vcb->csum_size;
643 }
644
645 ExFreePool(sector);
646 }
647
648 return STATUS_SUCCESS;
649 }
650
read_data_raid5(device_extension * Vcb,uint8_t * buf,uint64_t addr,uint32_t length,read_data_context * context,CHUNK_ITEM * ci,device ** devices,uint64_t offset,uint64_t generation,chunk * c,bool degraded)651 static NTSTATUS read_data_raid5(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
652 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
653 NTSTATUS Status;
654 bool checksum_error = false;
655 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
656 uint16_t j, stripe = 0;
657 bool no_success = true;
658
659 for (j = 0; j < ci->num_stripes; j++) {
660 if (context->stripes[j].status == ReadDataStatus_Error) {
661 WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
662 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
663 return context->stripes[j].iosb.Status;
664 } else if (context->stripes[j].status == ReadDataStatus_Success) {
665 stripe = j;
666 no_success = false;
667 }
668 }
669
670 if (c) { // check partial stripes
671 LIST_ENTRY* le;
672 uint64_t ps_length = (ci->num_stripes - 1) * ci->stripe_length;
673
674 ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
675
676 le = c->partial_stripes.Flink;
677 while (le != &c->partial_stripes) {
678 partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
679
680 if (ps->address + ps_length > addr && ps->address < addr + length) {
681 ULONG runlength, index;
682
683 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
684
685 while (runlength != 0) {
686 if (index >= ps->bmplen)
687 break;
688
689 if (index + runlength >= ps->bmplen) {
690 runlength = ps->bmplen - index;
691
692 if (runlength == 0)
693 break;
694 }
695
696 uint64_t runstart = ps->address + (index << Vcb->sector_shift);
697 uint64_t runend = runstart + (runlength << Vcb->sector_shift);
698 uint64_t start = max(runstart, addr);
699 uint64_t end = min(runend, addr + length);
700
701 if (end > start)
702 RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
703
704 runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
705 }
706 } else if (ps->address >= addr + length)
707 break;
708
709 le = le->Flink;
710 }
711
712 ExReleaseResourceLite(&c->partial_stripes_lock);
713 }
714
715 if (context->tree) {
716 tree_header* th = (tree_header*)buf;
717
718 if (addr != th->address || !check_tree_checksum(Vcb, th)) {
719 checksum_error = true;
720 if (!no_success && !degraded)
721 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
722 } else if (generation != 0 && generation != th->generation) {
723 checksum_error = true;
724 if (!no_success && !degraded)
725 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
726 }
727 } else if (context->csum) {
728 Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
729
730 if (Status == STATUS_CRC_ERROR) {
731 if (!degraded)
732 WARN("checksum error\n");
733 checksum_error = true;
734 } else if (!NT_SUCCESS(Status)) {
735 ERR("check_csum returned %08lx\n", Status);
736 return Status;
737 }
738 } else if (degraded)
739 checksum_error = true;
740
741 if (!checksum_error)
742 return STATUS_SUCCESS;
743
744 if (context->tree) {
745 uint16_t parity;
746 uint64_t off;
747 bool recovered = false, first = true, failed = false;
748 uint8_t* t2;
749
750 t2 = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * 2, ALLOC_TAG);
751 if (!t2) {
752 ERR("out of memory\n");
753 return STATUS_INSUFFICIENT_RESOURCES;
754 }
755
756 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &off, &stripe);
757
758 parity = (((addr - offset) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
759
760 stripe = (parity + stripe + 1) % ci->num_stripes;
761
762 for (j = 0; j < ci->num_stripes; j++) {
763 if (j != stripe) {
764 if (devices[j] && devices[j]->devobj) {
765 if (first) {
766 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2, false);
767 if (!NT_SUCCESS(Status)) {
768 ERR("sync_read_phys returned %08lx\n", Status);
769 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
770 failed = true;
771 break;
772 }
773
774 first = false;
775 } else {
776 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size, t2 + Vcb->superblock.node_size, false);
777 if (!NT_SUCCESS(Status)) {
778 ERR("sync_read_phys returned %08lx\n", Status);
779 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
780 failed = true;
781 break;
782 }
783
784 do_xor(t2, t2 + Vcb->superblock.node_size, Vcb->superblock.node_size);
785 }
786 } else {
787 failed = true;
788 break;
789 }
790 }
791 }
792
793 if (!failed) {
794 tree_header* t3 = (tree_header*)t2;
795
796 if (t3->address == addr && check_tree_checksum(Vcb, t3) && (generation == 0 || t3->generation == generation)) {
797 RtlCopyMemory(buf, t2, Vcb->superblock.node_size);
798
799 if (!degraded)
800 ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[stripe]->devitem.dev_id);
801
802 recovered = true;
803
804 if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
805 Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off, t2, Vcb->superblock.node_size);
806 if (!NT_SUCCESS(Status)) {
807 WARN("write_data_phys returned %08lx\n", Status);
808 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
809 }
810 }
811 }
812 }
813
814 if (!recovered) {
815 ERR("unrecoverable checksum error at %I64x\n", addr);
816 ExFreePool(t2);
817 return STATUS_CRC_ERROR;
818 }
819
820 ExFreePool(t2);
821 } else {
822 ULONG sectors = length >> Vcb->sector_shift;
823 uint8_t* sector;
824 void* ptr = context->csum;
825
826 sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.sector_size * 2, ALLOC_TAG);
827 if (!sector) {
828 ERR("out of memory\n");
829 return STATUS_INSUFFICIENT_RESOURCES;
830 }
831
832 for (ULONG i = 0; i < sectors; i++) {
833 uint16_t parity;
834 uint64_t off;
835
836 get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
837 ci->num_stripes - 1, &off, &stripe);
838
839 parity = (((addr - offset + ((uint64_t)i << Vcb->sector_shift)) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
840
841 stripe = (parity + stripe + 1) % ci->num_stripes;
842
843 if (!devices[stripe] || !devices[stripe]->devobj || (ptr && !check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr))) {
844 bool recovered = false, first = true, failed = false;
845
846 if (devices[stripe] && devices[stripe]->devobj)
847 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_READ_ERRORS);
848
849 for (j = 0; j < ci->num_stripes; j++) {
850 if (j != stripe) {
851 if (devices[j] && devices[j]->devobj) {
852 if (first) {
853 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size, sector, false);
854 if (!NT_SUCCESS(Status)) {
855 ERR("sync_read_phys returned %08lx\n", Status);
856 failed = true;
857 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
858 break;
859 }
860
861 first = false;
862 } else {
863 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
864 sector + Vcb->superblock.sector_size, false);
865 if (!NT_SUCCESS(Status)) {
866 ERR("sync_read_phys returned %08lx\n", Status);
867 failed = true;
868 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
869 break;
870 }
871
872 do_xor(sector, sector + Vcb->superblock.sector_size, Vcb->superblock.sector_size);
873 }
874 } else {
875 failed = true;
876 break;
877 }
878 }
879 }
880
881 if (!failed) {
882 if (!ptr || check_sector_csum(Vcb, sector, ptr)) {
883 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector, Vcb->superblock.sector_size);
884
885 if (!degraded)
886 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift), devices[stripe]->devitem.dev_id);
887
888 recovered = true;
889
890 if (!Vcb->readonly && devices[stripe] && !devices[stripe]->readonly && devices[stripe]->devobj) { // write good data over bad
891 Status = write_data_phys(devices[stripe]->devobj, devices[stripe]->fileobj, cis[stripe].offset + off,
892 sector, Vcb->superblock.sector_size);
893 if (!NT_SUCCESS(Status)) {
894 WARN("write_data_phys returned %08lx\n", Status);
895 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_WRITE_ERRORS);
896 }
897 }
898 }
899 }
900
901 if (!recovered) {
902 ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
903 ExFreePool(sector);
904 return STATUS_CRC_ERROR;
905 }
906 }
907
908 if (ptr)
909 ptr = (uint8_t*)ptr + Vcb->csum_size;
910 }
911
912 ExFreePool(sector);
913 }
914
915 return STATUS_SUCCESS;
916 }
917
/* Rebuilds missing stripes of a RAID6 stripe set from the surviving stripes and parity.
 *
 * sectors     - num_stripes consecutive buffers of sector_size bytes each. Slots 0 to
 *               num_stripes - 3 are treated as the data stripes, slot num_stripes - 2 as
 *               the P parity and slot num_stripes - 1 as the Q parity. The contents of the
 *               slots named by missing1 / missing2 are never read.
 * num_stripes - total number of slots, parity included. Must be at least 3.
 * sector_size - number of bytes in each slot.
 * missing1,
 * missing2    - slot indices of the two stripes that cannot be used.
 * out         - receives the result:
 *                 * if either index is the P slot, sector_size bytes holding the other
 *                   (data) stripe, rebuilt from Q and the remaining data;
 *                 * otherwise 2 * sector_size bytes: the stripe for missing1 followed by
 *                   the stripe for missing2, rebuilt from P and Q.
 *
 * NOTE(review): galois_double / galois_divpower / gpow2 / gmul / gdiv are defined elsewhere;
 * they are presumed to implement GF(2^8) arithmetic as used by RAID6 - confirm against
 * their definitions. */
void raid6_recover2(uint8_t* sectors, uint16_t num_stripes, ULONG sector_size, uint16_t missing1, uint16_t missing2, uint8_t* out) {
    if (missing1 == num_stripes - 2 || missing2 == num_stripes - 2) { // reconstruct from q and data
        uint16_t missing = missing1 == (num_stripes - 2) ? missing2 : missing1;
        uint16_t stripe;

        stripe = num_stripes - 3;

        // Accumulate the surviving data stripes, highest index first, treating the missing one as zeroes.
        if (stripe == missing)
            RtlZeroMemory(out, sector_size);
        else
            RtlCopyMemory(out, sectors + (stripe * sector_size), sector_size);

        // Tested before the decrement: with a single data stripe (num_stripes == 3) stripe
        // starts at 0 and must not wrap round to 0xffff and index far outside sectors.
        while (stripe > 0) {
            stripe--;

            galois_double(out, sector_size);

            if (stripe != missing)
                do_xor(out, sectors + (stripe * sector_size), sector_size);
        }

        // XOR with Q, leaving only the missing stripe's contribution to Q...
        do_xor(out, sectors + ((num_stripes - 1) * sector_size), sector_size);

        // ...which is then divided by that stripe's coefficient (nothing to do for stripe 0).
        if (missing != 0)
            galois_divpower(out, (uint8_t)missing, sector_size);
    } else { // reconstruct from p and q
        uint16_t x = missing1, y = missing2, stripe;
        uint8_t gyx, gx, denom, a, b, *p, *q, *pxy, *qxy;
        uint32_t j;

        stripe = num_stripes - 3;

        // The two halves of out double as scratch space: pxy and qxy hold the P and Q
        // values computed over the surviving data stripes only.
        pxy = out + sector_size;
        qxy = out;

        if (stripe == missing1 || stripe == missing2) {
            RtlZeroMemory(qxy, sector_size);
            RtlZeroMemory(pxy, sector_size);
        } else {
            RtlCopyMemory(qxy, sectors + (stripe * sector_size), sector_size);
            RtlCopyMemory(pxy, sectors + (stripe * sector_size), sector_size);
        }

        // Same guard as above against wrapping stripe when there is only one data stripe.
        while (stripe > 0) {
            stripe--;

            galois_double(qxy, sector_size);

            if (stripe != missing1 && stripe != missing2) {
                do_xor(qxy, sectors + (stripe * sector_size), sector_size);
                do_xor(pxy, sectors + (stripe * sector_size), sector_size);
            }
        }

        gyx = gpow2(y > x ? (y-x) : (255-x+y));
        gx = gpow2(255-x);

        denom = gdiv(1, gyx ^ 1);
        a = gmul(gyx, denom);
        b = gmul(gx, denom);

        p = sectors + ((num_stripes - 2) * sector_size);
        q = sectors + ((num_stripes - 1) * sector_size);

        // First half of out becomes stripe x: a * (P ^ Pxy) ^ b * (Q ^ Qxy), byte by byte.
        for (j = 0; j < sector_size; j++) {
            *qxy = gmul(a, *p ^ *pxy) ^ gmul(b, *q ^ *qxy);

            p++;
            q++;
            pxy++;
            qxy++;
        }

        // Second half becomes stripe y: Pxy ^ stripe x ^ P.
        do_xor(out + sector_size, out, sector_size);
        do_xor(out + sector_size, sectors + ((num_stripes - 2) * sector_size), sector_size);
    }
}
995
read_data_raid6(device_extension * Vcb,uint8_t * buf,uint64_t addr,uint32_t length,read_data_context * context,CHUNK_ITEM * ci,device ** devices,uint64_t offset,uint64_t generation,chunk * c,bool degraded)996 static NTSTATUS read_data_raid6(device_extension* Vcb, uint8_t* buf, uint64_t addr, uint32_t length, read_data_context* context, CHUNK_ITEM* ci,
997 device** devices, uint64_t offset, uint64_t generation, chunk* c, bool degraded) {
998 NTSTATUS Status;
999 bool checksum_error = false;
1000 CHUNK_ITEM_STRIPE* cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1001 uint16_t stripe = 0, j;
1002 bool no_success = true;
1003
1004 for (j = 0; j < ci->num_stripes; j++) {
1005 if (context->stripes[j].status == ReadDataStatus_Error) {
1006 WARN("stripe %u returned error %08lx\n", j, context->stripes[j].iosb.Status);
1007
1008 if (devices[j])
1009 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1010 return context->stripes[j].iosb.Status;
1011 } else if (context->stripes[j].status == ReadDataStatus_Success) {
1012 stripe = j;
1013 no_success = false;
1014 }
1015 }
1016
1017 if (c) { // check partial stripes
1018 LIST_ENTRY* le;
1019 uint64_t ps_length = (ci->num_stripes - 2) * ci->stripe_length;
1020
1021 ExAcquireResourceSharedLite(&c->partial_stripes_lock, true);
1022
1023 le = c->partial_stripes.Flink;
1024 while (le != &c->partial_stripes) {
1025 partial_stripe* ps = CONTAINING_RECORD(le, partial_stripe, list_entry);
1026
1027 if (ps->address + ps_length > addr && ps->address < addr + length) {
1028 ULONG runlength, index;
1029
1030 runlength = RtlFindFirstRunClear(&ps->bmp, &index);
1031
1032 while (runlength != 0) {
1033 if (index >= ps->bmplen)
1034 break;
1035
1036 if (index + runlength >= ps->bmplen) {
1037 runlength = ps->bmplen - index;
1038
1039 if (runlength == 0)
1040 break;
1041 }
1042
1043 uint64_t runstart = ps->address + (index << Vcb->sector_shift);
1044 uint64_t runend = runstart + (runlength << Vcb->sector_shift);
1045 uint64_t start = max(runstart, addr);
1046 uint64_t end = min(runend, addr + length);
1047
1048 if (end > start)
1049 RtlCopyMemory(buf + start - addr, &ps->data[start - ps->address], (ULONG)(end - start));
1050
1051 runlength = RtlFindNextForwardRunClear(&ps->bmp, index + runlength, &index);
1052 }
1053 } else if (ps->address >= addr + length)
1054 break;
1055
1056 le = le->Flink;
1057 }
1058
1059 ExReleaseResourceLite(&c->partial_stripes_lock);
1060 }
1061
1062 if (context->tree) {
1063 tree_header* th = (tree_header*)buf;
1064
1065 if (addr != th->address || !check_tree_checksum(Vcb, th)) {
1066 checksum_error = true;
1067 if (!no_success && !degraded && devices[stripe])
1068 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1069 } else if (generation != 0 && generation != th->generation) {
1070 checksum_error = true;
1071 if (!no_success && !degraded && devices[stripe])
1072 log_device_error(Vcb, devices[stripe], BTRFS_DEV_STAT_GENERATION_ERRORS);
1073 }
1074 } else if (context->csum) {
1075 Status = check_csum(Vcb, buf, length >> Vcb->sector_shift, context->csum);
1076
1077 if (Status == STATUS_CRC_ERROR) {
1078 if (!degraded)
1079 WARN("checksum error\n");
1080 checksum_error = true;
1081 } else if (!NT_SUCCESS(Status)) {
1082 ERR("check_csum returned %08lx\n", Status);
1083 return Status;
1084 }
1085 } else if (degraded)
1086 checksum_error = true;
1087
1088 if (!checksum_error)
1089 return STATUS_SUCCESS;
1090
1091 if (context->tree) {
1092 uint8_t* sector;
1093 uint16_t k, physstripe, parity1, parity2, error_stripe = 0;
1094 uint64_t off;
1095 bool recovered = false, failed = false;
1096 ULONG num_errors = 0;
1097
1098 sector = ExAllocatePoolWithTag(NonPagedPool, Vcb->superblock.node_size * (ci->num_stripes + 2), ALLOC_TAG);
1099 if (!sector) {
1100 ERR("out of memory\n");
1101 return STATUS_INSUFFICIENT_RESOURCES;
1102 }
1103
1104 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &off, &stripe);
1105
1106 parity1 = (((addr - offset) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
1107 parity2 = (parity1 + 1) % ci->num_stripes;
1108
1109 physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1110
1111 j = (parity2 + 1) % ci->num_stripes;
1112
1113 for (k = 0; k < ci->num_stripes - 1; k++) {
1114 if (j != physstripe) {
1115 if (devices[j] && devices[j]->devobj) {
1116 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.node_size,
1117 sector + (k * Vcb->superblock.node_size), false);
1118 if (!NT_SUCCESS(Status)) {
1119 ERR("sync_read_phys returned %08lx\n", Status);
1120 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1121 num_errors++;
1122 error_stripe = k;
1123
1124 if (num_errors > 1) {
1125 failed = true;
1126 break;
1127 }
1128 }
1129 } else {
1130 num_errors++;
1131 error_stripe = k;
1132
1133 if (num_errors > 1) {
1134 failed = true;
1135 break;
1136 }
1137 }
1138 }
1139
1140 j = (j + 1) % ci->num_stripes;
1141 }
1142
1143 if (!failed) {
1144 if (num_errors == 0) {
1145 tree_header* th = (tree_header*)(sector + (stripe * Vcb->superblock.node_size));
1146
1147 RtlCopyMemory(sector + (stripe * Vcb->superblock.node_size), sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size),
1148 Vcb->superblock.node_size);
1149
1150 for (j = 0; j < ci->num_stripes - 2; j++) {
1151 if (j != stripe)
1152 do_xor(sector + (stripe * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size), Vcb->superblock.node_size);
1153 }
1154
1155 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1156 RtlCopyMemory(buf, sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1157
1158 if (devices[physstripe] && devices[physstripe]->devobj)
1159 ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1160
1161 recovered = true;
1162
1163 if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1164 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1165 sector + (stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1166 if (!NT_SUCCESS(Status)) {
1167 WARN("write_data_phys returned %08lx\n", Status);
1168 log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1169 }
1170 }
1171 }
1172 }
1173
1174 if (!recovered) {
1175 tree_header* th = (tree_header*)(sector + (ci->num_stripes * Vcb->superblock.node_size));
1176 bool read_q = false;
1177
1178 if (devices[parity2] && devices[parity2]->devobj) {
1179 Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1180 Vcb->superblock.node_size, sector + ((ci->num_stripes - 1) * Vcb->superblock.node_size), false);
1181 if (!NT_SUCCESS(Status)) {
1182 ERR("sync_read_phys returned %08lx\n", Status);
1183 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1184 } else
1185 read_q = true;
1186 }
1187
1188 if (read_q) {
1189 if (num_errors == 1) {
1190 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, error_stripe, sector + (ci->num_stripes * Vcb->superblock.node_size));
1191
1192 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation))
1193 recovered = true;
1194 } else {
1195 for (j = 0; j < ci->num_stripes - 1; j++) {
1196 if (j != stripe) {
1197 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.node_size, stripe, j, sector + (ci->num_stripes * Vcb->superblock.node_size));
1198
1199 if (th->address == addr && check_tree_checksum(Vcb, th) && (generation == 0 || th->generation == generation)) {
1200 recovered = true;
1201 error_stripe = j;
1202 break;
1203 }
1204 }
1205 }
1206 }
1207 }
1208
1209 if (recovered) {
1210 uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1211
1212 if (devices[physstripe] && devices[physstripe]->devobj)
1213 ERR("recovering from checksum error at %I64x, device %I64x\n", addr, devices[physstripe]->devitem.dev_id);
1214
1215 RtlCopyMemory(buf, sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1216
1217 if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1218 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1219 sector + (ci->num_stripes * Vcb->superblock.node_size), Vcb->superblock.node_size);
1220 if (!NT_SUCCESS(Status)) {
1221 WARN("write_data_phys returned %08lx\n", Status);
1222 log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1223 }
1224 }
1225
1226 if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1227 if (error_stripe == ci->num_stripes - 2) {
1228 ERR("recovering from parity error at %I64x, device %I64x\n", addr, devices[error_stripe_phys]->devitem.dev_id);
1229
1230 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1231
1232 RtlZeroMemory(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1233
1234 for (j = 0; j < ci->num_stripes - 2; j++) {
1235 if (j == stripe) {
1236 do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (ci->num_stripes * Vcb->superblock.node_size),
1237 Vcb->superblock.node_size);
1238 } else {
1239 do_xor(sector + ((ci->num_stripes - 2) * Vcb->superblock.node_size), sector + (j * Vcb->superblock.node_size),
1240 Vcb->superblock.node_size);
1241 }
1242 }
1243 } else {
1244 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((error_stripe - stripe) * ci->stripe_length),
1245 devices[error_stripe_phys]->devitem.dev_id);
1246
1247 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1248
1249 RtlCopyMemory(sector + (error_stripe * Vcb->superblock.node_size),
1250 sector + ((ci->num_stripes + 1) * Vcb->superblock.node_size), Vcb->superblock.node_size);
1251 }
1252 }
1253
1254 if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1255 Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1256 sector + (error_stripe * Vcb->superblock.node_size), Vcb->superblock.node_size);
1257 if (!NT_SUCCESS(Status)) {
1258 WARN("write_data_phys returned %08lx\n", Status);
1259 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1260 }
1261 }
1262 }
1263 }
1264 }
1265
1266 if (!recovered) {
1267 ERR("unrecoverable checksum error at %I64x\n", addr);
1268 ExFreePool(sector);
1269 return STATUS_CRC_ERROR;
1270 }
1271
1272 ExFreePool(sector);
1273 } else {
1274 ULONG sectors = length >> Vcb->sector_shift;
1275 uint8_t* sector;
1276 void* ptr = context->csum;
1277
1278 sector = ExAllocatePoolWithTag(NonPagedPool, (ci->num_stripes + 2) << Vcb->sector_shift, ALLOC_TAG);
1279 if (!sector) {
1280 ERR("out of memory\n");
1281 return STATUS_INSUFFICIENT_RESOURCES;
1282 }
1283
1284 for (ULONG i = 0; i < sectors; i++) {
1285 uint64_t off;
1286 uint16_t physstripe, parity1, parity2;
1287
1288 get_raid0_offset(addr - offset + ((uint64_t)i << Vcb->sector_shift), ci->stripe_length,
1289 ci->num_stripes - 2, &off, &stripe);
1290
1291 parity1 = (((addr - offset + ((uint64_t)i << Vcb->sector_shift)) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
1292 parity2 = (parity1 + 1) % ci->num_stripes;
1293
1294 physstripe = (parity2 + stripe + 1) % ci->num_stripes;
1295
1296 if (!devices[physstripe] || !devices[physstripe]->devobj || (context->csum && !check_sector_csum(Vcb, buf + (i << Vcb->sector_shift), ptr))) {
1297 uint16_t error_stripe = 0;
1298 bool recovered = false, failed = false;
1299 ULONG num_errors = 0;
1300
1301 if (devices[physstripe] && devices[physstripe]->devobj)
1302 log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_READ_ERRORS);
1303
1304 j = (parity2 + 1) % ci->num_stripes;
1305
1306 for (uint16_t k = 0; k < ci->num_stripes - 1; k++) {
1307 if (j != physstripe) {
1308 if (devices[j] && devices[j]->devobj) {
1309 Status = sync_read_phys(devices[j]->devobj, devices[j]->fileobj, cis[j].offset + off, Vcb->superblock.sector_size,
1310 sector + ((ULONG)k << Vcb->sector_shift), false);
1311 if (!NT_SUCCESS(Status)) {
1312 ERR("sync_read_phys returned %08lx\n", Status);
1313 log_device_error(Vcb, devices[j], BTRFS_DEV_STAT_READ_ERRORS);
1314 num_errors++;
1315 error_stripe = k;
1316
1317 if (num_errors > 1) {
1318 failed = true;
1319 break;
1320 }
1321 }
1322 } else {
1323 num_errors++;
1324 error_stripe = k;
1325
1326 if (num_errors > 1) {
1327 failed = true;
1328 break;
1329 }
1330 }
1331 }
1332
1333 j = (j + 1) % ci->num_stripes;
1334 }
1335
1336 if (!failed) {
1337 if (num_errors == 0) {
1338 RtlCopyMemory(sector + ((unsigned int)stripe << Vcb->sector_shift), sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), Vcb->superblock.sector_size);
1339
1340 for (j = 0; j < ci->num_stripes - 2; j++) {
1341 if (j != stripe)
1342 do_xor(sector + ((unsigned int)stripe << Vcb->sector_shift), sector + ((unsigned int)j << Vcb->sector_shift), Vcb->superblock.sector_size);
1343 }
1344
1345 if (!ptr || check_sector_csum(Vcb, sector + ((unsigned int)stripe << Vcb->sector_shift), ptr)) {
1346 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector + ((unsigned int)stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1347
1348 if (devices[physstripe] && devices[physstripe]->devobj)
1349 ERR("recovering from checksum error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift),
1350 devices[physstripe]->devitem.dev_id);
1351
1352 recovered = true;
1353
1354 if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1355 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1356 sector + ((unsigned int)stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1357 if (!NT_SUCCESS(Status)) {
1358 WARN("write_data_phys returned %08lx\n", Status);
1359 log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1360 }
1361 }
1362 }
1363 }
1364
1365 if (!recovered) {
1366 bool read_q = false;
1367
1368 if (devices[parity2] && devices[parity2]->devobj) {
1369 Status = sync_read_phys(devices[parity2]->devobj, devices[parity2]->fileobj, cis[parity2].offset + off,
1370 Vcb->superblock.sector_size, sector + ((unsigned int)(ci->num_stripes - 1) << Vcb->sector_shift), false);
1371 if (!NT_SUCCESS(Status)) {
1372 ERR("sync_read_phys returned %08lx\n", Status);
1373 log_device_error(Vcb, devices[parity2], BTRFS_DEV_STAT_READ_ERRORS);
1374 } else
1375 read_q = true;
1376 }
1377
1378 if (read_q) {
1379 if (num_errors == 1) {
1380 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, error_stripe, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift));
1381
1382 if (!devices[physstripe] || !devices[physstripe]->devobj)
1383 recovered = true;
1384 else
1385 recovered = check_sector_csum(Vcb, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), ptr);
1386 } else {
1387 for (j = 0; j < ci->num_stripes - 1; j++) {
1388 if (j != stripe) {
1389 raid6_recover2(sector, ci->num_stripes, Vcb->superblock.sector_size, stripe, j, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift));
1390
1391 if (check_sector_csum(Vcb, sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), ptr)) {
1392 recovered = true;
1393 error_stripe = j;
1394 break;
1395 }
1396 }
1397 }
1398 }
1399 }
1400
1401 if (recovered) {
1402 uint16_t error_stripe_phys = (parity2 + error_stripe + 1) % ci->num_stripes;
1403
1404 if (devices[physstripe] && devices[physstripe]->devobj)
1405 ERR("recovering from checksum error at %I64x, device %I64x\n",
1406 addr + ((uint64_t)i << Vcb->sector_shift), devices[physstripe]->devitem.dev_id);
1407
1408 RtlCopyMemory(buf + (i << Vcb->sector_shift), sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), Vcb->superblock.sector_size);
1409
1410 if (!Vcb->readonly && devices[physstripe] && devices[physstripe]->devobj && !devices[physstripe]->readonly) { // write good data over bad
1411 Status = write_data_phys(devices[physstripe]->devobj, devices[physstripe]->fileobj, cis[physstripe].offset + off,
1412 sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift), Vcb->superblock.sector_size);
1413 if (!NT_SUCCESS(Status)) {
1414 WARN("write_data_phys returned %08lx\n", Status);
1415 log_device_error(Vcb, devices[physstripe], BTRFS_DEV_STAT_WRITE_ERRORS);
1416 }
1417 }
1418
1419 if (devices[error_stripe_phys] && devices[error_stripe_phys]->devobj) {
1420 if (error_stripe == ci->num_stripes - 2) {
1421 ERR("recovering from parity error at %I64x, device %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift),
1422 devices[error_stripe_phys]->devitem.dev_id);
1423
1424 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1425
1426 RtlZeroMemory(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), Vcb->superblock.sector_size);
1427
1428 for (j = 0; j < ci->num_stripes - 2; j++) {
1429 if (j == stripe) {
1430 do_xor(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), sector + ((unsigned int)ci->num_stripes << Vcb->sector_shift),
1431 Vcb->superblock.sector_size);
1432 } else {
1433 do_xor(sector + ((unsigned int)(ci->num_stripes - 2) << Vcb->sector_shift), sector + ((unsigned int)j << Vcb->sector_shift),
1434 Vcb->superblock.sector_size);
1435 }
1436 }
1437 } else {
1438 ERR("recovering from checksum error at %I64x, device %I64x\n",
1439 addr + ((uint64_t)i << Vcb->sector_shift) + ((error_stripe - stripe) * ci->stripe_length),
1440 devices[error_stripe_phys]->devitem.dev_id);
1441
1442 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_CORRUPTION_ERRORS);
1443
1444 RtlCopyMemory(sector + ((unsigned int)error_stripe << Vcb->sector_shift),
1445 sector + ((unsigned int)(ci->num_stripes + 1) << Vcb->sector_shift), Vcb->superblock.sector_size);
1446 }
1447 }
1448
1449 if (!Vcb->readonly && devices[error_stripe_phys] && devices[error_stripe_phys]->devobj && !devices[error_stripe_phys]->readonly) { // write good data over bad
1450 Status = write_data_phys(devices[error_stripe_phys]->devobj, devices[error_stripe_phys]->fileobj, cis[error_stripe_phys].offset + off,
1451 sector + ((unsigned int)error_stripe << Vcb->sector_shift), Vcb->superblock.sector_size);
1452 if (!NT_SUCCESS(Status)) {
1453 WARN("write_data_phys returned %08lx\n", Status);
1454 log_device_error(Vcb, devices[error_stripe_phys], BTRFS_DEV_STAT_WRITE_ERRORS);
1455 }
1456 }
1457 }
1458 }
1459 }
1460
1461 if (!recovered) {
1462 ERR("unrecoverable checksum error at %I64x\n", addr + ((uint64_t)i << Vcb->sector_shift));
1463 ExFreePool(sector);
1464 return STATUS_CRC_ERROR;
1465 }
1466 }
1467
1468 if (ptr)
1469 ptr = (uint8_t*)ptr + Vcb->csum_size;
1470 }
1471
1472 ExFreePool(sector);
1473 }
1474
1475 return STATUS_SUCCESS;
1476 }
1477
1478 NTSTATUS read_data(_In_ device_extension* Vcb, _In_ uint64_t addr, _In_ uint32_t length, _In_reads_bytes_opt_(length*sizeof(uint32_t)/Vcb->superblock.sector_size) void* csum,
1479 _In_ bool is_tree, _Out_writes_bytes_(length) uint8_t* buf, _In_opt_ chunk* c, _Out_opt_ chunk** pc, _In_opt_ PIRP Irp, _In_ uint64_t generation, _In_ bool file_read,
1480 _In_ ULONG priority) {
1481 CHUNK_ITEM* ci;
1482 CHUNK_ITEM_STRIPE* cis;
1483 read_data_context context;
1484 uint64_t type, offset, total_reading = 0;
1485 NTSTATUS Status;
1486 device** devices = NULL;
1487 uint16_t i, startoffstripe, allowed_missing, missing_devices = 0;
1488 uint8_t* dummypage = NULL;
1489 PMDL dummy_mdl = NULL;
1490 bool need_to_wait;
1491 uint64_t lockaddr, locklen;
1492
1493 if (Vcb->log_to_phys_loaded) {
1494 if (!c) {
1495 c = get_chunk_from_address(Vcb, addr);
1496
1497 if (!c) {
1498 ERR("get_chunk_from_address failed\n");
1499 return STATUS_INTERNAL_ERROR;
1500 }
1501 }
1502
1503 ci = c->chunk_item;
1504 offset = c->offset;
1505 devices = c->devices;
1506
1507 if (pc)
1508 *pc = c;
1509 } else {
1510 LIST_ENTRY* le = Vcb->sys_chunks.Flink;
1511
1512 ci = NULL;
1513
1514 c = NULL;
1515 while (le != &Vcb->sys_chunks) {
1516 sys_chunk* sc = CONTAINING_RECORD(le, sys_chunk, list_entry);
1517
1518 if (sc->key.obj_id == 0x100 && sc->key.obj_type == TYPE_CHUNK_ITEM && sc->key.offset <= addr) {
1519 CHUNK_ITEM* chunk_item = sc->data;
1520
1521 if ((addr - sc->key.offset) < chunk_item->size && chunk_item->num_stripes > 0) {
1522 ci = chunk_item;
1523 offset = sc->key.offset;
1524 cis = (CHUNK_ITEM_STRIPE*)&chunk_item[1];
1525
1526 devices = ExAllocatePoolWithTag(NonPagedPool, sizeof(device*) * ci->num_stripes, ALLOC_TAG);
1527 if (!devices) {
1528 ERR("out of memory\n");
1529 return STATUS_INSUFFICIENT_RESOURCES;
1530 }
1531
1532 for (i = 0; i < ci->num_stripes; i++) {
1533 devices[i] = find_device_from_uuid(Vcb, &cis[i].dev_uuid);
1534 }
1535
1536 break;
1537 }
1538 }
1539
1540 le = le->Flink;
1541 }
1542
1543 if (!ci) {
1544 ERR("could not find chunk for %I64x in bootstrap\n", addr);
1545 return STATUS_INTERNAL_ERROR;
1546 }
1547
1548 if (pc)
1549 *pc = NULL;
1550 }
1551
1552 if (ci->type & BLOCK_FLAG_DUPLICATE) {
1553 type = BLOCK_FLAG_DUPLICATE;
1554 allowed_missing = ci->num_stripes - 1;
1555 } else if (ci->type & BLOCK_FLAG_RAID0) {
1556 type = BLOCK_FLAG_RAID0;
1557 allowed_missing = 0;
1558 } else if (ci->type & BLOCK_FLAG_RAID1) {
1559 type = BLOCK_FLAG_DUPLICATE;
1560 allowed_missing = 1;
1561 } else if (ci->type & BLOCK_FLAG_RAID10) {
1562 type = BLOCK_FLAG_RAID10;
1563 allowed_missing = 1;
1564 } else if (ci->type & BLOCK_FLAG_RAID5) {
1565 type = BLOCK_FLAG_RAID5;
1566 allowed_missing = 1;
1567 } else if (ci->type & BLOCK_FLAG_RAID6) {
1568 type = BLOCK_FLAG_RAID6;
1569 allowed_missing = 2;
1570 } else if (ci->type & BLOCK_FLAG_RAID1C3) {
1571 type = BLOCK_FLAG_DUPLICATE;
1572 allowed_missing = 2;
1573 } else if (ci->type & BLOCK_FLAG_RAID1C4) {
1574 type = BLOCK_FLAG_DUPLICATE;
1575 allowed_missing = 3;
1576 } else { // SINGLE
1577 type = BLOCK_FLAG_DUPLICATE;
1578 allowed_missing = 0;
1579 }
1580
1581 cis = (CHUNK_ITEM_STRIPE*)&ci[1];
1582
1583 RtlZeroMemory(&context, sizeof(read_data_context));
1584 KeInitializeEvent(&context.Event, NotificationEvent, false);
1585
1586 context.stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe) * ci->num_stripes, ALLOC_TAG);
1587 if (!context.stripes) {
1588 ERR("out of memory\n");
1589 return STATUS_INSUFFICIENT_RESOURCES;
1590 }
1591
1592 if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6)) {
1593 get_raid56_lock_range(c, addr, length, &lockaddr, &locklen);
1594 chunk_lock_range(Vcb, c, lockaddr, locklen);
1595 }
1596
1597 RtlZeroMemory(context.stripes, sizeof(read_data_stripe) * ci->num_stripes);
1598
1599 context.buflen = length;
1600 context.num_stripes = ci->num_stripes;
1601 context.stripes_left = context.num_stripes;
1602 context.sector_size = Vcb->superblock.sector_size;
1603 context.csum = csum;
1604 context.tree = is_tree;
1605 context.type = type;
1606
1607 if (type == BLOCK_FLAG_RAID0) {
1608 uint64_t startoff, endoff;
1609 uint16_t endoffstripe, stripe;
1610 uint32_t *stripeoff, pos;
1611 PMDL master_mdl;
1612 PFN_NUMBER* pfns;
1613
1614 // FIXME - test this still works if page size isn't the same as sector size
1615
1616 // This relies on the fact that MDLs are followed in memory by the page file numbers,
1617 // so with a bit of jiggery-pokery you can trick your disks into deinterlacing your RAID0
1618 // data for you without doing a memcpy yourself.
1619 // MDLs are officially opaque, so this might very well break in future versions of Windows.
1620
1621 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes, &startoff, &startoffstripe);
1622 get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes, &endoff, &endoffstripe);
1623
1624 if (file_read) {
1625 // Unfortunately we can't avoid doing at least one memcpy, as Windows can give us an MDL
1626 // with duplicated dummy PFNs, which confuse check_csum. Ah well.
1627 // See https://msdn.microsoft.com/en-us/library/windows/hardware/Dn614012.aspx if you're interested.
1628
1629 context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1630
1631 if (!context.va) {
1632 ERR("out of memory\n");
1633 Status = STATUS_INSUFFICIENT_RESOURCES;
1634 goto exit;
1635 }
1636 } else
1637 context.va = buf;
1638
1639 master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1640 if (!master_mdl) {
1641 ERR("out of memory\n");
1642 Status = STATUS_INSUFFICIENT_RESOURCES;
1643 goto exit;
1644 }
1645
1646 Status = STATUS_SUCCESS;
1647
1648 _SEH2_TRY {
1649 MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
_SEH2_EXCEPT(EXCEPTION_EXECUTE_HANDLER)1650 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1651 Status = _SEH2_GetExceptionCode();
1652 } _SEH2_END;
1653
1654 if (!NT_SUCCESS(Status)) {
1655 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1656 IoFreeMdl(master_mdl);
1657 goto exit;
1658 }
1659
1660 pfns = (PFN_NUMBER*)(master_mdl + 1);
1661
1662 for (i = 0; i < ci->num_stripes; i++) {
1663 if (startoffstripe > i)
1664 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1665 else if (startoffstripe == i)
1666 context.stripes[i].stripestart = startoff;
1667 else
1668 context.stripes[i].stripestart = startoff - (startoff % ci->stripe_length);
1669
1670 if (endoffstripe > i)
1671 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1672 else if (endoffstripe == i)
1673 context.stripes[i].stripeend = endoff + 1;
1674 else
1675 context.stripes[i].stripeend = endoff - (endoff % ci->stripe_length);
1676
1677 if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
1678 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
1679
1680 if (!context.stripes[i].mdl) {
1681 ERR("IoAllocateMdl failed\n");
1682 MmUnlockPages(master_mdl);
1683 IoFreeMdl(master_mdl);
1684 Status = STATUS_INSUFFICIENT_RESOURCES;
1685 goto exit;
1686 }
1687 }
1688 }
1689
1690 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
1691 if (!stripeoff) {
1692 ERR("out of memory\n");
1693 MmUnlockPages(master_mdl);
1694 IoFreeMdl(master_mdl);
1695 Status = STATUS_INSUFFICIENT_RESOURCES;
1696 goto exit;
1697 }
1698
1699 RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
1700
1701 pos = 0;
1702 stripe = startoffstripe;
1703 while (pos < length) {
1704 PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
1705
1706 if (pos == 0) {
1707 uint32_t readlen = (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length));
1708
1709 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1710
1711 stripeoff[stripe] += readlen;
1712 pos += readlen;
1713 } else if (length - pos < ci->stripe_length) {
1714 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1715
1716 pos = length;
1717 } else {
1718 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1719
1720 stripeoff[stripe] += (uint32_t)ci->stripe_length;
1721 pos += (uint32_t)ci->stripe_length;
1722 }
1723
1724 stripe = (stripe + 1) % ci->num_stripes;
1725 }
1726
1727 MmUnlockPages(master_mdl);
1728 IoFreeMdl(master_mdl);
1729
1730 ExFreePool(stripeoff);
1731 } else if (type == BLOCK_FLAG_RAID10) {
1732 uint64_t startoff, endoff;
1733 uint16_t endoffstripe, j, stripe;
1734 ULONG orig_ls;
1735 PMDL master_mdl;
1736 PFN_NUMBER* pfns;
1737 uint32_t* stripeoff, pos;
1738 read_data_stripe** stripes;
1739
1740 if (c)
1741 orig_ls = c->last_stripe;
1742 else
1743 orig_ls = 0;
1744
1745 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &startoff, &startoffstripe);
1746 get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes / ci->sub_stripes, &endoff, &endoffstripe);
1747
1748 if ((ci->num_stripes % ci->sub_stripes) != 0) {
1749 ERR("chunk %I64x: num_stripes %x was not a multiple of sub_stripes %x!\n", offset, ci->num_stripes, ci->sub_stripes);
1750 Status = STATUS_INTERNAL_ERROR;
1751 goto exit;
1752 }
1753
1754 if (file_read) {
1755 context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1756
1757 if (!context.va) {
1758 ERR("out of memory\n");
1759 Status = STATUS_INSUFFICIENT_RESOURCES;
1760 goto exit;
1761 }
1762 } else
1763 context.va = buf;
1764
1765 context.firstoff = (uint16_t)((startoff % ci->stripe_length) >> Vcb->sector_shift);
1766 context.startoffstripe = startoffstripe;
1767 context.sectors_per_stripe = (uint16_t)(ci->stripe_length >> Vcb->sector_shift);
1768
1769 startoffstripe *= ci->sub_stripes;
1770 endoffstripe *= ci->sub_stripes;
1771
1772 if (c)
1773 c->last_stripe = (orig_ls + 1) % ci->sub_stripes;
1774
1775 master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1776 if (!master_mdl) {
1777 ERR("out of memory\n");
1778 Status = STATUS_INSUFFICIENT_RESOURCES;
1779 goto exit;
1780 }
1781
1782 Status = STATUS_SUCCESS;
1783
1784 _SEH2_TRY {
1785 MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
_SEH2_EXCEPT(EXCEPTION_EXECUTE_HANDLER)1786 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1787 Status = _SEH2_GetExceptionCode();
1788 } _SEH2_END;
1789
1790 if (!NT_SUCCESS(Status)) {
1791 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1792 IoFreeMdl(master_mdl);
1793 goto exit;
1794 }
1795
1796 pfns = (PFN_NUMBER*)(master_mdl + 1);
1797
1798 stripes = ExAllocatePoolWithTag(NonPagedPool, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1799 if (!stripes) {
1800 ERR("out of memory\n");
1801 MmUnlockPages(master_mdl);
1802 IoFreeMdl(master_mdl);
1803 Status = STATUS_INSUFFICIENT_RESOURCES;
1804 goto exit;
1805 }
1806
1807 RtlZeroMemory(stripes, sizeof(read_data_stripe*) * ci->num_stripes / ci->sub_stripes);
1808
1809 for (i = 0; i < ci->num_stripes; i += ci->sub_stripes) {
1810 uint64_t sstart, send;
1811 bool stripeset = false;
1812
1813 if (startoffstripe > i)
1814 sstart = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
1815 else if (startoffstripe == i)
1816 sstart = startoff;
1817 else
1818 sstart = startoff - (startoff % ci->stripe_length);
1819
1820 if (endoffstripe > i)
1821 send = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
1822 else if (endoffstripe == i)
1823 send = endoff + 1;
1824 else
1825 send = endoff - (endoff % ci->stripe_length);
1826
1827 for (j = 0; j < ci->sub_stripes; j++) {
1828 if (j == orig_ls && devices[i+j] && devices[i+j]->devobj) {
1829 context.stripes[i+j].stripestart = sstart;
1830 context.stripes[i+j].stripeend = send;
1831 stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1832
1833 if (sstart != send) {
1834 context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1835
1836 if (!context.stripes[i+j].mdl) {
1837 ERR("IoAllocateMdl failed\n");
1838 MmUnlockPages(master_mdl);
1839 IoFreeMdl(master_mdl);
1840 Status = STATUS_INSUFFICIENT_RESOURCES;
1841 goto exit;
1842 }
1843 }
1844
1845 stripeset = true;
1846 } else
1847 context.stripes[i+j].status = ReadDataStatus_Skip;
1848 }
1849
1850 if (!stripeset) {
1851 for (j = 0; j < ci->sub_stripes; j++) {
1852 if (devices[i+j] && devices[i+j]->devobj) {
1853 context.stripes[i+j].stripestart = sstart;
1854 context.stripes[i+j].stripeend = send;
1855 context.stripes[i+j].status = ReadDataStatus_Pending;
1856 stripes[i / ci->sub_stripes] = &context.stripes[i+j];
1857
1858 if (sstart != send) {
1859 context.stripes[i+j].mdl = IoAllocateMdl(context.va, (ULONG)(send - sstart), false, false, NULL);
1860
1861 if (!context.stripes[i+j].mdl) {
1862 ERR("IoAllocateMdl failed\n");
1863 MmUnlockPages(master_mdl);
1864 IoFreeMdl(master_mdl);
1865 Status = STATUS_INSUFFICIENT_RESOURCES;
1866 goto exit;
1867 }
1868 }
1869
1870 stripeset = true;
1871 break;
1872 }
1873 }
1874
1875 if (!stripeset) {
1876 ERR("could not find stripe to read\n");
1877 Status = STATUS_DEVICE_NOT_READY;
1878 goto exit;
1879 }
1880 }
1881 }
1882
1883 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes, ALLOC_TAG);
1884 if (!stripeoff) {
1885 ERR("out of memory\n");
1886 MmUnlockPages(master_mdl);
1887 IoFreeMdl(master_mdl);
1888 Status = STATUS_INSUFFICIENT_RESOURCES;
1889 goto exit;
1890 }
1891
1892 RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes / ci->sub_stripes);
1893
1894 pos = 0;
1895 stripe = startoffstripe / ci->sub_stripes;
1896 while (pos < length) {
1897 PFN_NUMBER* stripe_pfns = (PFN_NUMBER*)(stripes[stripe]->mdl + 1);
1898
1899 if (pos == 0) {
1900 uint32_t readlen = (uint32_t)min(stripes[stripe]->stripeend - stripes[stripe]->stripestart,
1901 ci->stripe_length - (stripes[stripe]->stripestart % ci->stripe_length));
1902
1903 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1904
1905 stripeoff[stripe] += readlen;
1906 pos += readlen;
1907 } else if (length - pos < ci->stripe_length) {
1908 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (length - pos) * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
1909
1910 pos = length;
1911 } else {
1912 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
1913
1914 stripeoff[stripe] += (ULONG)ci->stripe_length;
1915 pos += (ULONG)ci->stripe_length;
1916 }
1917
1918 stripe = (stripe + 1) % (ci->num_stripes / ci->sub_stripes);
1919 }
1920
1921 MmUnlockPages(master_mdl);
1922 IoFreeMdl(master_mdl);
1923
1924 ExFreePool(stripeoff);
1925 ExFreePool(stripes);
1926 } else if (type == BLOCK_FLAG_DUPLICATE) {
1927 uint64_t orig_ls;
1928
1929 if (c)
1930 orig_ls = i = c->last_stripe;
1931 else
1932 orig_ls = i = 0;
1933
1934 while (!devices[i] || !devices[i]->devobj) {
1935 i = (i + 1) % ci->num_stripes;
1936
1937 if (i == orig_ls) {
1938 ERR("no devices available to service request\n");
1939 Status = STATUS_DEVICE_NOT_READY;
1940 goto exit;
1941 }
1942 }
1943
1944 if (c)
1945 c->last_stripe = (i + 1) % ci->num_stripes;
1946
1947 context.stripes[i].stripestart = addr - offset;
1948 context.stripes[i].stripeend = context.stripes[i].stripestart + length;
1949
1950 if (file_read) {
1951 context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
1952
1953 if (!context.va) {
1954 ERR("out of memory\n");
1955 Status = STATUS_INSUFFICIENT_RESOURCES;
1956 goto exit;
1957 }
1958
1959 context.stripes[i].mdl = IoAllocateMdl(context.va, length, false, false, NULL);
1960 if (!context.stripes[i].mdl) {
1961 ERR("IoAllocateMdl failed\n");
1962 Status = STATUS_INSUFFICIENT_RESOURCES;
1963 goto exit;
1964 }
1965
1966 MmBuildMdlForNonPagedPool(context.stripes[i].mdl);
1967 } else {
1968 context.stripes[i].mdl = IoAllocateMdl(buf, length, false, false, NULL);
1969
1970 if (!context.stripes[i].mdl) {
1971 ERR("IoAllocateMdl failed\n");
1972 Status = STATUS_INSUFFICIENT_RESOURCES;
1973 goto exit;
1974 }
1975
1976 Status = STATUS_SUCCESS;
1977
1978 _SEH2_TRY {
1979 MmProbeAndLockPages(context.stripes[i].mdl, KernelMode, IoWriteAccess);
_SEH2_EXCEPT(EXCEPTION_EXECUTE_HANDLER)1980 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
1981 Status = _SEH2_GetExceptionCode();
1982 } _SEH2_END;
1983
1984 if (!NT_SUCCESS(Status)) {
1985 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
1986 goto exit;
1987 }
1988 }
1989 } else if (type == BLOCK_FLAG_RAID5) {
1990 uint64_t startoff, endoff;
1991 uint16_t endoffstripe, parity;
1992 uint32_t *stripeoff, pos;
1993 PMDL master_mdl;
1994 PFN_NUMBER *pfns, dummy = 0;
1995 bool need_dummy = false;
1996
1997 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 1, &startoff, &startoffstripe);
1998 get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 1, &endoff, &endoffstripe);
1999
2000 if (file_read) {
2001 context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2002
2003 if (!context.va) {
2004 ERR("out of memory\n");
2005 Status = STATUS_INSUFFICIENT_RESOURCES;
2006 goto exit;
2007 }
2008 } else
2009 context.va = buf;
2010
2011 master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2012 if (!master_mdl) {
2013 ERR("out of memory\n");
2014 Status = STATUS_INSUFFICIENT_RESOURCES;
2015 goto exit;
2016 }
2017
2018 Status = STATUS_SUCCESS;
2019
2020 _SEH2_TRY {
2021 MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
_SEH2_EXCEPT(EXCEPTION_EXECUTE_HANDLER)2022 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2023 Status = _SEH2_GetExceptionCode();
2024 } _SEH2_END;
2025
2026 if (!NT_SUCCESS(Status)) {
2027 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2028 IoFreeMdl(master_mdl);
2029 goto exit;
2030 }
2031
2032 pfns = (PFN_NUMBER*)(master_mdl + 1);
2033
2034 pos = 0;
2035 while (pos < length) {
2036 parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2037
2038 if (pos == 0) {
2039 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2040 ULONG skip, readlen;
2041
2042 i = startoffstripe;
2043 while (stripe != parity) {
2044 if (i == startoffstripe) {
2045 readlen = min(length, (ULONG)(ci->stripe_length - (startoff % ci->stripe_length)));
2046
2047 context.stripes[stripe].stripestart = startoff;
2048 context.stripes[stripe].stripeend = startoff + readlen;
2049
2050 pos += readlen;
2051
2052 if (pos == length)
2053 break;
2054 } else {
2055 readlen = min(length - pos, (ULONG)ci->stripe_length);
2056
2057 context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2058 context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2059
2060 pos += readlen;
2061
2062 if (pos == length)
2063 break;
2064 }
2065
2066 i++;
2067 stripe = (stripe + 1) % ci->num_stripes;
2068 }
2069
2070 if (pos == length)
2071 break;
2072
2073 for (i = 0; i < startoffstripe; i++) {
2074 uint16_t stripe2 = (parity + i + 1) % ci->num_stripes;
2075
2076 context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2077 }
2078
2079 context.stripes[parity].stripestart = context.stripes[parity].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2080
2081 if (length - pos > ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length) {
2082 skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 1) * ci->stripe_length)) - 1);
2083
2084 for (i = 0; i < ci->num_stripes; i++) {
2085 context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2086 }
2087
2088 pos += (uint32_t)(skip * (ci->num_stripes - 1) * ci->num_stripes * ci->stripe_length);
2089 need_dummy = true;
2090 }
2091 } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2092 for (i = 0; i < ci->num_stripes; i++) {
2093 context.stripes[i].stripeend += ci->stripe_length;
2094 }
2095
2096 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 1));
2097 need_dummy = true;
2098 } else {
2099 uint16_t stripe = (parity + 1) % ci->num_stripes;
2100
2101 i = 0;
2102 while (stripe != parity) {
2103 if (endoffstripe == i) {
2104 context.stripes[stripe].stripeend = endoff + 1;
2105 break;
2106 } else if (endoffstripe > i)
2107 context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2108
2109 i++;
2110 stripe = (stripe + 1) % ci->num_stripes;
2111 }
2112
2113 break;
2114 }
2115 }
2116
2117 for (i = 0; i < ci->num_stripes; i++) {
2118 if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2119 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart),
2120 false, false, NULL);
2121
2122 if (!context.stripes[i].mdl) {
2123 ERR("IoAllocateMdl failed\n");
2124 MmUnlockPages(master_mdl);
2125 IoFreeMdl(master_mdl);
2126 Status = STATUS_INSUFFICIENT_RESOURCES;
2127 goto exit;
2128 }
2129 }
2130 }
2131
2132 if (need_dummy) {
2133 dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2134 if (!dummypage) {
2135 ERR("out of memory\n");
2136 MmUnlockPages(master_mdl);
2137 IoFreeMdl(master_mdl);
2138 Status = STATUS_INSUFFICIENT_RESOURCES;
2139 goto exit;
2140 }
2141
2142 dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2143 if (!dummy_mdl) {
2144 ERR("IoAllocateMdl failed\n");
2145 MmUnlockPages(master_mdl);
2146 IoFreeMdl(master_mdl);
2147 Status = STATUS_INSUFFICIENT_RESOURCES;
2148 goto exit;
2149 }
2150
2151 MmBuildMdlForNonPagedPool(dummy_mdl);
2152
2153 dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2154 }
2155
2156 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2157 if (!stripeoff) {
2158 ERR("out of memory\n");
2159 MmUnlockPages(master_mdl);
2160 IoFreeMdl(master_mdl);
2161 Status = STATUS_INSUFFICIENT_RESOURCES;
2162 goto exit;
2163 }
2164
2165 RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2166
2167 pos = 0;
2168
2169 while (pos < length) {
2170 PFN_NUMBER* stripe_pfns;
2171
2172 parity = (((addr - offset + pos) / ((ci->num_stripes - 1) * ci->stripe_length)) + ci->num_stripes - 1) % ci->num_stripes;
2173
2174 if (pos == 0) {
2175 uint16_t stripe = (parity + startoffstripe + 1) % ci->num_stripes;
2176 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2177 ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2178
2179 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2180
2181 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2182
2183 stripeoff[stripe] = readlen;
2184 pos += readlen;
2185
2186 stripe = (stripe + 1) % ci->num_stripes;
2187
2188 while (stripe != parity) {
2189 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2190 readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2191
2192 if (readlen == 0)
2193 break;
2194
2195 RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2196
2197 stripeoff[stripe] = readlen;
2198 pos += readlen;
2199
2200 stripe = (stripe + 1) % ci->num_stripes;
2201 }
2202 } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 1)) {
2203 uint16_t stripe = (parity + 1) % ci->num_stripes;
2204 ULONG k;
2205
2206 while (stripe != parity) {
2207 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2208
2209 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2210
2211 stripeoff[stripe] += (uint32_t)ci->stripe_length;
2212 pos += (uint32_t)ci->stripe_length;
2213
2214 stripe = (stripe + 1) % ci->num_stripes;
2215 }
2216
2217 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity].mdl + 1);
2218
2219 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2220 stripe_pfns[stripeoff[parity] >> PAGE_SHIFT] = dummy;
2221 stripeoff[parity] += PAGE_SIZE;
2222 }
2223 } else {
2224 uint16_t stripe = (parity + 1) % ci->num_stripes;
2225 uint32_t readlen;
2226
2227 while (pos < length) {
2228 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2229 readlen = min(length - pos, (ULONG)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2230
2231 if (readlen == 0)
2232 break;
2233
2234 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2235
2236 stripeoff[stripe] += readlen;
2237 pos += readlen;
2238
2239 stripe = (stripe + 1) % ci->num_stripes;
2240 }
2241 }
2242 }
2243
2244 MmUnlockPages(master_mdl);
2245 IoFreeMdl(master_mdl);
2246
2247 ExFreePool(stripeoff);
2248 } else if (type == BLOCK_FLAG_RAID6) {
2249 uint64_t startoff, endoff;
2250 uint16_t endoffstripe, parity1;
2251 uint32_t *stripeoff, pos;
2252 PMDL master_mdl;
2253 PFN_NUMBER *pfns, dummy = 0;
2254 bool need_dummy = false;
2255
2256 get_raid0_offset(addr - offset, ci->stripe_length, ci->num_stripes - 2, &startoff, &startoffstripe);
2257 get_raid0_offset(addr + length - offset - 1, ci->stripe_length, ci->num_stripes - 2, &endoff, &endoffstripe);
2258
2259 if (file_read) {
2260 context.va = ExAllocatePoolWithTag(NonPagedPool, length, ALLOC_TAG);
2261
2262 if (!context.va) {
2263 ERR("out of memory\n");
2264 Status = STATUS_INSUFFICIENT_RESOURCES;
2265 goto exit;
2266 }
2267 } else
2268 context.va = buf;
2269
2270 master_mdl = IoAllocateMdl(context.va, length, false, false, NULL);
2271 if (!master_mdl) {
2272 ERR("out of memory\n");
2273 Status = STATUS_INSUFFICIENT_RESOURCES;
2274 goto exit;
2275 }
2276
2277 Status = STATUS_SUCCESS;
2278
2279 _SEH2_TRY {
2280 MmProbeAndLockPages(master_mdl, KernelMode, IoWriteAccess);
_SEH2_EXCEPT(EXCEPTION_EXECUTE_HANDLER)2281 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
2282 Status = _SEH2_GetExceptionCode();
2283 } _SEH2_END;
2284
2285 if (!NT_SUCCESS(Status)) {
2286 ERR("MmProbeAndLockPages threw exception %08lx\n", Status);
2287 IoFreeMdl(master_mdl);
2288 goto exit;
2289 }
2290
2291 pfns = (PFN_NUMBER*)(master_mdl + 1);
2292
2293 pos = 0;
2294 while (pos < length) {
2295 parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
2296
2297 if (pos == 0) {
2298 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes, parity2;
2299 ULONG skip, readlen;
2300
2301 i = startoffstripe;
2302 while (stripe != parity1) {
2303 if (i == startoffstripe) {
2304 readlen = (ULONG)min(length, ci->stripe_length - (startoff % ci->stripe_length));
2305
2306 context.stripes[stripe].stripestart = startoff;
2307 context.stripes[stripe].stripeend = startoff + readlen;
2308
2309 pos += readlen;
2310
2311 if (pos == length)
2312 break;
2313 } else {
2314 readlen = min(length - pos, (ULONG)ci->stripe_length);
2315
2316 context.stripes[stripe].stripestart = startoff - (startoff % ci->stripe_length);
2317 context.stripes[stripe].stripeend = context.stripes[stripe].stripestart + readlen;
2318
2319 pos += readlen;
2320
2321 if (pos == length)
2322 break;
2323 }
2324
2325 i++;
2326 stripe = (stripe + 1) % ci->num_stripes;
2327 }
2328
2329 if (pos == length)
2330 break;
2331
2332 for (i = 0; i < startoffstripe; i++) {
2333 uint16_t stripe2 = (parity1 + i + 2) % ci->num_stripes;
2334
2335 context.stripes[stripe2].stripestart = context.stripes[stripe2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2336 }
2337
2338 context.stripes[parity1].stripestart = context.stripes[parity1].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2339
2340 parity2 = (parity1 + 1) % ci->num_stripes;
2341 context.stripes[parity2].stripestart = context.stripes[parity2].stripeend = startoff - (startoff % ci->stripe_length) + ci->stripe_length;
2342
2343 if (length - pos > ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length) {
2344 skip = (ULONG)(((length - pos) / (ci->num_stripes * (ci->num_stripes - 2) * ci->stripe_length)) - 1);
2345
2346 for (i = 0; i < ci->num_stripes; i++) {
2347 context.stripes[i].stripeend += skip * ci->num_stripes * ci->stripe_length;
2348 }
2349
2350 pos += (uint32_t)(skip * (ci->num_stripes - 2) * ci->num_stripes * ci->stripe_length);
2351 need_dummy = true;
2352 }
2353 } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2354 for (i = 0; i < ci->num_stripes; i++) {
2355 context.stripes[i].stripeend += ci->stripe_length;
2356 }
2357
2358 pos += (uint32_t)(ci->stripe_length * (ci->num_stripes - 2));
2359 need_dummy = true;
2360 } else {
2361 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2362
2363 i = 0;
2364 while (stripe != parity1) {
2365 if (endoffstripe == i) {
2366 context.stripes[stripe].stripeend = endoff + 1;
2367 break;
2368 } else if (endoffstripe > i)
2369 context.stripes[stripe].stripeend = endoff - (endoff % ci->stripe_length) + ci->stripe_length;
2370
2371 i++;
2372 stripe = (stripe + 1) % ci->num_stripes;
2373 }
2374
2375 break;
2376 }
2377 }
2378
2379 for (i = 0; i < ci->num_stripes; i++) {
2380 if (context.stripes[i].stripestart != context.stripes[i].stripeend) {
2381 context.stripes[i].mdl = IoAllocateMdl(context.va, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), false, false, NULL);
2382
2383 if (!context.stripes[i].mdl) {
2384 ERR("IoAllocateMdl failed\n");
2385 MmUnlockPages(master_mdl);
2386 IoFreeMdl(master_mdl);
2387 Status = STATUS_INSUFFICIENT_RESOURCES;
2388 goto exit;
2389 }
2390 }
2391 }
2392
2393 if (need_dummy) {
2394 dummypage = ExAllocatePoolWithTag(NonPagedPool, PAGE_SIZE, ALLOC_TAG);
2395 if (!dummypage) {
2396 ERR("out of memory\n");
2397 MmUnlockPages(master_mdl);
2398 IoFreeMdl(master_mdl);
2399 Status = STATUS_INSUFFICIENT_RESOURCES;
2400 goto exit;
2401 }
2402
2403 dummy_mdl = IoAllocateMdl(dummypage, PAGE_SIZE, false, false, NULL);
2404 if (!dummy_mdl) {
2405 ERR("IoAllocateMdl failed\n");
2406 MmUnlockPages(master_mdl);
2407 IoFreeMdl(master_mdl);
2408 Status = STATUS_INSUFFICIENT_RESOURCES;
2409 goto exit;
2410 }
2411
2412 MmBuildMdlForNonPagedPool(dummy_mdl);
2413
2414 dummy = *(PFN_NUMBER*)(dummy_mdl + 1);
2415 }
2416
2417 stripeoff = ExAllocatePoolWithTag(NonPagedPool, sizeof(uint32_t) * ci->num_stripes, ALLOC_TAG);
2418 if (!stripeoff) {
2419 ERR("out of memory\n");
2420 MmUnlockPages(master_mdl);
2421 IoFreeMdl(master_mdl);
2422 Status = STATUS_INSUFFICIENT_RESOURCES;
2423 goto exit;
2424 }
2425
2426 RtlZeroMemory(stripeoff, sizeof(uint32_t) * ci->num_stripes);
2427
2428 pos = 0;
2429
2430 while (pos < length) {
2431 PFN_NUMBER* stripe_pfns;
2432
2433 parity1 = (((addr - offset + pos) / ((ci->num_stripes - 2) * ci->stripe_length)) + ci->num_stripes - 2) % ci->num_stripes;
2434
2435 if (pos == 0) {
2436 uint16_t stripe = (parity1 + startoffstripe + 2) % ci->num_stripes;
2437 uint32_t readlen = min(length - pos, (uint32_t)min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart,
2438 ci->stripe_length - (context.stripes[stripe].stripestart % ci->stripe_length)));
2439
2440 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2441
2442 RtlCopyMemory(stripe_pfns, pfns, readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2443
2444 stripeoff[stripe] = readlen;
2445 pos += readlen;
2446
2447 stripe = (stripe + 1) % ci->num_stripes;
2448
2449 while (stripe != parity1) {
2450 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2451 readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2452
2453 if (readlen == 0)
2454 break;
2455
2456 RtlCopyMemory(stripe_pfns, &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2457
2458 stripeoff[stripe] = readlen;
2459 pos += readlen;
2460
2461 stripe = (stripe + 1) % ci->num_stripes;
2462 }
2463 } else if (length - pos >= ci->stripe_length * (ci->num_stripes - 2)) {
2464 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2465 uint16_t parity2 = (parity1 + 1) % ci->num_stripes;
2466 ULONG k;
2467
2468 while (stripe != parity1) {
2469 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2470
2471 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], (ULONG)(ci->stripe_length * sizeof(PFN_NUMBER) >> PAGE_SHIFT));
2472
2473 stripeoff[stripe] += (uint32_t)ci->stripe_length;
2474 pos += (uint32_t)ci->stripe_length;
2475
2476 stripe = (stripe + 1) % ci->num_stripes;
2477 }
2478
2479 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity1].mdl + 1);
2480
2481 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2482 stripe_pfns[stripeoff[parity1] >> PAGE_SHIFT] = dummy;
2483 stripeoff[parity1] += PAGE_SIZE;
2484 }
2485
2486 stripe_pfns = (PFN_NUMBER*)(context.stripes[parity2].mdl + 1);
2487
2488 for (k = 0; k < ci->stripe_length >> PAGE_SHIFT; k++) {
2489 stripe_pfns[stripeoff[parity2] >> PAGE_SHIFT] = dummy;
2490 stripeoff[parity2] += PAGE_SIZE;
2491 }
2492 } else {
2493 uint16_t stripe = (parity1 + 2) % ci->num_stripes;
2494 uint32_t readlen;
2495
2496 while (pos < length) {
2497 stripe_pfns = (PFN_NUMBER*)(context.stripes[stripe].mdl + 1);
2498 readlen = (uint32_t)min(length - pos, min(context.stripes[stripe].stripeend - context.stripes[stripe].stripestart, ci->stripe_length));
2499
2500 if (readlen == 0)
2501 break;
2502
2503 RtlCopyMemory(&stripe_pfns[stripeoff[stripe] >> PAGE_SHIFT], &pfns[pos >> PAGE_SHIFT], readlen * sizeof(PFN_NUMBER) >> PAGE_SHIFT);
2504
2505 stripeoff[stripe] += readlen;
2506 pos += readlen;
2507
2508 stripe = (stripe + 1) % ci->num_stripes;
2509 }
2510 }
2511 }
2512
2513 MmUnlockPages(master_mdl);
2514 IoFreeMdl(master_mdl);
2515
2516 ExFreePool(stripeoff);
2517 }
2518
2519 context.address = addr;
2520
2521 for (i = 0; i < ci->num_stripes; i++) {
2522 if (!devices[i] || !devices[i]->devobj || context.stripes[i].stripestart == context.stripes[i].stripeend) {
2523 context.stripes[i].status = ReadDataStatus_MissingDevice;
2524 context.stripes_left--;
2525
2526 if (!devices[i] || !devices[i]->devobj)
2527 missing_devices++;
2528 }
2529 }
2530
2531 if (missing_devices > allowed_missing) {
2532 ERR("not enough devices to service request (%u missing)\n", missing_devices);
2533 Status = STATUS_UNEXPECTED_IO_ERROR;
2534 goto exit;
2535 }
2536
2537 for (i = 0; i < ci->num_stripes; i++) {
2538 PIO_STACK_LOCATION IrpSp;
2539
2540 if (devices[i] && devices[i]->devobj && context.stripes[i].stripestart != context.stripes[i].stripeend && context.stripes[i].status != ReadDataStatus_Skip) {
2541 context.stripes[i].context = (struct read_data_context*)&context;
2542
2543 if (type == BLOCK_FLAG_RAID10) {
2544 context.stripes[i].stripenum = i / ci->sub_stripes;
2545 }
2546
2547 if (!Irp) {
2548 context.stripes[i].Irp = IoAllocateIrp(devices[i]->devobj->StackSize, false);
2549
2550 if (!context.stripes[i].Irp) {
2551 ERR("IoAllocateIrp failed\n");
2552 Status = STATUS_INSUFFICIENT_RESOURCES;
2553 goto exit;
2554 }
2555 } else {
2556 context.stripes[i].Irp = IoMakeAssociatedIrp(Irp, devices[i]->devobj->StackSize);
2557
2558 if (!context.stripes[i].Irp) {
2559 ERR("IoMakeAssociatedIrp failed\n");
2560 Status = STATUS_INSUFFICIENT_RESOURCES;
2561 goto exit;
2562 }
2563 }
2564
2565 IrpSp = IoGetNextIrpStackLocation(context.stripes[i].Irp);
2566 IrpSp->MajorFunction = IRP_MJ_READ;
2567 IrpSp->MinorFunction = IRP_MN_NORMAL;
2568 IrpSp->FileObject = devices[i]->fileobj;
2569
2570 if (devices[i]->devobj->Flags & DO_BUFFERED_IO) {
2571 context.stripes[i].Irp->AssociatedIrp.SystemBuffer = ExAllocatePoolWithTag(NonPagedPool, (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart), ALLOC_TAG);
2572 if (!context.stripes[i].Irp->AssociatedIrp.SystemBuffer) {
2573 ERR("out of memory\n");
2574 Status = STATUS_INSUFFICIENT_RESOURCES;
2575 goto exit;
2576 }
2577
2578 context.stripes[i].Irp->Flags |= IRP_BUFFERED_IO | IRP_DEALLOCATE_BUFFER | IRP_INPUT_OPERATION;
2579
2580 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2581 } else if (devices[i]->devobj->Flags & DO_DIRECT_IO)
2582 context.stripes[i].Irp->MdlAddress = context.stripes[i].mdl;
2583 else
2584 context.stripes[i].Irp->UserBuffer = MmGetSystemAddressForMdlSafe(context.stripes[i].mdl, priority);
2585
2586 IrpSp->Parameters.Read.Length = (ULONG)(context.stripes[i].stripeend - context.stripes[i].stripestart);
2587 IrpSp->Parameters.Read.ByteOffset.QuadPart = context.stripes[i].stripestart + cis[i].offset;
2588
2589 total_reading += IrpSp->Parameters.Read.Length;
2590
2591 context.stripes[i].Irp->UserIosb = &context.stripes[i].iosb;
2592
2593 IoSetCompletionRoutine(context.stripes[i].Irp, read_data_completion, &context.stripes[i], true, true, true);
2594
2595 context.stripes[i].status = ReadDataStatus_Pending;
2596 }
2597 }
2598
2599 need_to_wait = false;
2600 for (i = 0; i < ci->num_stripes; i++) {
2601 if (context.stripes[i].status != ReadDataStatus_MissingDevice && context.stripes[i].status != ReadDataStatus_Skip) {
2602 IoCallDriver(devices[i]->devobj, context.stripes[i].Irp);
2603 need_to_wait = true;
2604 }
2605 }
2606
2607 if (need_to_wait)
2608 KeWaitForSingleObject(&context.Event, Executive, KernelMode, false, NULL);
2609
2610 if (diskacc)
2611 fFsRtlUpdateDiskCounters(total_reading, 0);
2612
2613 // check if any of the devices return a "user-induced" error
2614
2615 for (i = 0; i < ci->num_stripes; i++) {
2616 if (context.stripes[i].status == ReadDataStatus_Error && IoIsErrorUserInduced(context.stripes[i].iosb.Status)) {
2617 Status = context.stripes[i].iosb.Status;
2618 goto exit;
2619 }
2620 }
2621
2622 if (type == BLOCK_FLAG_RAID0) {
2623 Status = read_data_raid0(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2624 if (!NT_SUCCESS(Status)) {
2625 ERR("read_data_raid0 returned %08lx\n", Status);
2626
2627 if (file_read)
2628 ExFreePool(context.va);
2629
2630 goto exit;
2631 }
2632
2633 if (file_read) {
2634 RtlCopyMemory(buf, context.va, length);
2635 ExFreePool(context.va);
2636 }
2637 } else if (type == BLOCK_FLAG_RAID10) {
2638 Status = read_data_raid10(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, generation, offset);
2639
2640 if (!NT_SUCCESS(Status)) {
2641 ERR("read_data_raid10 returned %08lx\n", Status);
2642
2643 if (file_read)
2644 ExFreePool(context.va);
2645
2646 goto exit;
2647 }
2648
2649 if (file_read) {
2650 RtlCopyMemory(buf, context.va, length);
2651 ExFreePool(context.va);
2652 }
2653 } else if (type == BLOCK_FLAG_DUPLICATE) {
2654 Status = read_data_dup(Vcb, file_read ? context.va : buf, addr, &context, ci, devices, generation);
2655 if (!NT_SUCCESS(Status)) {
2656 ERR("read_data_dup returned %08lx\n", Status);
2657
2658 if (file_read)
2659 ExFreePool(context.va);
2660
2661 goto exit;
2662 }
2663
2664 if (file_read) {
2665 RtlCopyMemory(buf, context.va, length);
2666 ExFreePool(context.va);
2667 }
2668 } else if (type == BLOCK_FLAG_RAID5) {
2669 Status = read_data_raid5(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2670 if (!NT_SUCCESS(Status)) {
2671 ERR("read_data_raid5 returned %08lx\n", Status);
2672
2673 if (file_read)
2674 ExFreePool(context.va);
2675
2676 goto exit;
2677 }
2678
2679 if (file_read) {
2680 RtlCopyMemory(buf, context.va, length);
2681 ExFreePool(context.va);
2682 }
2683 } else if (type == BLOCK_FLAG_RAID6) {
2684 Status = read_data_raid6(Vcb, file_read ? context.va : buf, addr, length, &context, ci, devices, offset, generation, c, missing_devices > 0 ? true : false);
2685 if (!NT_SUCCESS(Status)) {
2686 ERR("read_data_raid6 returned %08lx\n", Status);
2687
2688 if (file_read)
2689 ExFreePool(context.va);
2690
2691 goto exit;
2692 }
2693
2694 if (file_read) {
2695 RtlCopyMemory(buf, context.va, length);
2696 ExFreePool(context.va);
2697 }
2698 }
2699
2700 exit:
2701 if (c && (type == BLOCK_FLAG_RAID5 || type == BLOCK_FLAG_RAID6))
2702 chunk_unlock_range(Vcb, c, lockaddr, locklen);
2703
2704 if (dummy_mdl)
2705 IoFreeMdl(dummy_mdl);
2706
2707 if (dummypage)
2708 ExFreePool(dummypage);
2709
2710 for (i = 0; i < ci->num_stripes; i++) {
2711 if (context.stripes[i].mdl) {
2712 if (context.stripes[i].mdl->MdlFlags & MDL_PAGES_LOCKED)
2713 MmUnlockPages(context.stripes[i].mdl);
2714
2715 IoFreeMdl(context.stripes[i].mdl);
2716 }
2717
2718 if (context.stripes[i].Irp)
2719 IoFreeIrp(context.stripes[i].Irp);
2720 }
2721
2722 ExFreePool(context.stripes);
2723
2724 if (!Vcb->log_to_phys_loaded)
2725 ExFreePool(devices);
2726
2727 return Status;
2728 }
2729
2730 __attribute__((nonnull(1, 2)))
read_stream(fcb * fcb,uint8_t * data,uint64_t start,ULONG length,ULONG * pbr)2731 NTSTATUS read_stream(fcb* fcb, uint8_t* data, uint64_t start, ULONG length, ULONG* pbr) {
2732 ULONG readlen;
2733
2734 TRACE("(%p, %p, %I64x, %lx, %p)\n", fcb, data, start, length, pbr);
2735
2736 if (pbr) *pbr = 0;
2737
2738 if (start >= fcb->adsdata.Length) {
2739 TRACE("tried to read beyond end of stream\n");
2740 return STATUS_END_OF_FILE;
2741 }
2742
2743 if (length == 0) {
2744 WARN("tried to read zero bytes\n");
2745 return STATUS_SUCCESS;
2746 }
2747
2748 if (start + length < fcb->adsdata.Length)
2749 readlen = length;
2750 else
2751 readlen = fcb->adsdata.Length - (ULONG)start;
2752
2753 if (readlen > 0)
2754 RtlCopyMemory(data, fcb->adsdata.Buffer + start, readlen);
2755
2756 if (pbr) *pbr = readlen;
2757
2758 return STATUS_SUCCESS;
2759 }
2760
/* One element of the extents[] array embedded in read_part (declared below).
 * How the fields are filled in is not visible in this part of the file; the
 * notes on each field are inferred from names only.
 * NOTE(review): the ed_ prefix suggests values taken from an extent's
 * EXTENT_DATA / EXTENT_DATA2 record - confirm against read_file. */
2761 typedef struct {
2762 uint64_t off; // presumably an offset for this extent - TODO confirm what it is relative to
2763 uint64_t ed_size; // presumably the extent's size field - TODO confirm
2764 uint64_t ed_offset; // presumably the extent's offset field - TODO confirm
2765 uint64_t ed_num_bytes; // presumably the extent's num_bytes field - TODO confirm
2766 } read_part_extent;
2767
2768 typedef struct {
2769 LIST_ENTRY list_entry;
2770 uint64_t addr;
2771 chunk* c;
2772 uint32_t read;
2773 uint32_t to_read;
2774 void* csum;
2775 bool csum_free;
2776 uint8_t* buf;
2777 bool buf_free;
2778 uint32_t bumpoff;
2779 bool mdl;
2780 void* data;
2781 uint8_t compression;
2782 unsigned int num_extents;
2783 read_part_extent extents[1];
2784 } read_part;
2785
2786 typedef struct {
2787 LIST_ENTRY list_entry;
2788 calc_job* cj;
2789 void* decomp;
2790 void* data;
2791 unsigned int offset;
2792 size_t length;
2793 } comp_calc_job;
2794
2795 __attribute__((nonnull(1, 2)))
read_file(fcb * fcb,uint8_t * data,uint64_t start,uint64_t length,ULONG * pbr,PIRP Irp)2796 NTSTATUS read_file(fcb* fcb, uint8_t* data, uint64_t start, uint64_t length, ULONG* pbr, PIRP Irp) {
2797 NTSTATUS Status;
2798 uint32_t bytes_read = 0;
2799 uint64_t last_end;
2800 LIST_ENTRY* le;
2801 POOL_TYPE pool_type;
2802 LIST_ENTRY read_parts, calc_jobs;
2803
2804 TRACE("(%p, %p, %I64x, %I64x, %p)\n", fcb, data, start, length, pbr);
2805
2806 if (pbr)
2807 *pbr = 0;
2808
2809 if (start >= fcb->inode_item.st_size) {
2810 WARN("Tried to read beyond end of file\n");
2811 return STATUS_END_OF_FILE;
2812 }
2813
2814 InitializeListHead(&read_parts);
2815 InitializeListHead(&calc_jobs);
2816
2817 pool_type = fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? NonPagedPool : PagedPool;
2818
2819 le = fcb->extents.Flink;
2820
2821 last_end = start;
2822
2823 while (le != &fcb->extents) {
2824 extent* ext = CONTAINING_RECORD(le, extent, list_entry);
2825
2826 if (!ext->ignore) {
2827 EXTENT_DATA* ed = &ext->extent_data;
2828 uint64_t len;
2829
2830 if (ed->type == EXTENT_TYPE_REGULAR || ed->type == EXTENT_TYPE_PREALLOC)
2831 len = ((EXTENT_DATA2*)ed->data)->num_bytes;
2832 else
2833 len = ed->decoded_size;
2834
2835 if (ext->offset + len <= start) {
2836 last_end = ext->offset + len;
2837 goto nextitem;
2838 }
2839
2840 if (ext->offset > last_end && ext->offset > start + bytes_read) {
2841 uint32_t read = (uint32_t)min(length, ext->offset - max(start, last_end));
2842
2843 RtlZeroMemory(data + bytes_read, read);
2844 bytes_read += read;
2845 length -= read;
2846 }
2847
2848 if (length == 0 || ext->offset > start + bytes_read + length)
2849 break;
2850
2851 if (ed->encryption != BTRFS_ENCRYPTION_NONE) {
2852 WARN("Encryption not supported\n");
2853 Status = STATUS_NOT_IMPLEMENTED;
2854 goto exit;
2855 }
2856
2857 if (ed->encoding != BTRFS_ENCODING_NONE) {
2858 WARN("Other encodings not supported\n");
2859 Status = STATUS_NOT_IMPLEMENTED;
2860 goto exit;
2861 }
2862
2863 switch (ed->type) {
2864 case EXTENT_TYPE_INLINE:
2865 {
2866 uint64_t off = start + bytes_read - ext->offset;
2867 uint32_t read;
2868
2869 if (ed->compression == BTRFS_COMPRESSION_NONE) {
2870 read = (uint32_t)min(min(len, ext->datalen) - off, length);
2871
2872 RtlCopyMemory(data + bytes_read, &ed->data[off], read);
2873 } else if (ed->compression == BTRFS_COMPRESSION_ZLIB || ed->compression == BTRFS_COMPRESSION_LZO || ed->compression == BTRFS_COMPRESSION_ZSTD) {
2874 uint8_t* decomp;
2875 bool decomp_alloc;
2876 uint16_t inlen = ext->datalen - (uint16_t)offsetof(EXTENT_DATA, data[0]);
2877
2878 if (ed->decoded_size == 0 || ed->decoded_size > 0xffffffff) {
2879 ERR("ed->decoded_size was invalid (%I64x)\n", ed->decoded_size);
2880 Status = STATUS_INTERNAL_ERROR;
2881 goto exit;
2882 }
2883
2884 read = (uint32_t)min(ed->decoded_size - off, length);
2885
2886 if (off > 0) {
2887 decomp = ExAllocatePoolWithTag(NonPagedPool, (uint32_t)ed->decoded_size, ALLOC_TAG);
2888 if (!decomp) {
2889 ERR("out of memory\n");
2890 Status = STATUS_INSUFFICIENT_RESOURCES;
2891 goto exit;
2892 }
2893
2894 decomp_alloc = true;
2895 } else {
2896 decomp = data + bytes_read;
2897 decomp_alloc = false;
2898 }
2899
2900 if (ed->compression == BTRFS_COMPRESSION_ZLIB) {
2901 Status = zlib_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2902 if (!NT_SUCCESS(Status)) {
2903 ERR("zlib_decompress returned %08lx\n", Status);
2904 if (decomp_alloc) ExFreePool(decomp);
2905 goto exit;
2906 }
2907 } else if (ed->compression == BTRFS_COMPRESSION_LZO) {
2908 if (inlen < sizeof(uint32_t)) {
2909 ERR("extent data was truncated\n");
2910 Status = STATUS_INTERNAL_ERROR;
2911 if (decomp_alloc) ExFreePool(decomp);
2912 goto exit;
2913 } else
2914 inlen -= sizeof(uint32_t);
2915
2916 Status = lzo_decompress(ed->data + sizeof(uint32_t), inlen, decomp, (uint32_t)(read + off), sizeof(uint32_t));
2917 if (!NT_SUCCESS(Status)) {
2918 ERR("lzo_decompress returned %08lx\n", Status);
2919 if (decomp_alloc) ExFreePool(decomp);
2920 goto exit;
2921 }
2922 } else if (ed->compression == BTRFS_COMPRESSION_ZSTD) {
2923 Status = zstd_decompress(ed->data, inlen, decomp, (uint32_t)(read + off));
2924 if (!NT_SUCCESS(Status)) {
2925 ERR("zstd_decompress returned %08lx\n", Status);
2926 if (decomp_alloc) ExFreePool(decomp);
2927 goto exit;
2928 }
2929 }
2930
2931 if (decomp_alloc) {
2932 RtlCopyMemory(data + bytes_read, decomp + off, read);
2933 ExFreePool(decomp);
2934 }
2935 } else {
2936 ERR("unhandled compression type %x\n", ed->compression);
2937 Status = STATUS_NOT_IMPLEMENTED;
2938 goto exit;
2939 }
2940
2941 bytes_read += read;
2942 length -= read;
2943
2944 break;
2945 }
2946
2947 case EXTENT_TYPE_REGULAR:
2948 {
2949 EXTENT_DATA2* ed2 = (EXTENT_DATA2*)ed->data;
2950 read_part* rp;
2951
2952 rp = ExAllocatePoolWithTag(pool_type, sizeof(read_part), ALLOC_TAG);
2953 if (!rp) {
2954 ERR("out of memory\n");
2955 Status = STATUS_INSUFFICIENT_RESOURCES;
2956 goto exit;
2957 }
2958
2959 rp->mdl = (Irp && Irp->MdlAddress) ? true : false;
2960 rp->extents[0].off = start + bytes_read - ext->offset;
2961 rp->bumpoff = 0;
2962 rp->num_extents = 1;
2963 rp->csum_free = false;
2964
2965 rp->read = (uint32_t)(len - rp->extents[0].off);
2966 if (rp->read > length) rp->read = (uint32_t)length;
2967
2968 if (ed->compression == BTRFS_COMPRESSION_NONE) {
2969 rp->addr = ed2->address + ed2->offset + rp->extents[0].off;
2970 rp->to_read = (uint32_t)sector_align(rp->read, fcb->Vcb->superblock.sector_size);
2971
2972 if (rp->addr & (fcb->Vcb->superblock.sector_size - 1)) {
2973 rp->bumpoff = rp->addr & (fcb->Vcb->superblock.sector_size - 1);
2974 rp->addr -= rp->bumpoff;
2975 rp->to_read = (uint32_t)sector_align(rp->read + rp->bumpoff, fcb->Vcb->superblock.sector_size);
2976 }
2977 } else {
2978 rp->addr = ed2->address;
2979 rp->to_read = (uint32_t)sector_align(ed2->size, fcb->Vcb->superblock.sector_size);
2980 }
2981
2982 if (ed->compression == BTRFS_COMPRESSION_NONE && (start & (fcb->Vcb->superblock.sector_size - 1)) == 0 &&
2983 (length & (fcb->Vcb->superblock.sector_size - 1)) == 0) {
2984 rp->buf = data + bytes_read;
2985 rp->buf_free = false;
2986 } else {
2987 rp->buf = ExAllocatePoolWithTag(pool_type, rp->to_read, ALLOC_TAG);
2988 rp->buf_free = true;
2989
2990 if (!rp->buf) {
2991 ERR("out of memory\n");
2992 Status = STATUS_INSUFFICIENT_RESOURCES;
2993 ExFreePool(rp);
2994 goto exit;
2995 }
2996
2997 rp->mdl = false;
2998 }
2999
3000 rp->c = get_chunk_from_address(fcb->Vcb, rp->addr);
3001
3002 if (!rp->c) {
3003 ERR("get_chunk_from_address(%I64x) failed\n", rp->addr);
3004
3005 if (rp->buf_free)
3006 ExFreePool(rp->buf);
3007
3008 ExFreePool(rp);
3009
3010 Status = STATUS_INTERNAL_ERROR;
3011 goto exit;
3012 }
3013
3014 if (ext->csum) {
3015 if (ed->compression == BTRFS_COMPRESSION_NONE) {
3016 rp->csum = (uint8_t*)ext->csum + (fcb->Vcb->csum_size * (rp->extents[0].off >> fcb->Vcb->sector_shift));
3017 } else
3018 rp->csum = ext->csum;
3019 } else
3020 rp->csum = NULL;
3021
3022 rp->data = data + bytes_read;
3023 rp->compression = ed->compression;
3024 rp->extents[0].ed_offset = ed2->offset;
3025 rp->extents[0].ed_size = ed2->size;
3026 rp->extents[0].ed_num_bytes = ed2->num_bytes;
3027
3028 InsertTailList(&read_parts, &rp->list_entry);
3029
3030 bytes_read += rp->read;
3031 length -= rp->read;
3032
3033 break;
3034 }
3035
3036 case EXTENT_TYPE_PREALLOC:
3037 {
3038 uint64_t off = start + bytes_read - ext->offset;
3039 uint32_t read = (uint32_t)(len - off);
3040
3041 if (read > length) read = (uint32_t)length;
3042
3043 RtlZeroMemory(data + bytes_read, read);
3044
3045 bytes_read += read;
3046 length -= read;
3047
3048 break;
3049 }
3050
3051 default:
3052 WARN("Unsupported extent data type %u\n", ed->type);
3053 Status = STATUS_NOT_IMPLEMENTED;
3054 goto exit;
3055 }
3056
3057 last_end = ext->offset + len;
3058
3059 if (length == 0)
3060 break;
3061 }
3062
3063 nextitem:
3064 le = le->Flink;
3065 }
3066
3067 if (!IsListEmpty(&read_parts) && read_parts.Flink->Flink != &read_parts) { // at least two entries in list
3068 read_part* last_rp = CONTAINING_RECORD(read_parts.Flink, read_part, list_entry);
3069
3070 le = read_parts.Flink->Flink;
3071 while (le != &read_parts) {
3072 LIST_ENTRY* le2 = le->Flink;
3073 read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3074
3075 // merge together runs
3076 if (rp->compression != BTRFS_COMPRESSION_NONE && rp->compression == last_rp->compression && rp->addr == last_rp->addr + last_rp->to_read &&
3077 rp->data == (uint8_t*)last_rp->data + last_rp->read && rp->c == last_rp->c && ((rp->csum && last_rp->csum) || (!rp->csum && !last_rp->csum))) {
3078 read_part* rp2;
3079
3080 rp2 = ExAllocatePoolWithTag(pool_type, offsetof(read_part, extents) + (sizeof(read_part_extent) * (last_rp->num_extents + 1)), ALLOC_TAG);
3081
3082 rp2->addr = last_rp->addr;
3083 rp2->c = last_rp->c;
3084 rp2->read = last_rp->read + rp->read;
3085 rp2->to_read = last_rp->to_read + rp->to_read;
3086 rp2->csum_free = false;
3087
3088 if (last_rp->csum) {
3089 uint32_t sectors = (last_rp->to_read + rp->to_read) >> fcb->Vcb->sector_shift;
3090
3091 rp2->csum = ExAllocatePoolWithTag(pool_type, sectors * fcb->Vcb->csum_size, ALLOC_TAG);
3092 if (!rp2->csum) {
3093 ERR("out of memory\n");
3094 ExFreePool(rp2);
3095 Status = STATUS_INSUFFICIENT_RESOURCES;
3096 goto exit;
3097 }
3098
3099 RtlCopyMemory(rp2->csum, last_rp->csum, (last_rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift);
3100 RtlCopyMemory((uint8_t*)rp2->csum + ((last_rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift), rp->csum,
3101 (rp->to_read * fcb->Vcb->csum_size) >> fcb->Vcb->sector_shift);
3102
3103 rp2->csum_free = true;
3104 } else
3105 rp2->csum = NULL;
3106
3107 rp2->buf = ExAllocatePoolWithTag(pool_type, rp2->to_read, ALLOC_TAG);
3108 if (!rp2->buf) {
3109 ERR("out of memory\n");
3110
3111 if (rp2->csum)
3112 ExFreePool(rp2->csum);
3113
3114 ExFreePool(rp2);
3115 Status = STATUS_INSUFFICIENT_RESOURCES;
3116 goto exit;
3117 }
3118
3119 rp2->buf_free = true;
3120 rp2->bumpoff = 0;
3121 rp2->mdl = false;
3122 rp2->data = last_rp->data;
3123 rp2->compression = last_rp->compression;
3124 rp2->num_extents = last_rp->num_extents + 1;
3125
3126 RtlCopyMemory(rp2->extents, last_rp->extents, last_rp->num_extents * sizeof(read_part_extent));
3127 RtlCopyMemory(&rp2->extents[last_rp->num_extents], rp->extents, sizeof(read_part_extent));
3128
3129 InsertHeadList(le->Blink, &rp2->list_entry);
3130
3131 if (rp->buf_free)
3132 ExFreePool(rp->buf);
3133
3134 if (rp->csum_free)
3135 ExFreePool(rp->csum);
3136
3137 RemoveEntryList(&rp->list_entry);
3138
3139 ExFreePool(rp);
3140
3141 if (last_rp->buf_free)
3142 ExFreePool(last_rp->buf);
3143
3144 if (last_rp->csum_free)
3145 ExFreePool(last_rp->csum);
3146
3147 RemoveEntryList(&last_rp->list_entry);
3148
3149 ExFreePool(last_rp);
3150
3151 last_rp = rp2;
3152 } else
3153 last_rp = rp;
3154
3155 le = le2;
3156 }
3157 }
3158
3159 le = read_parts.Flink;
3160 while (le != &read_parts) {
3161 read_part* rp = CONTAINING_RECORD(le, read_part, list_entry);
3162
3163 Status = read_data(fcb->Vcb, rp->addr, rp->to_read, rp->csum, false, rp->buf, rp->c, NULL, Irp, 0, rp->mdl,
3164 fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3165 if (!NT_SUCCESS(Status)) {
3166 ERR("read_data returned %08lx\n", Status);
3167 goto exit;
3168 }
3169
3170 if (rp->compression == BTRFS_COMPRESSION_NONE) {
3171 if (rp->buf_free)
3172 RtlCopyMemory(rp->data, rp->buf + rp->bumpoff, rp->read);
3173 } else {
3174 uint8_t* buf = rp->buf;
3175
3176 for (unsigned int i = 0; i < rp->num_extents; i++) {
3177 uint8_t *decomp = NULL, *buf2;
3178 ULONG outlen, inlen, off2;
3179 uint32_t inpageoff = 0;
3180 comp_calc_job* ccj;
3181
3182 off2 = (ULONG)(rp->extents[i].ed_offset + rp->extents[i].off);
3183 buf2 = buf;
3184 inlen = (ULONG)rp->extents[i].ed_size;
3185
3186 if (rp->compression == BTRFS_COMPRESSION_LZO) {
3187 ULONG inoff = sizeof(uint32_t);
3188
3189 inlen -= sizeof(uint32_t);
3190
3191 // If reading a few sectors in, skip to the interesting bit
3192 while (off2 > LZO_PAGE_SIZE) {
3193 uint32_t partlen;
3194
3195 if (inlen < sizeof(uint32_t))
3196 break;
3197
3198 partlen = *(uint32_t*)(buf2 + inoff);
3199
3200 if (partlen < inlen) {
3201 off2 -= LZO_PAGE_SIZE;
3202 inoff += partlen + sizeof(uint32_t);
3203 inlen -= partlen + sizeof(uint32_t);
3204
3205 if (LZO_PAGE_SIZE - (inoff % LZO_PAGE_SIZE) < sizeof(uint32_t))
3206 inoff = ((inoff / LZO_PAGE_SIZE) + 1) * LZO_PAGE_SIZE;
3207 } else
3208 break;
3209 }
3210
3211 buf2 = &buf2[inoff];
3212 inpageoff = inoff % LZO_PAGE_SIZE;
3213 }
3214
3215 /* Previous versions of this code decompressed directly into the destination buffer,
3216 * but unfortunately that can't be relied on - Windows likes to use dummy pages sometimes
3217 * when mmap-ing, which breaks the backtracking used by e.g. zstd. */
3218
3219 if (off2 != 0)
3220 outlen = off2 + min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3221 else
3222 outlen = min(rp->read, (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off));
3223
3224 decomp = ExAllocatePoolWithTag(pool_type, outlen, ALLOC_TAG);
3225 if (!decomp) {
3226 ERR("out of memory\n");
3227 Status = STATUS_INSUFFICIENT_RESOURCES;
3228 goto exit;
3229 }
3230
3231 ccj = (comp_calc_job*)ExAllocatePoolWithTag(pool_type, sizeof(comp_calc_job), ALLOC_TAG);
3232 if (!ccj) {
3233 ERR("out of memory\n");
3234
3235 ExFreePool(decomp);
3236
3237 Status = STATUS_INSUFFICIENT_RESOURCES;
3238 goto exit;
3239 }
3240
3241 ccj->data = rp->data;
3242 ccj->decomp = decomp;
3243
3244 ccj->offset = off2;
3245 ccj->length = (size_t)min(rp->read, rp->extents[i].ed_num_bytes - rp->extents[i].off);
3246
3247 Status = add_calc_job_decomp(fcb->Vcb, rp->compression, buf2, inlen, decomp, outlen,
3248 inpageoff, &ccj->cj);
3249 if (!NT_SUCCESS(Status)) {
3250 ERR("add_calc_job_decomp returned %08lx\n", Status);
3251
3252 ExFreePool(decomp);
3253 ExFreePool(ccj);
3254
3255 goto exit;
3256 }
3257
3258 InsertTailList(&calc_jobs, &ccj->list_entry);
3259
3260 buf += rp->extents[i].ed_size;
3261 rp->data = (uint8_t*)rp->data + rp->extents[i].ed_num_bytes - rp->extents[i].off;
3262 rp->read -= (uint32_t)(rp->extents[i].ed_num_bytes - rp->extents[i].off);
3263 }
3264 }
3265
3266 le = le->Flink;
3267 }
3268
3269 if (length > 0 && start + bytes_read < fcb->inode_item.st_size) {
3270 uint32_t read = (uint32_t)min(fcb->inode_item.st_size - start - bytes_read, length);
3271
3272 RtlZeroMemory(data + bytes_read, read);
3273
3274 bytes_read += read;
3275 length -= read;
3276 }
3277
3278 Status = STATUS_SUCCESS;
3279
3280 while (!IsListEmpty(&calc_jobs)) {
3281 comp_calc_job* ccj = CONTAINING_RECORD(RemoveTailList(&calc_jobs), comp_calc_job, list_entry);
3282
3283 calc_thread_main(fcb->Vcb, ccj->cj);
3284
3285 KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3286
3287 if (!NT_SUCCESS(ccj->cj->Status))
3288 Status = ccj->cj->Status;
3289
3290 RtlCopyMemory(ccj->data, (uint8_t*)ccj->decomp + ccj->offset, ccj->length);
3291 ExFreePool(ccj->decomp);
3292
3293 ExFreePool(ccj);
3294 }
3295
3296 if (pbr)
3297 *pbr = bytes_read;
3298
3299 exit:
3300 while (!IsListEmpty(&read_parts)) {
3301 read_part* rp = CONTAINING_RECORD(RemoveHeadList(&read_parts), read_part, list_entry);
3302
3303 if (rp->buf_free)
3304 ExFreePool(rp->buf);
3305
3306 if (rp->csum_free)
3307 ExFreePool(rp->csum);
3308
3309 ExFreePool(rp);
3310 }
3311
3312 while (!IsListEmpty(&calc_jobs)) {
3313 comp_calc_job* ccj = CONTAINING_RECORD(RemoveHeadList(&calc_jobs), comp_calc_job, list_entry);
3314
3315 KeWaitForSingleObject(&ccj->cj->event, Executive, KernelMode, false, NULL);
3316
3317 if (ccj->decomp)
3318 ExFreePool(ccj->decomp);
3319
3320 ExFreePool(ccj->cj);
3321
3322 ExFreePool(ccj);
3323 }
3324
3325 return Status;
3326 }
3327
do_read(PIRP Irp,bool wait,ULONG * bytes_read)3328 NTSTATUS do_read(PIRP Irp, bool wait, ULONG* bytes_read) {
3329 PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3330 PFILE_OBJECT FileObject = IrpSp->FileObject;
3331 fcb* fcb = FileObject->FsContext;
3332 uint8_t* data = NULL;
3333 ULONG length = IrpSp->Parameters.Read.Length, addon = 0;
3334 uint64_t start = IrpSp->Parameters.Read.ByteOffset.QuadPart;
3335
3336 *bytes_read = 0;
3337
3338 if (!fcb || !fcb->Vcb || !fcb->subvol)
3339 return STATUS_INTERNAL_ERROR;
3340
3341 TRACE("fcb = %p\n", fcb);
3342 TRACE("offset = %I64x, length = %lx\n", start, length);
3343 TRACE("paging_io = %s, no cache = %s\n", Irp->Flags & IRP_PAGING_IO ? "true" : "false", Irp->Flags & IRP_NOCACHE ? "true" : "false");
3344
3345 if (!fcb->ads && fcb->type == BTRFS_TYPE_DIRECTORY)
3346 return STATUS_INVALID_DEVICE_REQUEST;
3347
3348 if (!(Irp->Flags & IRP_PAGING_IO) && !FsRtlCheckLockForReadAccess(&fcb->lock, Irp)) {
3349 WARN("tried to read locked region\n");
3350 return STATUS_FILE_LOCK_CONFLICT;
3351 }
3352
3353 if (length == 0) {
3354 TRACE("tried to read zero bytes\n");
3355 return STATUS_SUCCESS;
3356 }
3357
3358 if (start >= (uint64_t)fcb->Header.FileSize.QuadPart) {
3359 TRACE("tried to read with offset after file end (%I64x >= %I64x)\n", start, fcb->Header.FileSize.QuadPart);
3360 return STATUS_END_OF_FILE;
3361 }
3362
3363 TRACE("FileObject %p fcb %p FileSize = %I64x st_size = %I64x (%p)\n", FileObject, fcb, fcb->Header.FileSize.QuadPart, fcb->inode_item.st_size, &fcb->inode_item.st_size);
3364
3365 if (!(Irp->Flags & IRP_NOCACHE) && IrpSp->MinorFunction & IRP_MN_MDL) {
3366 NTSTATUS Status = STATUS_SUCCESS;
3367
3368 _SEH2_TRY {
3369 if (!FileObject->PrivateCacheMap) {
3370 CC_FILE_SIZES ccfs;
3371
3372 ccfs.AllocationSize = fcb->Header.AllocationSize;
3373 ccfs.FileSize = fcb->Header.FileSize;
3374 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
3375
3376 init_file_cache(FileObject, &ccfs);
3377 }
3378
3379 CcMdlRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, &Irp->MdlAddress, &Irp->IoStatus);
3380 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
3381 Status = _SEH2_GetExceptionCode();
3382 } _SEH2_END;
3383
3384 if (NT_SUCCESS(Status)) {
3385 Status = Irp->IoStatus.Status;
3386 Irp->IoStatus.Information += addon;
3387 *bytes_read = (ULONG)Irp->IoStatus.Information;
3388 } else
3389 ERR("EXCEPTION - %08lx\n", Status);
3390
3391 return Status;
3392 }
3393
3394 data = map_user_buffer(Irp, fcb->Header.Flags2 & FSRTL_FLAG2_IS_PAGING_FILE ? HighPagePriority : NormalPagePriority);
3395
3396 if (Irp->MdlAddress && !data) {
3397 ERR("MmGetSystemAddressForMdlSafe returned NULL\n");
3398 return STATUS_INSUFFICIENT_RESOURCES;
3399 }
3400
3401 if (start >= (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3402 length = (ULONG)min(length, min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3403 RtlZeroMemory(data, length);
3404 Irp->IoStatus.Information = *bytes_read = length;
3405 return STATUS_SUCCESS;
3406 }
3407
3408 if (length + start > (uint64_t)fcb->Header.ValidDataLength.QuadPart) {
3409 addon = (ULONG)(min(start + length, (uint64_t)fcb->Header.FileSize.QuadPart) - fcb->Header.ValidDataLength.QuadPart);
3410 RtlZeroMemory(data + (fcb->Header.ValidDataLength.QuadPart - start), addon);
3411 length = (ULONG)(fcb->Header.ValidDataLength.QuadPart - start);
3412 }
3413
3414 if (!(Irp->Flags & IRP_NOCACHE)) {
3415 NTSTATUS Status = STATUS_SUCCESS;
3416
3417 _SEH2_TRY {
3418 if (!FileObject->PrivateCacheMap) {
3419 CC_FILE_SIZES ccfs;
3420
3421 ccfs.AllocationSize = fcb->Header.AllocationSize;
3422 ccfs.FileSize = fcb->Header.FileSize;
3423 ccfs.ValidDataLength = fcb->Header.ValidDataLength;
3424
3425 init_file_cache(FileObject, &ccfs);
3426 }
3427
3428 if (fCcCopyReadEx) {
3429 TRACE("CcCopyReadEx(%p, %I64x, %lx, %u, %p, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart,
3430 length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread);
3431 TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3432 if (!fCcCopyReadEx(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus, Irp->Tail.Overlay.Thread)) {
3433 TRACE("CcCopyReadEx could not wait\n");
3434
3435 IoMarkIrpPending(Irp);
3436 return STATUS_PENDING;
3437 }
3438 TRACE("CcCopyReadEx finished\n");
3439 } else {
3440 TRACE("CcCopyRead(%p, %I64x, %lx, %u, %p, %p)\n", FileObject, IrpSp->Parameters.Read.ByteOffset.QuadPart, length, wait, data, &Irp->IoStatus);
3441 TRACE("sizes = %I64x, %I64x, %I64x\n", fcb->Header.AllocationSize.QuadPart, fcb->Header.FileSize.QuadPart, fcb->Header.ValidDataLength.QuadPart);
3442 if (!CcCopyRead(FileObject, &IrpSp->Parameters.Read.ByteOffset, length, wait, data, &Irp->IoStatus)) {
3443 TRACE("CcCopyRead could not wait\n");
3444
3445 IoMarkIrpPending(Irp);
3446 return STATUS_PENDING;
3447 }
3448 TRACE("CcCopyRead finished\n");
3449 }
3450 } _SEH2_EXCEPT (EXCEPTION_EXECUTE_HANDLER) {
3451 Status = _SEH2_GetExceptionCode();
3452 } _SEH2_END;
3453
3454 if (NT_SUCCESS(Status)) {
3455 Status = Irp->IoStatus.Status;
3456 Irp->IoStatus.Information += addon;
3457 *bytes_read = (ULONG)Irp->IoStatus.Information;
3458 } else
3459 ERR("EXCEPTION - %08lx\n", Status);
3460
3461 return Status;
3462 } else {
3463 NTSTATUS Status;
3464
3465 if (!wait) {
3466 IoMarkIrpPending(Irp);
3467 return STATUS_PENDING;
3468 }
3469
3470 if (fcb->ads) {
3471 Status = read_stream(fcb, data, start, length, bytes_read);
3472
3473 if (!NT_SUCCESS(Status))
3474 ERR("read_stream returned %08lx\n", Status);
3475 } else {
3476 Status = read_file(fcb, data, start, length, bytes_read, Irp);
3477
3478 if (!NT_SUCCESS(Status))
3479 ERR("read_file returned %08lx\n", Status);
3480 }
3481
3482 *bytes_read += addon;
3483 TRACE("read %lu bytes\n", *bytes_read);
3484
3485 Irp->IoStatus.Information = *bytes_read;
3486
3487 if (diskacc && Status != STATUS_PENDING) {
3488 PETHREAD thread = NULL;
3489
3490 if (Irp->Tail.Overlay.Thread && !IoIsSystemThread(Irp->Tail.Overlay.Thread))
3491 thread = Irp->Tail.Overlay.Thread;
3492 else if (!IoIsSystemThread(PsGetCurrentThread()))
3493 thread = PsGetCurrentThread();
3494 else if (IoIsSystemThread(PsGetCurrentThread()) && IoGetTopLevelIrp() == Irp)
3495 thread = PsGetCurrentThread();
3496
3497 if (thread)
3498 fPsUpdateDiskCounters(PsGetThreadProcess(thread), *bytes_read, 0, 1, 0, 0);
3499 }
3500
3501 return Status;
3502 }
3503 }
3504
3505 _Dispatch_type_(IRP_MJ_READ)
_Function_class_(DRIVER_DISPATCH)3506 _Function_class_(DRIVER_DISPATCH)
3507 NTSTATUS __stdcall drv_read(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
3508 device_extension* Vcb = DeviceObject->DeviceExtension;
3509 PIO_STACK_LOCATION IrpSp = IoGetCurrentIrpStackLocation(Irp);
3510 PFILE_OBJECT FileObject = IrpSp->FileObject;
3511 ULONG bytes_read = 0;
3512 NTSTATUS Status;
3513 bool top_level;
3514 fcb* fcb;
3515 ccb* ccb;
3516 bool acquired_fcb_lock = false, wait;
3517
3518 FsRtlEnterFileSystem();
3519
3520 top_level = is_top_level(Irp);
3521
3522 TRACE("read\n");
3523
3524 if (Vcb && Vcb->type == VCB_TYPE_VOLUME) {
3525 Status = vol_read(DeviceObject, Irp);
3526 goto exit2;
3527 } else if (!Vcb || Vcb->type != VCB_TYPE_FS) {
3528 Status = STATUS_INVALID_PARAMETER;
3529 goto end;
3530 }
3531
3532 Irp->IoStatus.Information = 0;
3533
3534 if (IrpSp->MinorFunction & IRP_MN_COMPLETE) {
3535 CcMdlReadComplete(IrpSp->FileObject, Irp->MdlAddress);
3536
3537 Irp->MdlAddress = NULL;
3538 Status = STATUS_SUCCESS;
3539
3540 goto exit;
3541 }
3542
3543 fcb = FileObject->FsContext;
3544
3545 if (!fcb) {
3546 ERR("fcb was NULL\n");
3547 Status = STATUS_INVALID_PARAMETER;
3548 goto exit;
3549 }
3550
3551 ccb = FileObject->FsContext2;
3552
3553 if (!ccb) {
3554 ERR("ccb was NULL\n");
3555 Status = STATUS_INVALID_PARAMETER;
3556 goto exit;
3557 }
3558
3559 if (Irp->RequestorMode == UserMode && !(ccb->access & FILE_READ_DATA)) {
3560 WARN("insufficient privileges\n");
3561 Status = STATUS_ACCESS_DENIED;
3562 goto exit;
3563 }
3564
3565 if (fcb == Vcb->volume_fcb) {
3566 TRACE("reading volume FCB\n");
3567
3568 IoSkipCurrentIrpStackLocation(Irp);
3569
3570 Status = IoCallDriver(Vcb->Vpb->RealDevice, Irp);
3571
3572 goto exit2;
3573 }
3574
3575 if (!(Irp->Flags & IRP_PAGING_IO))
3576 FsRtlCheckOplock(fcb_oplock(fcb), Irp, NULL, NULL, NULL);
3577
3578 wait = IoIsOperationSynchronous(Irp);
3579
3580 // Don't offload jobs when doing paging IO - otherwise this can lead to
3581 // deadlocks in CcCopyRead.
3582 if (Irp->Flags & IRP_PAGING_IO)
3583 wait = true;
3584
3585 if (!(Irp->Flags & IRP_PAGING_IO) && FileObject->SectionObjectPointer && FileObject->SectionObjectPointer->DataSectionObject) {
3586 IO_STATUS_BLOCK iosb;
3587
3588 CcFlushCache(FileObject->SectionObjectPointer, &IrpSp->Parameters.Read.ByteOffset, IrpSp->Parameters.Read.Length, &iosb);
3589 if (!NT_SUCCESS(iosb.Status)) {
3590 ERR("CcFlushCache returned %08lx\n", iosb.Status);
3591 return iosb.Status;
3592 }
3593 }
3594
3595 if (!ExIsResourceAcquiredSharedLite(fcb->Header.Resource)) {
3596 if (!ExAcquireResourceSharedLite(fcb->Header.Resource, wait)) {
3597 Status = STATUS_PENDING;
3598 IoMarkIrpPending(Irp);
3599 goto exit;
3600 }
3601
3602 acquired_fcb_lock = true;
3603 }
3604
3605 Status = do_read(Irp, wait, &bytes_read);
3606
3607 if (acquired_fcb_lock)
3608 ExReleaseResourceLite(fcb->Header.Resource);
3609
3610 exit:
3611 if (FileObject->Flags & FO_SYNCHRONOUS_IO && !(Irp->Flags & IRP_PAGING_IO))
3612 FileObject->CurrentByteOffset.QuadPart = IrpSp->Parameters.Read.ByteOffset.QuadPart + (NT_SUCCESS(Status) ? bytes_read : 0);
3613
3614 end:
3615 Irp->IoStatus.Status = Status;
3616
3617 TRACE("Irp->IoStatus.Status = %08lx\n", Irp->IoStatus.Status);
3618 TRACE("Irp->IoStatus.Information = %Iu\n", Irp->IoStatus.Information);
3619 TRACE("returning %08lx\n", Status);
3620
3621 if (Status != STATUS_PENDING)
3622 IoCompleteRequest(Irp, IO_NO_INCREMENT);
3623 else {
3624 if (!add_thread_job(Vcb, Irp))
3625 Status = do_read_job(Irp);
3626 }
3627
3628 exit2:
3629 if (top_level)
3630 IoSetTopLevelIrp(NULL);
3631
3632 FsRtlExitFileSystem();
3633
3634 return Status;
3635 }
3636