/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

/* crc64 table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Private "checksum error" errno-style code used by this file. */
#define	ECKSUM	666

/* The boot environment compiles out all ZFS assertions. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * Map kernel allocator calls in borrowed kernel code onto the loader's
 * allocator (zfs_alloc/zfs_free, forward-declared later in this file).
 */
#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

/*
 * Lazily populate zfs_crc64_table.  Idempotent: entry 128 equals
 * ZFS_CRC64_POLY once the table has been built, so repeated calls
 * are cheap no-ops.
 */
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++) {
			ct = zfs_crc64_table + i;
			for (*ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^
				    (-(*ct & 1) & ZFS_CRC64_POLY);
		}
	}
}

/* "off" checksum vector: writes an all-zero checksum, ignores the data. */
static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t	ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

/* Indexed by the on-disk checksum algorithm number (enum zio_checksum). */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* skein and edonr are salted and carry per-spa context templates */
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE,
	    "skein"},
	{{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int	ci_level;	/* level parameter */
	const char	*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	/* the loader only ever decompresses, so no ci_compress vectors */
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};

/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
232 */ 233 static void 234 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 235 { 236 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 237 238 if (ci->ci_tmpl_init == NULL) 239 return; 240 241 if (spa->spa_cksum_tmpls[checksum] != NULL) 242 return; 243 244 if (spa->spa_cksum_tmpls[checksum] == NULL) { 245 spa->spa_cksum_tmpls[checksum] = 246 ci->ci_tmpl_init(&spa->spa_cksum_salt); 247 } 248 } 249 250 /* 251 * Called by a spa_t that's about to be deallocated. This steps through 252 * all of the checksum context templates and deallocates any that were 253 * initialized using the algorithm-specific template init function. 254 */ 255 void 256 zio_checksum_templates_free(spa_t *spa) 257 { 258 for (enum zio_checksum checksum = 0; 259 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 260 if (spa->spa_cksum_tmpls[checksum] != NULL) { 261 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 262 263 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 264 spa->spa_cksum_tmpls[checksum] = NULL; 265 } 266 } 267 } 268 269 static int 270 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 271 { 272 uint64_t size; 273 unsigned int checksum; 274 zio_checksum_info_t *ci; 275 void *ctx = NULL; 276 zio_cksum_t actual_cksum, expected_cksum, verifier; 277 int byteswap; 278 279 checksum = BP_GET_CHECKSUM(bp); 280 size = BP_GET_PSIZE(bp); 281 282 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 283 return (EINVAL); 284 ci = &zio_checksum_table[checksum]; 285 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 286 return (EINVAL); 287 288 if (spa != NULL) { 289 zio_checksum_template_init(checksum, (spa_t *)spa); 290 ctx = spa->spa_cksum_tmpls[checksum]; 291 } 292 293 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 294 zio_eck_t *eck; 295 296 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 297 checksum == ZIO_CHECKSUM_LABEL); 298 299 eck = (zio_eck_t *)((char *)data + size) - 1; 300 301 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 302 
zio_checksum_gang_verifier(&verifier, bp); 303 else if (checksum == ZIO_CHECKSUM_LABEL) 304 zio_checksum_label_verifier(&verifier, 305 DVA_GET_OFFSET(BP_IDENTITY(bp))); 306 else 307 verifier = bp->blk_cksum; 308 309 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 310 311 if (byteswap) 312 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 313 314 expected_cksum = eck->zec_cksum; 315 eck->zec_cksum = verifier; 316 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 317 eck->zec_cksum = expected_cksum; 318 319 if (byteswap) 320 byteswap_uint64_array(&expected_cksum, 321 sizeof (zio_cksum_t)); 322 } else { 323 byteswap = BP_SHOULD_BYTESWAP(bp); 324 expected_cksum = bp->blk_cksum; 325 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 326 } 327 328 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 329 /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */ 330 return (EIO); 331 } 332 333 return (0); 334 } 335 336 static int 337 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 338 void *dest, uint64_t destsize) 339 { 340 zio_compress_info_t *ci; 341 342 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 343 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 344 return (EIO); 345 } 346 347 ci = &zio_compress_table[cpfunc]; 348 if (!ci->ci_decompress) { 349 printf("ZFS: unsupported compression algorithm %s\n", 350 ci->ci_name); 351 return (EIO); 352 } 353 354 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 355 } 356 357 static uint64_t 358 zap_hash(uint64_t salt, const char *name) 359 { 360 const uint8_t *cp; 361 uint8_t c; 362 uint64_t crc = salt; 363 364 ASSERT(crc != 0); 365 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 366 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 367 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 368 369 /* 370 * Only use 28 bits, since we need 4 bits in the cookie for the 371 * collision differentiator. 
	 * We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

/* Loader allocator, used via the kmem_alloc/kmem_free macros above. */
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

/* Indexes of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

/* Multiply a byte by 2 (or 4) in GF(2^8) with polynomial 0x11d. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time.  This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above.  These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19,
	0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	/* a * 2^exp = antilog(log(a) + exp), reduced mod 255 */
	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/* Generate single (P) parity: XOR of all data columns. */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column initializes P ... */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			/* ... subsequent columns are XORed in. */
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/* Generate double (P and Q) parity; Q uses GF multiplication by 2. */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply
			 * the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/* Generate triple (P, Q and R) parity; R uses GF multiplication by 4. */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __     __               __     __
 *      |   1 ..  1 1 1  |      |  p_0  |
 *      | 2^n-1 ..  4 2 1 |      |   :   |
 *      | 4^n-1 .. 16 4 1 |      | p_m-1 |   __     __
 *      |   1 ..  0 0 0  |      |  d_0  |   |  D_0  |
 *      |   0 ..  0 0 0  |  x   |  d_1  | = |  D_1  |
 *      |   :      : : : |      |  d_2  |   |  D_2  |
 *      |   0 ..  1 0 0  |      |   :   |   |   :   |
 *      |   0 ..  0 1 0  |      |   :   |   | D_n-1 |
 *      |   0 ..  0 0 1  |      | d_n-1 |   ~~     ~~
 *      ~~     ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known.
 * To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 * |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *       (V|I)'^-1 = |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Build the Vandermonde rows (powers of the parity seeds) for the parity
 * columns listed in map[], as described in the comment above.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		/* map[i] selects the parity row: 0 = P, 1 = Q, 2 = R. */
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* rows[i][j] = 2^(map[i] * (n - 1 - j)) mod 255 */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Gauss-Jordan elimination over GF(2^8): invert the reduced matrix in
 * rows[] (destroying it), leaving the inverse rows in invrows[].
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/* In GF(2^8), addition/subtraction is XOR. */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Compute D = (V|I)'^-1 x (d|p)' using the inverse rows in invrows[],
 * writing the reconstructed bytes into the missing data columns.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute discrete logs of the inverse rows for fast multiply. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column initializes, rest accumulate. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

/*
 * General reconstruction: rebuild the data columns listed in tgts[] from
 * the surviving columns.  Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
1085 */ 1086 if (c == tgts[tt]) { 1087 tt++; 1088 continue; 1089 } 1090 1091 code |= 1 << c; 1092 1093 parity_map[i] = c; 1094 i++; 1095 } 1096 1097 ASSERT(code != 0); 1098 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1099 1100 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1101 nmissing_rows * n + sizeof (used[0]) * n; 1102 p = kmem_alloc(psize, KM_SLEEP); 1103 1104 for (pp = p, i = 0; i < nmissing_rows; i++) { 1105 rows[i] = pp; 1106 pp += n; 1107 invrows[i] = pp; 1108 pp += n; 1109 } 1110 used = pp; 1111 1112 for (i = 0; i < nmissing_rows; i++) { 1113 used[i] = parity_map[i]; 1114 } 1115 1116 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1117 if (tt < nmissing_rows && 1118 c == missing_rows[tt] + rm->rm_firstdatacol) { 1119 tt++; 1120 continue; 1121 } 1122 1123 ASSERT3S(i, <, n); 1124 used[i] = c; 1125 i++; 1126 } 1127 1128 /* 1129 * Initialize the interesting rows of the matrix. 1130 */ 1131 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1132 1133 /* 1134 * Invert the matrix. 1135 */ 1136 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1137 invrows, used); 1138 1139 /* 1140 * Reconstruct the missing data using the generated matrix. 1141 */ 1142 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1143 invrows, used); 1144 1145 kmem_free(p, psize); 1146 1147 return (code); 1148 } 1149 1150 static int 1151 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1152 { 1153 int tgts[VDEV_RAIDZ_MAXPARITY]; 1154 int ntgts; 1155 int i, c; 1156 int code; 1157 int nbadparity, nbaddata; 1158 1159 /* 1160 * The tgts list must already be sorted. 
1161 */ 1162 for (i = 1; i < nt; i++) { 1163 ASSERT(t[i] > t[i - 1]); 1164 } 1165 1166 nbadparity = rm->rm_firstdatacol; 1167 nbaddata = rm->rm_cols - nbadparity; 1168 ntgts = 0; 1169 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1170 if (i < nt && c == t[i]) { 1171 tgts[ntgts++] = c; 1172 i++; 1173 } else if (rm->rm_col[c].rc_error != 0) { 1174 tgts[ntgts++] = c; 1175 } else if (c >= rm->rm_firstdatacol) { 1176 nbaddata--; 1177 } else { 1178 nbadparity--; 1179 } 1180 } 1181 1182 ASSERT(ntgts >= nt); 1183 ASSERT(nbaddata >= 0); 1184 ASSERT(nbaddata + nbadparity == ntgts); 1185 1186 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1187 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1188 ASSERT(code > 0); 1189 return (code); 1190 } 1191 1192 static raidz_map_t * 1193 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1194 uint64_t dcols, uint64_t nparity) 1195 { 1196 raidz_map_t *rm; 1197 uint64_t b = offset >> unit_shift; 1198 uint64_t s = size >> unit_shift; 1199 uint64_t f = b % dcols; 1200 uint64_t o = (b / dcols) << unit_shift; 1201 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1202 1203 q = s / (dcols - nparity); 1204 r = s - q * (dcols - nparity); 1205 bc = (r == 0 ? 0 : r + nparity); 1206 tot = s + nparity * (q + (r == 0 ? 
0 : 1)); 1207 1208 if (q == 0) { 1209 acols = bc; 1210 scols = MIN(dcols, roundup(bc, nparity + 1)); 1211 } else { 1212 acols = dcols; 1213 scols = dcols; 1214 } 1215 1216 ASSERT3U(acols, <=, scols); 1217 1218 rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols])); 1219 1220 rm->rm_cols = acols; 1221 rm->rm_scols = scols; 1222 rm->rm_bigcols = bc; 1223 rm->rm_skipstart = bc; 1224 rm->rm_missingdata = 0; 1225 rm->rm_missingparity = 0; 1226 rm->rm_firstdatacol = nparity; 1227 rm->rm_reports = 0; 1228 rm->rm_freed = 0; 1229 rm->rm_ecksuminjected = 0; 1230 1231 asize = 0; 1232 1233 for (c = 0; c < scols; c++) { 1234 col = f + c; 1235 coff = o; 1236 if (col >= dcols) { 1237 col -= dcols; 1238 coff += 1ULL << unit_shift; 1239 } 1240 rm->rm_col[c].rc_devidx = col; 1241 rm->rm_col[c].rc_offset = coff; 1242 rm->rm_col[c].rc_data = NULL; 1243 rm->rm_col[c].rc_error = 0; 1244 rm->rm_col[c].rc_tried = 0; 1245 rm->rm_col[c].rc_skipped = 0; 1246 1247 if (c >= acols) 1248 rm->rm_col[c].rc_size = 0; 1249 else if (c < bc) 1250 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1251 else 1252 rm->rm_col[c].rc_size = q << unit_shift; 1253 1254 asize += rm->rm_col[c].rc_size; 1255 } 1256 1257 ASSERT3U(asize, ==, tot << unit_shift); 1258 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1259 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1260 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1261 ASSERT3U(rm->rm_nskip, <=, nparity); 1262 1263 for (c = 0; c < rm->rm_firstdatacol; c++) 1264 rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size); 1265 1266 rm->rm_col[c].rc_data = data; 1267 1268 for (c = c + 1; c < acols; c++) 1269 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1270 rm->rm_col[c - 1].rc_size; 1271 1272 /* 1273 * If all data stored spans all columns, there's a danger that parity 1274 * will always be on the same device and, since parity isn't read 1275 * during normal operation, that that device's I/O bandwidth won't be 1276 * used 
effectively. We therefore switch the parity every 1MB. 1277 * 1278 * ... at least that was, ostensibly, the theory. As a practical 1279 * matter unless we juggle the parity between all devices evenly, we 1280 * won't see any benefit. Further, occasional writes that aren't a 1281 * multiple of the LCM of the number of children and the minimum 1282 * stripe width are sufficient to avoid pessimal behavior. 1283 * Unfortunately, this decision created an implicit on-disk format 1284 * requirement that we need to support for all eternity, but only 1285 * for single-parity RAID-Z. 1286 * 1287 * If we intend to skip a sector in the zeroth column for padding 1288 * we must make sure to note this swap. We will never intend to 1289 * skip the first column since at least one data and one parity 1290 * column must appear in each row. 1291 */ 1292 ASSERT(rm->rm_cols >= 2); 1293 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1294 1295 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1296 devidx = rm->rm_col[0].rc_devidx; 1297 o = rm->rm_col[0].rc_offset; 1298 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1299 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1300 rm->rm_col[1].rc_devidx = devidx; 1301 rm->rm_col[1].rc_offset = o; 1302 1303 if (rm->rm_skipstart == 0) 1304 rm->rm_skipstart = 1; 1305 } 1306 1307 return (rm); 1308 } 1309 1310 static void 1311 vdev_raidz_map_free(raidz_map_t *rm) 1312 { 1313 int c; 1314 1315 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1316 zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 1317 1318 zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); 1319 } 1320 1321 static vdev_t * 1322 vdev_child(vdev_t *pvd, uint64_t devidx) 1323 { 1324 vdev_t *cvd; 1325 1326 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1327 if (cvd->v_id == devidx) 1328 break; 1329 } 1330 1331 return (cvd); 1332 } 1333 1334 /* 1335 * We keep track of whether or not there were any injected errors, so that 1336 * any ereports we 
 * generate can note it.
 */
/*
 * Boot-code shim: defers to zio_checksum_verify; the size argument is
 * unused here (the full kernel version needs it).
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size __unused)
{

	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was successfully read. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Regenerate parity from the data columns, then compare. */
	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset __unused, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore[0] backs tgts[-1]; tstore[n+1] backs the tgts[n] sentinel. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try 1, 2, ... up to the remaining parity's worth of targets. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * NOTE(review): the save buffer is sized from column 0;
		 * presumably no column is larger than column 0 — confirm
		 * against the rc_size assignment in vdev_raidz_map_alloc.
		 */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Odometer-style walk over all n-column combinations. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Loop exhausted: n overshot by one; fix it for the frees below. */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Read a block through a RAID-Z vdev.  Three phases, each more expensive
 * than the last: (1) produce valid data from the columns already read,
 * (2) read every remaining column and retry, (3) combinatorial
 * reconstruction over all candidate bad-column sets.  Returns 0, EIO, or
 * ECKSUM.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the error state across all columns. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}