/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

/* CRC-64 lookup table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/*
 * Local stand-in for the kernel's ECKSUM errno ("checksum failure");
 * the boot environment has no such errno value.
 */
#define ECKSUM	666

/* Boot-loader build: all assertions compile away to nothing. */
#define ASSERT3S(x, y, z)	((void)0)
#define ASSERT3U(x, y, z)	((void)0)
#define ASSERT3P(x, y, z)	((void)0)
#define ASSERT0(x)		((void)0)
#define ASSERT(x)		((void)0)

/*
 * Generate the reflected CRC-64 table for ZFS_CRC64_POLY (used by the
 * ZAP hash function).  Idempotent: once generated, entry 128 equals the
 * polynomial itself, so repeated calls are cheap no-ops.
 */
static void
zfs_init_crc(void)
{
        int i, j;
        uint64_t *ct;

        /*
         * Calculate the crc64 table (used for the zap hash
         * function).
         */
        if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
                memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
                for (i = 0; i < 256; i++) {
                        ct = zfs_crc64_table + i;
                        for (*ct = i, j = 8; j > 0; j--)
                                /* -(bit) yields all-ones mask when LSB set */
                                *ct = (*ct >> 1) ^
                                    (-(*ct & 1) & ZFS_CRC64_POLY);
                }
        }
}

/*
 * "off" checksum vector: ignores the data entirely and produces an
 * all-zero checksum.
 */
static void
zio_checksum_off(const void *buf __unused, uint64_t size __unused,
    const void *ctx_template __unused, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocate/free a per-pool context template (for salted checksums). */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
        /* Strong enough for metadata? */
        ZCHECKSUM_FLAG_METADATA = (1 << 1),
        /* ZIO embedded checksum */
        ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
        /* Strong enough for dedup (without verification)? */
        ZCHECKSUM_FLAG_DEDUP = (1 << 3),
        /* Uses salt value */
        ZCHECKSUM_FLAG_SALTED = (1 << 4),
        /* Strong enough for nopwrite? */
        ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
        /* checksum function for each byteorder: [0] native, [1] byteswapped */
        zio_checksum_t *ci_func[2];
        zio_checksum_tmpl_init_t *ci_tmpl_init;
        zio_checksum_tmpl_free_t *ci_tmpl_free;
        zio_checksum_flags_t ci_flags;
        const char *ci_name;    /* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"
#include "edonr_zfs.c"

/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* enum values,
 * so the order below must not change.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL, NULL}, NULL, NULL, 0, "inherit"},
        {{NULL, NULL}, NULL, NULL, 0, "on"},
        {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
        {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
        {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
            0, "fletcher2"},
        {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA, "fletcher4"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
        /*
         * NOTE(review): "zillog2" looks like a typo for "zilog2" (cf.
         * upstream OpenZFS) — harmless here since ci_name is only used
         * for diagnostics, but worth confirming before changing.
         */
        {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
        {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
            0, "noparity"},
        {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
        /* skein and edonr need per-pool salt templates (ci_tmpl_*). */
        {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
            zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
        {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
            zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
            ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
        zio_compress_func_t *ci_compress;       /* compression function */
        zio_decompress_func_t *ci_decompress;   /* decompression function */
        int ci_level;                           /* level parameter */
        const char *ci_name;                    /* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
/*
 * Indexed by the on-disk ZIO_COMPRESS_* enum values; the boot loader
 * only ever decompresses, so ci_compress is always NULL here.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {NULL, NULL, 0, "inherit"},
        {NULL, NULL, 0, "on"},
        {NULL, NULL, 0, "uncompressed"},
        {NULL, lzjb_decompress, 0, "lzjb"},
        {NULL, NULL, 0, "empty"},
        {NULL, gzip_decompress, 1, "gzip-1"},
        {NULL, gzip_decompress, 2, "gzip-2"},
        {NULL, gzip_decompress, 3, "gzip-3"},
        {NULL, gzip_decompress, 4, "gzip-4"},
        {NULL, gzip_decompress, 5, "gzip-5"},
        {NULL, gzip_decompress, 6, "gzip-6"},
        {NULL, gzip_decompress, 7, "gzip-7"},
        {NULL, gzip_decompress, 8, "gzip-8"},
        {NULL, gzip_decompress, 9, "gzip-9"},
        {NULL, zle_decompress, 64, "zle"},
        {NULL, lz4_decompress, 0, "lz4"},
};

/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
        uint64_t *buf = vbuf;
        size_t count = size >> 3;
        int i;   /* NOTE(review): int index vs size_t count — fine for the
                    small buffers seen at boot, would wrap past 16 GiB */

        ASSERT((size & 7) == 0);

        for (i = 0; i < count; i++)
                buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
        const dva_t *dva = BP_IDENTITY(bp);
        uint64_t txg = BP_PHYSICAL_BIRTH(bp);

        ASSERT(BP_IS_GANG(bp));

        ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
        ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];

        /* Nothing to do for checksums without context templates. */
        if (ci->ci_tmpl_init == NULL)
                return;

        if (spa->spa_cksum_tmpls[checksum] != NULL)
                return;

        /*
         * NOTE(review): this second NULL check is redundant with the
         * early return above; upstream holds a lock between the two,
         * which this single-threaded boot code dropped.
         */
        if (spa->spa_cksum_tmpls[checksum] == NULL) {
                spa->spa_cksum_tmpls[checksum] =
                    ci->ci_tmpl_init(&spa->spa_cksum_salt);
        }
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
void
zio_checksum_templates_free(spa_t *spa)
{
        for (enum zio_checksum checksum = 0;
            checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
                if (spa->spa_cksum_tmpls[checksum] != NULL) {
                        zio_checksum_info_t *ci = &zio_checksum_table[checksum];

                        ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
                        spa->spa_cksum_tmpls[checksum] = NULL;
                }
        }
}

/*
 * Verify the checksum of a block read from disk.  Returns 0 on match,
 * EINVAL for an unknown/unsupported checksum type, or EIO on mismatch.
 * spa may be NULL (no salt template available in that case).
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
        uint64_t size;
        unsigned int checksum;
        zio_checksum_info_t *ci;
        void *ctx = NULL;
        zio_cksum_t actual_cksum, expected_cksum, verifier;
        int byteswap;

        checksum = BP_GET_CHECKSUM(bp);
        size = BP_GET_PSIZE(bp);

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
                return (EINVAL);
        ci = &zio_checksum_table[checksum];
        if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
                return (EINVAL);

        if (spa != NULL) {
                /* Lazily build the salted-checksum context template. */
                zio_checksum_template_init(checksum, (spa_t *)spa);
                ctx = spa->spa_cksum_tmpls[checksum];
        }

        if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
                zio_eck_t *eck;

                ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
                    checksum == ZIO_CHECKSUM_LABEL);

                /* Embedded checksum trailer lives at the end of the block. */
                eck = (zio_eck_t *)((char *)data + size) - 1;

                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
                        zio_checksum_gang_verifier(&verifier, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
                        zio_checksum_label_verifier(&verifier,
                            DVA_GET_OFFSET(BP_IDENTITY(bp)));
                else
                        verifier = bp->blk_cksum;   /* defensive; see ASSERT */

                byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

                if (byteswap)
                        byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

                /*
                 * Temporarily substitute the verifier for the stored
                 * checksum so the data is hashed exactly as it was written,
                 * then restore the stored value.
                 */
                expected_cksum = eck->zec_cksum;
                eck->zec_cksum = verifier;
                ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
                eck->zec_cksum = expected_cksum;

                if (byteswap)
                        byteswap_uint64_array(&expected_cksum,
                            sizeof (zio_cksum_t));
        } else {
                byteswap = BP_SHOULD_BYTESWAP(bp);
                expected_cksum = bp->blk_cksum;
                ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
                /* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
                return (EIO);
        }

        return (0);
}

/*
 * Decompress srcsize bytes at src into dest using the algorithm selected
 * by cpfunc (a ZIO_COMPRESS_* value).  Returns 0 on success, EIO for an
 * unknown or decompress-less algorithm, or the decompressor's error.
 */
static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
        zio_compress_info_t *ci;

        if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
                printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
                return (EIO);
        }

        ci = &zio_compress_table[cpfunc];
        if (!ci->ci_decompress) {
                printf("ZFS: unsupported compression algorithm %s\n",
                    ci->ci_name);
                return (EIO);
        }

        return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

/*
 * CRC-64 based ZAP name hash; salt seeds the CRC so different objects
 * hash the same name differently.  Requires zfs_init_crc() to have run.
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        const uint8_t *cp;
        uint8_t c;
        uint64_t crc = salt;

        ASSERT(crc != 0);
        ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
        for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator.  We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}

/* One column (device) of a RAID-Z I/O. */
typedef struct raidz_col {
        uint64_t rc_devidx;     /* child device index for I/O */
        uint64_t rc_offset;     /* device offset */
        uint64_t rc_size;       /* I/O size */
        void *rc_data;          /* I/O data */
        int rc_error;           /* I/O error for this device */
        uint8_t rc_tried;       /* Did we attempt this I/O column? */
        uint8_t rc_skipped;     /* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry of one logical I/O mapped onto the RAID-Z stripe. */
typedef struct raidz_map {
        uint64_t rm_cols;       /* Regular column count */
        uint64_t rm_scols;      /* Count including skipped columns */
        uint64_t rm_bigcols;    /* Number of oversized columns */
        uint64_t rm_asize;      /* Actual total I/O size */
        uint64_t rm_missingdata;    /* Count of missing data devices */
        uint64_t rm_missingparity;  /* Count of missing parity devices */
        uint64_t rm_firstdatacol;   /* First data column/parity count */
        uint64_t rm_nskip;      /* Skipped sectors for padding */
        uint64_t rm_skipstart;  /* Column index of padding start */
        uintptr_t rm_reports;   /* # of referencing checksum reports */
        uint8_t rm_freed;       /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;  /* checksum error was injected */
        raidz_col_t rm_col[1];  /* Flexible array of I/O columns */
} raidz_map_t;

/* Parity column indices within rm_col[]. */
#define VDEV_RAIDZ_P    0
#define VDEV_RAIDZ_Q    1
#define VDEV_RAIDZ_R    2

/* Multiply by 2 (resp. 4) in GF(2^8) with polynomial 0x11d. */
#define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
        (mask) = (x) & 0x8080808080808080ULL; \
        (mask) = ((mask) << 1) - ((mask) >> 7); \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
        VDEV_RAIDZ_64MUL_2((x), mask); \
        VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
        if (a == 0)
                return (0);

        ASSERT(exp >= 0);
        ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        /* log(a * 2^exp) = log(a) + exp, reduced mod 255 (field order - 1) */
        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}

/*
 * Generate single (P) parity: the XOR of all data columns, 64 bits at
 * a time.  Short columns are implicitly zero-extended.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
        uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
        int c;

        pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        /* First column initializes P rather than XORing. */
                        ASSERT(ccount == pcount);
                        for (i = 0; i < ccount; i++, src++, p++) {
                                *p = *src;
                        }
                } else {
                        ASSERT(ccount <= pcount);
                        for (i = 0; i < ccount; i++, src++, p++) {
                                *p ^= *src;
                        }
                }
        }
}

/*
 * Generate P (XOR) and Q (GF multiply-by-2 accumulate) parity for
 * double-parity stripes.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
        uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p = *src;
                                *q = *src;
                        }
                        /* Zero-fill P and Q past a short first column. */
                        for (; i < pcnt; i++, src++, p++, q++) {
                                *p = 0;
                                *q = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                        }
                }
        }
}

/*
 * Generate P, Q and R parity (coefficients 1, 2 and 4 respectively) for
 * triple-parity stripes.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
        uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_R].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
                r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p = *src;
                                *q = *src;
                                *r = *src;
                        }
                        /* Zero-fill all three parities past a short column. */
                        for (; i < pcnt; i++, src++, p++, q++, r++) {
                                *p = 0;
                                *q = 0;
                                *r = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;

                                VDEV_RAIDZ_64MUL_4(*r, mask);
                                *r ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++, r++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                VDEV_RAIDZ_64MUL_4(*r, mask);
                        }
                }
        }
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
        switch (rm->rm_firstdatacol) {
        case 1:
                vdev_raidz_generate_parity_p(rm);
                break;
        case 2:
                vdev_raidz_generate_parity_pq(rm);
                break;
        case 3:
                vdev_raidz_generate_parity_pqr(rm);
                break;
        default:
                panic("invalid RAID-Z configuration");
        }
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coeffecients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |       __     __     |  p_0  |
 *            |  V  |       |  D_0  |     | p_m-1 |
 *            |     |    x  |   :   |  =  |  d_0  |
 *            |  I  |       | D_n-1 |     |   :   |
 *            |     |       ~~     ~~     | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coeffecients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known.
 * To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *  |  19 205 116  29  64  16   4   1    0   1   0   0   0   0   0   0  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0  205 116  0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0  185  0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *  |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *  __                                                                 __
 *  |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *  |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *  |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *  |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *  |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *  |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *  |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *  |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *  ~~                                                                 ~~
 *               __                               __
 *               |   0   0   1   0   0   0   0   0 |
 *               | 167 100   5  41 159 169 217 208 |
 *               | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =  |   0   0   0   1   0   0   0   0 |
 *               |   0   0   0   0   1   0   0   0 |
 *               |   0   0   0   0   0   1   0   0 |
 *               |   0   0   0   0   0   0   1   0 |
 *               |   0   0   0   0   0   0   0   1 |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Build the Vandermonde rows (powers of the parity coefficients) for the
 * nmap parity columns listed in map[].  map[i] is the parity index
 * (0 = P, 1 = Q, 2 = R); exponents are kept reduced mod 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm __unused, int n, int nmap, int *map,
    uint8_t **rows)
{
        int i, j;
        int pow;

        ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

        /*
         * Fill in the missing rows of interest.
         */
        for (i = 0; i < nmap; i++) {
                ASSERT3S(0, <=, map[i]);
                ASSERT3S(map[i], <=, 2);

                pow = map[i] * n;
                if (pow > 255)
                        pow -= 255;
                ASSERT(pow <= 255);

                for (j = 0; j < n; j++) {
                        pow -= map[i];
                        if (pow < 0)
                                pow += 255;
                        rows[i][j] = vdev_raidz_pow2[pow];
                }
        }
}

/*
 * Gauss-Jordan elimination over GF(2^8): reduce rows[] to the identity
 * while accumulating the same operations into invrows[], yielding the
 * inverse rows needed for reconstruction.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
        int i, j, ii, jj;
        uint8_t log;

        /*
         * Assert that the first nmissing entries from the array of used
         * columns correspond to parity columns and that subsequent entries
         * correspond to data columns.
         */
        for (i = 0; i < nmissing; i++) {
                ASSERT3S(used[i], <, rm->rm_firstdatacol);
        }
        for (; i < n; i++) {
                ASSERT3S(used[i], >=, rm->rm_firstdatacol);
        }

        /*
         * First initialize the storage where we'll compute the inverse rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        invrows[i][j] = (i == j) ? 1 : 0;
                }
        }

        /*
         * Subtract all trivial rows from the rows of consequence.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = nmissing; j < n; j++) {
                        ASSERT3U(used[j], >=, rm->rm_firstdatacol);
                        jj = used[j] - rm->rm_firstdatacol;
                        ASSERT3S(jj, <, n);
                        invrows[i][j] = rows[i][jj];
                        rows[i][jj] = 0;
                }
        }

        /*
         * For each of the rows of interest, we must normalize it and subtract
         * a multiple of it from the other rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < missing[i]; j++) {
                        ASSERT3U(rows[i][j], ==, 0);
                }
                ASSERT3U(rows[i][missing[i]], !=, 0);

                /*
                 * Compute the inverse of the first element and multiply each
                 * element in the row by that value.
                 */
                log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

                for (j = 0; j < n; j++) {
                        rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
                        invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
                }

                for (ii = 0; ii < nmissing; ii++) {
                        if (i == ii)
                                continue;

                        ASSERT3U(rows[ii][missing[i]], !=, 0);

                        log = vdev_raidz_log2[rows[ii][missing[i]]];

                        for (j = 0; j < n; j++) {
                                rows[ii][j] ^=
                                    vdev_raidz_exp2(rows[i][j], log);
                                invrows[ii][j] ^=
                                    vdev_raidz_exp2(invrows[i][j], log);
                        }
                }
        }

        /*
         * Verify that the data that is left in the rows are properly part of
         * an identity matrix.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        if (j == missing[i]) {
                                ASSERT3U(rows[i][j], ==, 1);
                        } else {
                                ASSERT3U(rows[i][j], ==, 0);
                        }
                }
        }
}

/*
 * Apply the inverted rows to the surviving columns to regenerate the
 * missing data columns in place: D = (V|I)'^-1 x (d|p)'.  GF multiplies
 * are done in log space via invlog[] to avoid per-byte table lookups of
 * both operands.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
        int i, j, x, cc, c;
        uint8_t *src;
        uint64_t ccount;
        uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
        uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
        uint8_t log, val;
        int ll;
        uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
        uint8_t *p, *pp;
        size_t psize;

        log = 0;        /* gcc */
        psize = sizeof (invlog[0][0]) * n * nmissing;
        p = malloc(psize);
        if (p == NULL) {
                printf("Out of memory\n");
                return;
        }

        /* Carve the single allocation into per-row log tables. */
        for (pp = p, i = 0; i < nmissing; i++) {
                invlog[i] = pp;
                pp += n;
        }

        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        ASSERT3U(invrows[i][j], !=, 0);
                        invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
                }
        }

        for (i = 0; i < n; i++) {
                c = used[i];
                ASSERT3U(c, <, rm->rm_cols);

                src = rm->rm_col[c].rc_data;
                ccount = rm->rm_col[c].rc_size;
                for (j = 0; j < nmissing; j++) {
                        cc = missing[j] + rm->rm_firstdatacol;
                        ASSERT3U(cc, >=, rm->rm_firstdatacol);
                        ASSERT3U(cc, <, rm->rm_cols);
                        ASSERT3U(cc, !=, c);

                        dst[j] = rm->rm_col[cc].rc_data;
                        dcount[j] = rm->rm_col[cc].rc_size;
                }

                ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

                for (x = 0; x < ccount; x++, src++) {
                        if (*src != 0)
                                log = vdev_raidz_log2[*src];

                        for (cc = 0; cc < nmissing; cc++) {
                                if (x >= dcount[cc])
                                        continue;

                                if (*src == 0) {
                                        val = 0;
                                } else {
                                        /* GF multiply: add logs mod 255 */
                                        if ((ll = log + invlog[cc][i]) >= 255)
                                                ll -= 255;
                                        val = vdev_raidz_pow2[ll];
                                }

                                /* First column seeds dst; rest accumulate. */
                                if (i == 0)
                                        dst[cc][x] = val;
                                else
                                        dst[cc][x] ^= val;
                        }
                }
        }

        free(p);
}

/*
 * Reconstruct the data columns listed in tgts[] (ntgts entries, sorted)
 * using matrix inversion over the surviving columns.  Returns a bitmask
 * of the parity columns used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
        int n, i, c, t, tt;
        int nmissing_rows;
        int missing_rows[VDEV_RAIDZ_MAXPARITY];
        int parity_map[VDEV_RAIDZ_MAXPARITY];

        uint8_t *p, *pp;
        size_t psize;

        uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *used;

        int code = 0;


        n = rm->rm_cols - rm->rm_firstdatacol;

        /*
         * Figure out which data columns are missing.
         */
        nmissing_rows = 0;
        for (t = 0; t < ntgts; t++) {
                if (tgts[t] >= rm->rm_firstdatacol) {
                        missing_rows[nmissing_rows++] =
                            tgts[t] - rm->rm_firstdatacol;
                }
        }

        /*
         * Figure out which parity columns to use to help generate the missing
         * data columns.
         */
        for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
                ASSERT(tt < ntgts);
                ASSERT(c < rm->rm_firstdatacol);

                /*
                 * Skip any targeted parity columns.
1083 */ 1084 if (c == tgts[tt]) { 1085 tt++; 1086 continue; 1087 } 1088 1089 code |= 1 << c; 1090 1091 parity_map[i] = c; 1092 i++; 1093 } 1094 1095 ASSERT(code != 0); 1096 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1097 1098 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1099 nmissing_rows * n + sizeof (used[0]) * n; 1100 p = malloc(psize); 1101 if (p == NULL) { 1102 printf("Out of memory\n"); 1103 return (code); 1104 } 1105 1106 for (pp = p, i = 0; i < nmissing_rows; i++) { 1107 rows[i] = pp; 1108 pp += n; 1109 invrows[i] = pp; 1110 pp += n; 1111 } 1112 used = pp; 1113 1114 for (i = 0; i < nmissing_rows; i++) { 1115 used[i] = parity_map[i]; 1116 } 1117 1118 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1119 if (tt < nmissing_rows && 1120 c == missing_rows[tt] + rm->rm_firstdatacol) { 1121 tt++; 1122 continue; 1123 } 1124 1125 ASSERT3S(i, <, n); 1126 used[i] = c; 1127 i++; 1128 } 1129 1130 /* 1131 * Initialize the interesting rows of the matrix. 1132 */ 1133 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1134 1135 /* 1136 * Invert the matrix. 1137 */ 1138 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1139 invrows, used); 1140 1141 /* 1142 * Reconstruct the missing data using the generated matrix. 1143 */ 1144 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1145 invrows, used); 1146 1147 free(p); 1148 1149 return (code); 1150 } 1151 1152 static int 1153 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1154 { 1155 int tgts[VDEV_RAIDZ_MAXPARITY]; 1156 int ntgts; 1157 int i, c; 1158 int code; 1159 int nbadparity, nbaddata; 1160 1161 /* 1162 * The tgts list must already be sorted. 
1163 */ 1164 for (i = 1; i < nt; i++) { 1165 ASSERT(t[i] > t[i - 1]); 1166 } 1167 1168 nbadparity = rm->rm_firstdatacol; 1169 nbaddata = rm->rm_cols - nbadparity; 1170 ntgts = 0; 1171 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1172 if (i < nt && c == t[i]) { 1173 tgts[ntgts++] = c; 1174 i++; 1175 } else if (rm->rm_col[c].rc_error != 0) { 1176 tgts[ntgts++] = c; 1177 } else if (c >= rm->rm_firstdatacol) { 1178 nbaddata--; 1179 } else { 1180 nbadparity--; 1181 } 1182 } 1183 1184 ASSERT(ntgts >= nt); 1185 ASSERT(nbaddata >= 0); 1186 ASSERT(nbaddata + nbadparity == ntgts); 1187 1188 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1189 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1190 ASSERT(code > 0); 1191 return (code); 1192 } 1193 1194 static raidz_map_t * 1195 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1196 uint64_t dcols, uint64_t nparity) 1197 { 1198 raidz_map_t *rm; 1199 uint64_t b = offset >> unit_shift; 1200 uint64_t s = size >> unit_shift; 1201 uint64_t f = b % dcols; 1202 uint64_t o = (b / dcols) << unit_shift; 1203 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1204 1205 q = s / (dcols - nparity); 1206 r = s - q * (dcols - nparity); 1207 bc = (r == 0 ? 0 : r + nparity); 1208 tot = s + nparity * (q + (r == 0 ? 
0 : 1)); 1209 1210 if (q == 0) { 1211 acols = bc; 1212 scols = MIN(dcols, roundup(bc, nparity + 1)); 1213 } else { 1214 acols = dcols; 1215 scols = dcols; 1216 } 1217 1218 ASSERT3U(acols, <=, scols); 1219 1220 rm = malloc(offsetof(raidz_map_t, rm_col[scols])); 1221 if (rm == NULL) 1222 return (rm); 1223 1224 rm->rm_cols = acols; 1225 rm->rm_scols = scols; 1226 rm->rm_bigcols = bc; 1227 rm->rm_skipstart = bc; 1228 rm->rm_missingdata = 0; 1229 rm->rm_missingparity = 0; 1230 rm->rm_firstdatacol = nparity; 1231 rm->rm_reports = 0; 1232 rm->rm_freed = 0; 1233 rm->rm_ecksuminjected = 0; 1234 1235 asize = 0; 1236 1237 for (c = 0; c < scols; c++) { 1238 col = f + c; 1239 coff = o; 1240 if (col >= dcols) { 1241 col -= dcols; 1242 coff += 1ULL << unit_shift; 1243 } 1244 rm->rm_col[c].rc_devidx = col; 1245 rm->rm_col[c].rc_offset = coff; 1246 rm->rm_col[c].rc_data = NULL; 1247 rm->rm_col[c].rc_error = 0; 1248 rm->rm_col[c].rc_tried = 0; 1249 rm->rm_col[c].rc_skipped = 0; 1250 1251 if (c >= acols) 1252 rm->rm_col[c].rc_size = 0; 1253 else if (c < bc) 1254 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1255 else 1256 rm->rm_col[c].rc_size = q << unit_shift; 1257 1258 asize += rm->rm_col[c].rc_size; 1259 } 1260 1261 ASSERT3U(asize, ==, tot << unit_shift); 1262 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1263 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1264 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1265 ASSERT3U(rm->rm_nskip, <=, nparity); 1266 1267 for (c = 0; c < rm->rm_firstdatacol; c++) { 1268 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size); 1269 if (rm->rm_col[c].rc_data == NULL) { 1270 c++; 1271 while (c != 0) 1272 free(rm->rm_col[--c].rc_data); 1273 free(rm); 1274 return (NULL); 1275 } 1276 } 1277 1278 rm->rm_col[c].rc_data = data; 1279 1280 for (c = c + 1; c < acols; c++) 1281 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1282 rm->rm_col[c - 1].rc_size; 1283 1284 /* 1285 * If all data stored spans all columns, 
there's a danger that parity 1286 * will always be on the same device and, since parity isn't read 1287 * during normal operation, that that device's I/O bandwidth won't be 1288 * used effectively. We therefore switch the parity every 1MB. 1289 * 1290 * ... at least that was, ostensibly, the theory. As a practical 1291 * matter unless we juggle the parity between all devices evenly, we 1292 * won't see any benefit. Further, occasional writes that aren't a 1293 * multiple of the LCM of the number of children and the minimum 1294 * stripe width are sufficient to avoid pessimal behavior. 1295 * Unfortunately, this decision created an implicit on-disk format 1296 * requirement that we need to support for all eternity, but only 1297 * for single-parity RAID-Z. 1298 * 1299 * If we intend to skip a sector in the zeroth column for padding 1300 * we must make sure to note this swap. We will never intend to 1301 * skip the first column since at least one data and one parity 1302 * column must appear in each row. 
1303 */ 1304 ASSERT(rm->rm_cols >= 2); 1305 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1306 1307 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1308 devidx = rm->rm_col[0].rc_devidx; 1309 o = rm->rm_col[0].rc_offset; 1310 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1311 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1312 rm->rm_col[1].rc_devidx = devidx; 1313 rm->rm_col[1].rc_offset = o; 1314 1315 if (rm->rm_skipstart == 0) 1316 rm->rm_skipstart = 1; 1317 } 1318 1319 return (rm); 1320 } 1321 1322 static void 1323 vdev_raidz_map_free(raidz_map_t *rm) 1324 { 1325 int c; 1326 1327 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1328 free(rm->rm_col[c].rc_data); 1329 1330 free(rm); 1331 } 1332 1333 static vdev_t * 1334 vdev_child(vdev_t *pvd, uint64_t devidx) 1335 { 1336 vdev_t *cvd; 1337 1338 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1339 if (cvd->v_id == devidx) 1340 break; 1341 } 1342 1343 return (cvd); 1344 } 1345 1346 /* 1347 * We keep track of whether or not there were any injected errors, so that 1348 * any ereports we generate can note it. 1349 */ 1350 static int 1351 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, 1352 uint64_t size __unused) 1353 { 1354 1355 return (zio_checksum_verify(spa, bp, data)); 1356 } 1357 1358 /* 1359 * Generate the parity from the data columns. If we tried and were able to 1360 * read the parity without error, verify that the generated parity matches the 1361 * data we read. If it doesn't, we fire off a checksum error. Return the 1362 * number such failures. 
1363 */ 1364 static int 1365 raidz_parity_verify(raidz_map_t *rm) 1366 { 1367 void *orig[VDEV_RAIDZ_MAXPARITY]; 1368 int c, ret = 0; 1369 raidz_col_t *rc; 1370 1371 for (c = 0; c < rm->rm_firstdatacol; c++) { 1372 rc = &rm->rm_col[c]; 1373 if (!rc->rc_tried || rc->rc_error != 0) 1374 continue; 1375 orig[c] = malloc(rc->rc_size); 1376 if (orig[c] != NULL) { 1377 bcopy(rc->rc_data, orig[c], rc->rc_size); 1378 } else { 1379 printf("Out of memory\n"); 1380 } 1381 } 1382 1383 vdev_raidz_generate_parity(rm); 1384 1385 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1386 rc = &rm->rm_col[c]; 1387 if (!rc->rc_tried || rc->rc_error != 0) 1388 continue; 1389 if (orig[c] == NULL || 1390 bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1391 rc->rc_error = ECKSUM; 1392 ret++; 1393 } 1394 free(orig[c]); 1395 } 1396 1397 return (ret); 1398 } 1399 1400 /* 1401 * Iterate over all combinations of bad data and attempt a reconstruction. 1402 * Note that the algorithm below is non-optimal because it doesn't take into 1403 * account how reconstruction is actually performed. For example, with 1404 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1405 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1406 * cases we'd only use parity information in column 0. 1407 */ 1408 static int 1409 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, 1410 void *data, off_t offset __unused, uint64_t bytes, int total_errors, 1411 int data_errors) 1412 { 1413 raidz_col_t *rc; 1414 void *orig[VDEV_RAIDZ_MAXPARITY]; 1415 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1416 int *tgts = &tstore[1]; 1417 int current, next, i, c, n; 1418 int code, ret = 0; 1419 1420 ASSERT(total_errors < rm->rm_firstdatacol); 1421 1422 /* 1423 * This simplifies one edge condition. 
1424 */ 1425 tgts[-1] = -1; 1426 1427 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1428 /* 1429 * Initialize the targets array by finding the first n columns 1430 * that contain no error. 1431 * 1432 * If there were no data errors, we need to ensure that we're 1433 * always explicitly attempting to reconstruct at least one 1434 * data column. To do this, we simply push the highest target 1435 * up into the data columns. 1436 */ 1437 for (c = 0, i = 0; i < n; i++) { 1438 if (i == n - 1 && data_errors == 0 && 1439 c < rm->rm_firstdatacol) { 1440 c = rm->rm_firstdatacol; 1441 } 1442 1443 while (rm->rm_col[c].rc_error != 0) { 1444 c++; 1445 ASSERT3S(c, <, rm->rm_cols); 1446 } 1447 1448 tgts[i] = c++; 1449 } 1450 1451 /* 1452 * Setting tgts[n] simplifies the other edge condition. 1453 */ 1454 tgts[n] = rm->rm_cols; 1455 1456 /* 1457 * These buffers were allocated in previous iterations. 1458 */ 1459 for (i = 0; i < n - 1; i++) { 1460 ASSERT(orig[i] != NULL); 1461 } 1462 1463 orig[n - 1] = malloc(rm->rm_col[0].rc_size); 1464 if (orig[n - 1] == NULL) { 1465 ret = ENOMEM; 1466 goto done; 1467 } 1468 1469 current = 0; 1470 next = tgts[current]; 1471 1472 while (current != n) { 1473 tgts[current] = next; 1474 current = 0; 1475 1476 /* 1477 * Save off the original data that we're going to 1478 * attempt to reconstruct. 1479 */ 1480 for (i = 0; i < n; i++) { 1481 ASSERT(orig[i] != NULL); 1482 c = tgts[i]; 1483 ASSERT3S(c, >=, 0); 1484 ASSERT3S(c, <, rm->rm_cols); 1485 rc = &rm->rm_col[c]; 1486 bcopy(rc->rc_data, orig[i], rc->rc_size); 1487 } 1488 1489 /* 1490 * Attempt a reconstruction and exit the outer loop on 1491 * success. 
1492 */ 1493 code = vdev_raidz_reconstruct(rm, tgts, n); 1494 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { 1495 for (i = 0; i < n; i++) { 1496 c = tgts[i]; 1497 rc = &rm->rm_col[c]; 1498 ASSERT(rc->rc_error == 0); 1499 rc->rc_error = ECKSUM; 1500 } 1501 1502 ret = code; 1503 goto done; 1504 } 1505 1506 /* 1507 * Restore the original data. 1508 */ 1509 for (i = 0; i < n; i++) { 1510 c = tgts[i]; 1511 rc = &rm->rm_col[c]; 1512 bcopy(orig[i], rc->rc_data, rc->rc_size); 1513 } 1514 1515 do { 1516 /* 1517 * Find the next valid column after the current 1518 * position.. 1519 */ 1520 for (next = tgts[current] + 1; 1521 next < rm->rm_cols && 1522 rm->rm_col[next].rc_error != 0; next++) 1523 continue; 1524 1525 ASSERT(next <= tgts[current + 1]); 1526 1527 /* 1528 * If that spot is available, we're done here. 1529 */ 1530 if (next != tgts[current + 1]) 1531 break; 1532 1533 /* 1534 * Otherwise, find the next valid column after 1535 * the previous position. 1536 */ 1537 for (c = tgts[current - 1] + 1; 1538 rm->rm_col[c].rc_error != 0; c++) 1539 continue; 1540 1541 tgts[current] = c; 1542 current++; 1543 1544 } while (current != n); 1545 } 1546 } 1547 n--; 1548 done: 1549 for (i = n - 1; i >= 0; i--) { 1550 free(orig[i]); 1551 } 1552 1553 return (ret); 1554 } 1555 1556 static int 1557 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1558 off_t offset, size_t bytes) 1559 { 1560 vdev_t *tvd = vd->v_top; 1561 vdev_t *cvd; 1562 raidz_map_t *rm; 1563 raidz_col_t *rc; 1564 int c, error; 1565 int unexpected_errors; 1566 int parity_errors; 1567 int parity_untried; 1568 int data_errors; 1569 int total_errors; 1570 int n; 1571 int tgts[VDEV_RAIDZ_MAXPARITY]; 1572 int code; 1573 1574 rc = NULL; /* gcc */ 1575 error = 0; 1576 1577 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1578 vd->v_nchildren, vd->v_nparity); 1579 if (rm == NULL) 1580 return (ENOMEM); 1581 1582 /* 1583 * Iterate over the columns in reverse order so that we hit the parity 1584 * 
last -- any errors along the way will force us to read the parity. 1585 */ 1586 for (c = rm->rm_cols - 1; c >= 0; c--) { 1587 rc = &rm->rm_col[c]; 1588 cvd = vdev_child(vd, rc->rc_devidx); 1589 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1590 if (c >= rm->rm_firstdatacol) 1591 rm->rm_missingdata++; 1592 else 1593 rm->rm_missingparity++; 1594 rc->rc_error = ENXIO; 1595 rc->rc_tried = 1; /* don't even try */ 1596 rc->rc_skipped = 1; 1597 continue; 1598 } 1599 #if 0 /* XXX: Too hard for the boot code. */ 1600 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1601 if (c >= rm->rm_firstdatacol) 1602 rm->rm_missingdata++; 1603 else 1604 rm->rm_missingparity++; 1605 rc->rc_error = ESTALE; 1606 rc->rc_skipped = 1; 1607 continue; 1608 } 1609 #endif 1610 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1611 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1612 rc->rc_offset, rc->rc_size); 1613 rc->rc_tried = 1; 1614 rc->rc_skipped = 0; 1615 } 1616 } 1617 1618 reconstruct: 1619 unexpected_errors = 0; 1620 parity_errors = 0; 1621 parity_untried = 0; 1622 data_errors = 0; 1623 total_errors = 0; 1624 1625 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1626 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1627 1628 for (c = 0; c < rm->rm_cols; c++) { 1629 rc = &rm->rm_col[c]; 1630 1631 if (rc->rc_error) { 1632 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1633 1634 if (c < rm->rm_firstdatacol) 1635 parity_errors++; 1636 else 1637 data_errors++; 1638 1639 if (!rc->rc_skipped) 1640 unexpected_errors++; 1641 1642 total_errors++; 1643 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1644 parity_untried++; 1645 } 1646 } 1647 1648 /* 1649 * There are three potential phases for a read: 1650 * 1. produce valid data from the columns read 1651 * 2. read all disks and try again 1652 * 3. perform combinatorial reconstruction 1653 * 1654 * Each phase is progressively both more expensive and less likely to 1655 * occur. 
If we encounter more errors than we can repair or all phases 1656 * fail, we have no choice but to return an error. 1657 */ 1658 1659 /* 1660 * If the number of errors we saw was correctable -- less than or equal 1661 * to the number of parity disks read -- attempt to produce data that 1662 * has a valid checksum. Naturally, this case applies in the absence of 1663 * any errors. 1664 */ 1665 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1666 int rv; 1667 1668 if (data_errors == 0) { 1669 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); 1670 if (rv == 0) { 1671 /* 1672 * If we read parity information (unnecessarily 1673 * as it happens since no reconstruction was 1674 * needed) regenerate and verify the parity. 1675 * We also regenerate parity when resilvering 1676 * so we can write it out to the failed device 1677 * later. 1678 */ 1679 if (parity_errors + parity_untried < 1680 rm->rm_firstdatacol) { 1681 n = raidz_parity_verify(rm); 1682 unexpected_errors += n; 1683 ASSERT(parity_errors + n <= 1684 rm->rm_firstdatacol); 1685 } 1686 goto done; 1687 } 1688 } else { 1689 /* 1690 * We either attempt to read all the parity columns or 1691 * none of them. If we didn't try to read parity, we 1692 * wouldn't be here in the correctable case. There must 1693 * also have been fewer parity errors than parity 1694 * columns or, again, we wouldn't be in this code path. 1695 */ 1696 ASSERT(parity_untried == 0); 1697 ASSERT(parity_errors < rm->rm_firstdatacol); 1698 1699 /* 1700 * Identify the data columns that reported an error. 
1701 */ 1702 n = 0; 1703 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1704 rc = &rm->rm_col[c]; 1705 if (rc->rc_error != 0) { 1706 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1707 tgts[n++] = c; 1708 } 1709 } 1710 1711 ASSERT(rm->rm_firstdatacol >= n); 1712 1713 code = vdev_raidz_reconstruct(rm, tgts, n); 1714 1715 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); 1716 if (rv == 0) { 1717 /* 1718 * If we read more parity disks than were used 1719 * for reconstruction, confirm that the other 1720 * parity disks produced correct data. This 1721 * routine is suboptimal in that it regenerates 1722 * the parity that we already used in addition 1723 * to the parity that we're attempting to 1724 * verify, but this should be a relatively 1725 * uncommon case, and can be optimized if it 1726 * becomes a problem. Note that we regenerate 1727 * parity when resilvering so we can write it 1728 * out to failed devices later. 1729 */ 1730 if (parity_errors < rm->rm_firstdatacol - n) { 1731 n = raidz_parity_verify(rm); 1732 unexpected_errors += n; 1733 ASSERT(parity_errors + n <= 1734 rm->rm_firstdatacol); 1735 } 1736 1737 goto done; 1738 } 1739 } 1740 } 1741 1742 /* 1743 * This isn't a typical situation -- either we got a read 1744 * error or a child silently returned bad data. Read every 1745 * block so we can try again with as much data and parity as 1746 * we can track down. If we've already been through once 1747 * before, all children will be marked as tried so we'll 1748 * proceed to combinatorial reconstruction. 
1749 */ 1750 unexpected_errors = 1; 1751 rm->rm_missingdata = 0; 1752 rm->rm_missingparity = 0; 1753 1754 n = 0; 1755 for (c = 0; c < rm->rm_cols; c++) { 1756 rc = &rm->rm_col[c]; 1757 1758 if (rc->rc_tried) 1759 continue; 1760 1761 cvd = vdev_child(vd, rc->rc_devidx); 1762 ASSERT(cvd != NULL); 1763 rc->rc_error = cvd->v_read(cvd, NULL, 1764 rc->rc_data, rc->rc_offset, rc->rc_size); 1765 if (rc->rc_error == 0) 1766 n++; 1767 rc->rc_tried = 1; 1768 rc->rc_skipped = 0; 1769 } 1770 /* 1771 * If we managed to read anything more, retry the 1772 * reconstruction. 1773 */ 1774 if (n > 0) 1775 goto reconstruct; 1776 1777 /* 1778 * At this point we've attempted to reconstruct the data given the 1779 * errors we detected, and we've attempted to read all columns. There 1780 * must, therefore, be one or more additional problems -- silent errors 1781 * resulting in invalid data rather than explicit I/O errors resulting 1782 * in absent data. We check if there is enough additional data to 1783 * possibly reconstruct the data and then perform combinatorial 1784 * reconstruction over all possible combinations. If that fails, 1785 * we're cooked. 1786 */ 1787 if (total_errors > rm->rm_firstdatacol) { 1788 error = EIO; 1789 } else if (total_errors < rm->rm_firstdatacol && 1790 (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes, 1791 total_errors, data_errors)) != 0) { 1792 /* 1793 * If we didn't use all the available parity for the 1794 * combinatorial reconstruction, verify that the remaining 1795 * parity is correct. 1796 */ 1797 if (code != (1 << rm->rm_firstdatacol) - 1) 1798 (void) raidz_parity_verify(rm); 1799 } else { 1800 /* 1801 * We're here because either: 1802 * 1803 * total_errors == rm_first_datacol, or 1804 * vdev_raidz_combrec() failed 1805 * 1806 * In either case, there is enough bad data to prevent 1807 * reconstruction. 1808 * 1809 * Start checksum ereports for all children which haven't 1810 * failed, and the IO wasn't speculative. 
1811 */ 1812 error = ECKSUM; 1813 } 1814 1815 done: 1816 vdev_raidz_map_free(rm); 1817 1818 return (error); 1819 } 1820