1 /* $OpenBSD: softraid_raid6.c,v 1.35 2013/03/25 16:01:49 jsing Exp $ */ 2 /* 3 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 4 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include "bio.h" 20 21 #include <sys/param.h> 22 #include <sys/systm.h> 23 #include <sys/buf.h> 24 #include <sys/device.h> 25 #include <sys/ioctl.h> 26 #include <sys/proc.h> 27 #include <sys/malloc.h> 28 #include <sys/kernel.h> 29 #include <sys/disk.h> 30 #include <sys/rwlock.h> 31 #include <sys/queue.h> 32 #include <sys/fcntl.h> 33 #include <sys/disklabel.h> 34 #include <sys/mount.h> 35 #include <sys/sensors.h> 36 #include <sys/stat.h> 37 #include <sys/conf.h> 38 #include <sys/uio.h> 39 40 #include <scsi/scsi_all.h> 41 #include <scsi/scsiconf.h> 42 #include <scsi/scsi_disk.h> 43 44 #include <dev/softraidvar.h> 45 #include <dev/rndvar.h> 46 47 uint8_t *gf_map[256]; 48 uint8_t gf_pow[768]; 49 int gf_log[256]; 50 51 /* RAID 6 functions. */ 52 int sr_raid6_create(struct sr_discipline *, struct bioc_createraid *, 53 int, int64_t); 54 int sr_raid6_assemble(struct sr_discipline *, struct bioc_createraid *, 55 int, void *); 56 int sr_raid6_alloc_resources(struct sr_discipline *); 57 int sr_raid6_free_resources(struct sr_discipline *); 58 int sr_raid6_rw(struct sr_workunit *); 59 int sr_raid6_openings(struct sr_discipline *); 60 void sr_raid6_intr(struct buf *); 61 void sr_raid6_set_chunk_state(struct sr_discipline *, int, int); 62 void sr_raid6_set_vol_state(struct sr_discipline *); 63 64 void sr_raid6_xorp(void *, void *, int); 65 void sr_raid6_xorq(void *, void *, int, int); 66 int sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t, 67 void *, int, int, void *, void *, int); 68 void sr_dump(void *, int); 69 void sr_raid6_scrub(struct sr_discipline *); 70 int sr_failio(struct sr_workunit *); 71 72 void *sr_get_block(struct sr_discipline *, int); 73 void sr_put_block(struct sr_discipline *, void *, int); 74 75 void gf_init(void); 76 uint8_t gf_inv(uint8_t); 77 int gf_premul(uint8_t); 78 uint8_t gf_mul(uint8_t, uint8_t); 79 80 #define SR_NOFAIL 0x00 81 #define SR_FAILX (1L << 0) 82 #define SR_FAILY (1L << 1) 83 #define SR_FAILP (1L << 2) 84 #define SR_FAILQ (1L << 3) 85 86 struct sr_raid6_opaque { 87 int gn; 88 void *pbuf; 89 void *qbuf; 90 }; 91 92 /* discipline initialisation. */ 93 void 94 sr_raid6_discipline_init(struct sr_discipline *sd) 95 { 96 /* Initialize GF256 tables. */ 97 gf_init(); 98 99 /* Fill out discipline members. */ 100 sd->sd_type = SR_MD_RAID6; 101 strlcpy(sd->sd_name, "RAID 6", sizeof(sd->sd_name)); 102 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE | 103 SR_CAP_REDUNDANT; 104 sd->sd_max_wu = SR_RAID6_NOWU; 105 106 /* Setup discipline specific function pointers. */ 107 sd->sd_alloc_resources = sr_raid6_alloc_resources; 108 sd->sd_assemble = sr_raid6_assemble; 109 sd->sd_create = sr_raid6_create; 110 sd->sd_free_resources = sr_raid6_free_resources; 111 sd->sd_openings = sr_raid6_openings; 112 sd->sd_scsi_rw = sr_raid6_rw; 113 sd->sd_scsi_intr = sr_raid6_intr; 114 sd->sd_set_chunk_state = sr_raid6_set_chunk_state; 115 sd->sd_set_vol_state = sr_raid6_set_vol_state; 116 } 117 118 int 119 sr_raid6_create(struct sr_discipline *sd, struct bioc_createraid *bc, 120 int no_chunk, int64_t coerced_size) 121 { 122 123 if (no_chunk < 4) 124 return EINVAL; 125 126 /* 127 * XXX add variable strip size later even though MAXPHYS is really 128 * the clever value, users like * to tinker with that type of stuff. 129 */ 130 sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS; 131 sd->sd_meta->ssdi.ssd_size = (coerced_size & 132 ~((sd->sd_meta->ssdi.ssd_strip_size >> DEV_BSHIFT) - 1)) * 133 (no_chunk - 2); 134 135 /* only if stripsize <= MAXPHYS */ 136 sd->sd_max_ccb_per_wu = max(6, 2 * no_chunk); 137 138 return 0; 139 } 140 141 int 142 sr_raid6_assemble(struct sr_discipline *sd, struct bioc_createraid *bc, 143 int no_chunk, void *data) 144 { 145 146 /* only if stripsize <= MAXPHYS */ 147 sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); 148 149 return 0; 150 } 151 152 int 153 sr_raid6_openings(struct sr_discipline *sd) 154 { 155 return (sd->sd_max_wu >> 1); /* 2 wu's per IO */ 156 } 157 158 int 159 sr_raid6_alloc_resources(struct sr_discipline *sd) 160 { 161 int rv = EINVAL; 162 163 DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n", 164 DEVNAME(sd->sd_sc)); 165 166 if (sr_wu_alloc(sd)) 167 goto bad; 168 if (sr_ccb_alloc(sd)) 169 goto bad; 170 171 /* setup runtime values */ 172 sd->mds.mdd_raid6.sr6_strip_bits = 173 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 174 if (sd->mds.mdd_raid6.sr6_strip_bits == -1) 175 goto bad; 176 177 rv = 0; 178 bad: 179 return (rv); 180 } 181 182 int 183 sr_raid6_free_resources(struct sr_discipline *sd) 184 { 185 int rv = EINVAL; 186 187 DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n", 188 DEVNAME(sd->sd_sc)); 189 190 sr_wu_free(sd); 191 sr_ccb_free(sd); 192 193 rv = 0; 194 return (rv); 195 } 196 197 void 198 sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 199 { 200 int old_state, s; 201 202 /* XXX this is for RAID 0 */ 203 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 204 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 205 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 206 207 /* ok to go to splbio since this only happens in error path */ 208 s = splbio(); 209 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 210 211 /* multiple IOs to the same chunk that fail will come through here */ 212 if (old_state == new_state) 213 goto done; 214 215 switch (old_state) { 216 case BIOC_SDONLINE: 217 switch (new_state) { 218 case BIOC_SDOFFLINE: 219 case BIOC_SDSCRUB: 220 break; 221 default: 222 goto die; 223 } 224 break; 225 226 case BIOC_SDOFFLINE: 227 if (new_state == BIOC_SDREBUILD) { 228 ; 229 } else 230 goto die; 231 break; 232 233 case BIOC_SDSCRUB: 234 switch (new_state) { 235 case BIOC_SDONLINE: 236 case BIOC_SDOFFLINE: 237 break; 238 default: 239 goto die; 240 } 241 break; 242 243 case BIOC_SDREBUILD: 244 switch (new_state) { 245 case BIOC_SDONLINE: 246 case BIOC_SDOFFLINE: 247 break; 248 default: 249 goto die; 250 } 251 break; 252 253 default: 254 die: 255 splx(s); /* XXX */ 256 panic("%s: %s: %s: invalid chunk state transition " 257 "%d -> %d", DEVNAME(sd->sd_sc), 258 sd->sd_meta->ssd_devname, 259 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 260 old_state, new_state); 261 /* NOTREACHED */ 262 } 263 264 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 265 sd->sd_set_vol_state(sd); 266 267 sd->sd_must_flush = 1; 268 workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL); 269 done: 270 splx(s); 271 } 272 273 void 274 sr_raid6_set_vol_state(struct sr_discipline *sd) 275 { 276 int states[SR_MAX_STATES]; 277 int new_state, i, s, nd; 278 int old_state = sd->sd_vol_status; 279 280 /* XXX this is for RAID 0 */ 281 282 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 283 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 284 285 nd = sd->sd_meta->ssdi.ssd_chunk_no; 286 287 for (i = 0; i < SR_MAX_STATES; i++) 288 states[i] = 0; 289 290 for (i = 0; i < nd; i++) { 291 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 292 if (s >= SR_MAX_STATES) 293 panic("%s: %s: %s: invalid chunk state", 294 DEVNAME(sd->sd_sc), 295 sd->sd_meta->ssd_devname, 296 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 297 states[s]++; 298 } 299 300 if (states[BIOC_SDONLINE] == nd) 301 new_state = BIOC_SVONLINE; 302 else if (states[BIOC_SDONLINE] < nd - 2) 303 new_state = BIOC_SVOFFLINE; 304 else if (states[BIOC_SDSCRUB] != 0) 305 new_state = BIOC_SVSCRUB; 306 else if (states[BIOC_SDREBUILD] != 0) 307 new_state = BIOC_SVREBUILD; 308 else if (states[BIOC_SDONLINE] < nd) 309 new_state = BIOC_SVDEGRADED; 310 else { 311 printf("old_state = %d, ", old_state); 312 for (i = 0; i < nd; i++) 313 printf("%d = %d, ", i, 314 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 315 panic("invalid new_state"); 316 } 317 318 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n", 319 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 320 old_state, new_state); 321 322 switch (old_state) { 323 case BIOC_SVONLINE: 324 switch (new_state) { 325 case BIOC_SVONLINE: /* can go to same state */ 326 case BIOC_SVOFFLINE: 327 case BIOC_SVDEGRADED: 328 case BIOC_SVREBUILD: /* happens on boot */ 329 break; 330 default: 331 goto die; 332 } 333 break; 334 335 case BIOC_SVOFFLINE: 336 /* XXX this might be a little too much */ 337 goto die; 338 339 case BIOC_SVSCRUB: 340 switch (new_state) { 341 case BIOC_SVONLINE: 342 case BIOC_SVOFFLINE: 343 case BIOC_SVDEGRADED: 344 case BIOC_SVSCRUB: /* can go to same state */ 345 break; 346 default: 347 goto die; 348 } 349 break; 350 351 case BIOC_SVBUILDING: 352 switch (new_state) { 353 case BIOC_SVONLINE: 354 case BIOC_SVOFFLINE: 355 case BIOC_SVBUILDING: /* can go to the same state */ 356 break; 357 default: 358 goto die; 359 } 360 break; 361 362 case BIOC_SVREBUILD: 363 switch (new_state) { 364 case BIOC_SVONLINE: 365 case BIOC_SVOFFLINE: 366 case BIOC_SVDEGRADED: 367 case BIOC_SVREBUILD: /* can go to the same state */ 368 break; 369 default: 370 goto die; 371 } 372 break; 373 374 case BIOC_SVDEGRADED: 375 switch (new_state) { 376 case BIOC_SVOFFLINE: 377 case BIOC_SVREBUILD: 378 case BIOC_SVDEGRADED: /* can go to the same state */ 379 break; 380 default: 381 goto die; 382 } 383 break; 384 385 default: 386 die: 387 panic("%s: %s: invalid volume state transition %d -> %d", 388 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 389 old_state, new_state); 390 /* NOTREACHED */ 391 } 392 393 sd->sd_vol_status = new_state; 394 } 395 396 /* modes: 397 * readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 398 * SR_CCBF_FREEBUF, qbuf, NULL, 0); 399 * readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 400 * SR_CCBF_FREEBUF, pbuf, NULL, 0); 401 * readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN, 402 * SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]); 403 */ 404 405 int 406 sr_raid6_rw(struct sr_workunit *wu) 407 { 408 struct sr_workunit *wu_r = NULL; 409 struct sr_discipline *sd = wu->swu_dis; 410 struct scsi_xfer *xs = wu->swu_xs; 411 struct sr_chunk *scp; 412 int s, fail, i, gxinv, pxinv; 413 daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk; 414 daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs; 415 daddr64_t strip_bits, length, strip_offs, datalen, row_size; 416 void *pbuf, *data, *qbuf; 417 418 /* blk and scsi error will be handled by sr_validate_io */ 419 if (sr_validate_io(wu, &blk, "sr_raid6_rw")) 420 goto bad; 421 422 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 423 strip_bits = sd->mds.mdd_raid6.sr6_strip_bits; 424 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2; 425 row_size = (no_chunk << strip_bits) >> DEV_BSHIFT; 426 427 data = xs->data; 428 datalen = xs->datalen; 429 lbaoffs = blk << DEV_BSHIFT; 430 431 if (xs->flags & SCSI_DATA_OUT) 432 /* create write workunit */ 433 if ((wu_r = scsi_io_get(&sd->sd_iopool, SCSI_NOSLEEP)) == NULL){ 434 printf("%s: can't get wu_r", DEVNAME(sd->sd_sc)); 435 goto bad; 436 } 437 438 wu->swu_blk_start = 0; 439 while (datalen != 0) { 440 strip_no = lbaoffs >> strip_bits; 441 strip_offs = lbaoffs & (strip_size - 1); 442 chunk_offs = (strip_no / no_chunk) << strip_bits; 443 phys_offs = chunk_offs + strip_offs + 444 (sd->sd_meta->ssd_data_offset << DEV_BSHIFT); 445 446 /* get size remaining in this stripe */ 447 length = MIN(strip_size - strip_offs, datalen); 448 449 /* map disk offset to parity/data drive */ 450 chunk = strip_no % no_chunk; 451 452 qchunk = (no_chunk + 1) - ((strip_no / no_chunk) % (no_chunk+2)); 453 if (qchunk == 0) 454 pchunk = no_chunk + 1; 455 else 456 pchunk = qchunk - 1; 457 if (chunk >= pchunk) 458 chunk++; 459 if (chunk >= qchunk) 460 chunk++; 461 462 lba = phys_offs >> DEV_BSHIFT; 463 464 /* XXX big hammer.. exclude I/O from entire stripe */ 465 if (wu->swu_blk_start == 0) 466 wu->swu_blk_start = (strip_no / no_chunk) * row_size; 467 wu->swu_blk_end = (strip_no / no_chunk) * row_size + (row_size - 1); 468 469 fail = 0; 470 fchunk = -1; 471 472 /* Get disk-fail flags */ 473 for (i=0; i< no_chunk+2; i++) { 474 scp = sd->sd_vol.sv_chunks[i]; 475 switch (scp->src_meta.scm_status) { 476 case BIOC_SDOFFLINE: 477 case BIOC_SDREBUILD: 478 case BIOC_SDHOTSPARE: 479 if (i == qchunk) 480 fail |= SR_FAILQ; 481 else if (i == pchunk) 482 fail |= SR_FAILP; 483 else if (i == chunk) 484 fail |= SR_FAILX; 485 else { 486 /* dual data-disk failure */ 487 fail |= SR_FAILY; 488 fchunk = i; 489 } 490 break; 491 } 492 } 493 if (xs->flags & SCSI_DATA_IN) { 494 if (!(fail & SR_FAILX)) { 495 /* drive is good. issue single read request */ 496 if (sr_raid6_addio(wu, chunk, lba, length, 497 data, xs->flags, 0, NULL, NULL, 0)) 498 goto bad; 499 } else if (fail & SR_FAILP) { 500 /* Dx, P failed */ 501 printf("Disk %llx offline, " 502 "regenerating Dx+P\n", chunk); 503 504 gxinv = gf_inv(gf_pow[chunk]); 505 506 /* Calculate: Dx = (Q^Dz*gz)*inv(gx) */ 507 memset(data, 0, length); 508 if (sr_raid6_addio(wu, qchunk, lba, length, NULL, 509 SCSI_DATA_IN, SR_CCBF_FREEBUF, NULL, data, 510 gxinv)) 511 goto bad; 512 513 /* Read Dz * gz * inv(gx) */ 514 for (i = 0; i < no_chunk+2; i++) { 515 if (i == qchunk || i == pchunk || i == chunk) 516 continue; 517 518 if (sr_raid6_addio(wu, i, lba, 519 length, NULL, SCSI_DATA_IN, 520 SR_CCBF_FREEBUF, NULL, 521 data, gf_mul(gf_pow[i], gxinv))) 522 goto bad; 523 } 524 525 /* data will contain correct value on completion */ 526 } else if (fail & SR_FAILY) { 527 /* Dx, Dy failed */ 528 printf("Disk %llx & %llx offline, " 529 "regenerating Dx+Dy\n", chunk, fchunk); 530 531 gxinv = gf_inv(gf_pow[chunk] ^ gf_pow[fchunk]); 532 pxinv = gf_mul(gf_pow[fchunk], gxinv); 533 534 /* read Q * inv(gx + gy) */ 535 memset(data, 0, length); 536 if (sr_raid6_addio(wu, qchunk, lba, 537 length, NULL, SCSI_DATA_IN, 538 SR_CCBF_FREEBUF, NULL, 539 data, gxinv)) 540 goto bad; 541 542 /* read P * gy * inv(gx + gy) */ 543 if (sr_raid6_addio(wu, pchunk, lba, 544 length, NULL, SCSI_DATA_IN, 545 SR_CCBF_FREEBUF, NULL, 546 data, pxinv)) 547 goto bad; 548 549 /* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz) ; Dx^Dy = P^Dz 550 * Q: sr_raid6_xorp(qbuf, --, length); 551 * P: sr_raid6_xorp(pbuf, --, length); 552 * Dz: sr_raid6_xorp(pbuf, --, length); 553 * sr_raid6_xorq(qbuf, --, length, gf_pow[i]); 554 */ 555 for (i = 0; i < no_chunk+2; i++) { 556 if (i == qchunk || i == pchunk || 557 i == chunk || i == fchunk) 558 continue; 559 560 /* read Dz * (gz + gy) * inv(gx + gy) */ 561 if (sr_raid6_addio(wu, i, lba, 562 length, NULL, SCSI_DATA_IN, 563 SR_CCBF_FREEBUF, NULL, data, 564 pxinv ^ gf_mul(gf_pow[i], gxinv))) 565 goto bad; 566 } 567 } else { 568 /* Two cases: single disk (Dx) or (Dx+Q) 569 * Dx = Dz ^ P (same as RAID5) 570 */ 571 printf("Disk %llx offline, " 572 "regenerating Dx%s\n", chunk, 573 fail & SR_FAILQ ? "+Q" : " single"); 574 575 /* Calculate: Dx = P^Dz 576 * P: sr_raid6_xorp(data, ---, length); 577 * Dz: sr_raid6_xorp(data, ---, length); 578 */ 579 memset(data, 0, length); 580 for (i = 0; i < no_chunk+2; i++) { 581 if (i != chunk && i != qchunk) { 582 /* Read Dz */ 583 if (sr_raid6_addio(wu, i, lba, 584 length, NULL, SCSI_DATA_IN, 585 SR_CCBF_FREEBUF, data, 586 NULL, 0)) 587 goto bad; 588 } 589 } 590 591 /* data will contain correct value on completion */ 592 } 593 } else { 594 /* XXX handle writes to failed/offline disk? */ 595 if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP)) 596 goto bad; 597 598 /* 599 * initialize pbuf with contents of new data to be 600 * written. This will be XORed with old data and old 601 * parity in the intr routine. The result in pbuf 602 * is the new parity data. 603 */ 604 qbuf = sr_get_block(sd, length); 605 if (qbuf == NULL) 606 goto bad; 607 608 pbuf = sr_get_block(sd, length); 609 if (pbuf == NULL) 610 goto bad; 611 612 /* Calculate P = Dn; Q = gn * Dn */ 613 if (gf_premul(gf_pow[chunk])) 614 goto bad; 615 sr_raid6_xorp(pbuf, data, length); 616 sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]); 617 618 /* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */ 619 if (sr_raid6_addio(wu_r, chunk, lba, length, NULL, 620 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf, 621 gf_pow[chunk])) 622 goto bad; 623 624 /* Read old xor-parity: P ^= P' */ 625 if (sr_raid6_addio(wu_r, pchunk, lba, length, NULL, 626 SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0)) 627 goto bad; 628 629 /* Read old q-parity: Q ^= Q' */ 630 if (sr_raid6_addio(wu_r, qchunk, lba, length, NULL, 631 SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0)) 632 goto bad; 633 634 /* write new data */ 635 if (sr_raid6_addio(wu, chunk, lba, length, data, 636 xs->flags, 0, NULL, NULL, 0)) 637 goto bad; 638 639 /* write new xor-parity */ 640 if (sr_raid6_addio(wu, pchunk, lba, length, pbuf, 641 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 642 goto bad; 643 644 /* write new q-parity */ 645 if (sr_raid6_addio(wu, qchunk, lba, length, qbuf, 646 xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0)) 647 goto bad; 648 } 649 650 /* advance to next block */ 651 lbaoffs += length; 652 datalen -= length; 653 data += length; 654 } 655 656 s = splbio(); 657 if (wu_r) { 658 /* collide write request with reads */ 659 wu_r->swu_blk_start = wu->swu_blk_start; 660 wu_r->swu_blk_end = wu->swu_blk_end; 661 662 wu->swu_state = SR_WU_DEFERRED; 663 wu_r->swu_collider = wu; 664 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link); 665 666 wu = wu_r; 667 } 668 669 /* rebuild io, let rebuild routine deal with it */ 670 if (wu->swu_flags & SR_WUF_REBUILD) 671 goto queued; 672 673 /* current io failed, restart */ 674 if (wu->swu_state == SR_WU_RESTART) 675 goto start; 676 677 /* deferred io failed, don't restart */ 678 if (wu->swu_state == SR_WU_REQUEUE) 679 goto queued; 680 681 if (sr_check_io_collision(wu)) 682 goto queued; 683 684 start: 685 sr_raid_startwu(wu); 686 queued: 687 splx(s); 688 return (0); 689 bad: 690 /* wu is unwound by sr_wu_put */ 691 if (wu_r) 692 scsi_io_put(&sd->sd_iopool, wu_r); 693 return (1); 694 } 695 696 /* Handle failure I/O completion */ 697 int 698 sr_failio(struct sr_workunit *wu) 699 { 700 struct sr_discipline *sd = wu->swu_dis; 701 struct sr_ccb *ccb; 702 703 if (!(wu->swu_flags & SR_WUF_FAIL)) 704 return (0); 705 706 /* Wu is a 'fake'.. don't do real I/O just intr */ 707 TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); 708 TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) 709 sr_raid6_intr(&ccb->ccb_buf); 710 return (1); 711 } 712 713 void 714 sr_raid6_intr(struct buf *bp) 715 { 716 struct sr_ccb *ccb = (struct sr_ccb *)bp; 717 struct sr_workunit *wu = ccb->ccb_wu, *wup; 718 struct sr_discipline *sd = wu->swu_dis; 719 struct scsi_xfer *xs = wu->swu_xs; 720 struct sr_softc *sc = sd->sd_sc; 721 struct sr_raid6_opaque *pq = ccb->ccb_opaque; 722 int s, pend; 723 724 DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n", 725 DEVNAME(sc), bp, xs); 726 727 DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d" 728 " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc), 729 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags, 730 ccb->ccb_buf.b_blkno, ccb->ccb_target); 731 732 s = splbio(); 733 734 if (ccb->ccb_buf.b_flags & B_ERROR) { 735 DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n", 736 DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target); 737 printf("io error: disk %x\n", ccb->ccb_target); 738 wu->swu_ios_failed++; 739 ccb->ccb_state = SR_CCB_FAILED; 740 if (ccb->ccb_target != -1) 741 sd->sd_set_chunk_state(sd, ccb->ccb_target, 742 BIOC_SDOFFLINE); 743 else 744 panic("%s: invalid target on wu: %p", DEVNAME(sc), wu); 745 } else { 746 ccb->ccb_state = SR_CCB_OK; 747 wu->swu_ios_succeeded++; 748 749 /* XOR data to result */ 750 if (pq) { 751 if (pq->pbuf) 752 /* Calculate xor-parity */ 753 sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data, 754 ccb->ccb_buf.b_bcount); 755 if (pq->qbuf) 756 /* Calculate q-parity */ 757 sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data, 758 ccb->ccb_buf.b_bcount, pq->gn); 759 free(pq, M_DEVBUF); 760 ccb->ccb_opaque = NULL; 761 } 762 } 763 764 /* free allocated data buffer */ 765 if (ccb->ccb_flag & SR_CCBF_FREEBUF) { 766 sr_put_block(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount); 767 ccb->ccb_buf.b_data = NULL; 768 } 769 wu->swu_ios_complete++; 770 771 DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n", 772 DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count, 773 wu->swu_ios_failed); 774 775 if (wu->swu_ios_complete >= wu->swu_io_count) { 776 777 /* if all ios failed, retry reads and give up on writes */ 778 if (wu->swu_ios_failed == wu->swu_ios_complete) { 779 if (xs->flags & SCSI_DATA_IN) { 780 printf("%s: retrying read on block %lld\n", 781 DEVNAME(sc), ccb->ccb_buf.b_blkno); 782 sr_ccb_put(ccb); 783 TAILQ_INIT(&wu->swu_ccb); 784 wu->swu_state = SR_WU_RESTART; 785 if (sd->sd_scsi_rw(wu)) 786 goto bad; 787 else 788 goto retry; 789 } else { 790 printf("%s: permanently fail write on block " 791 "%lld\n", DEVNAME(sc), 792 ccb->ccb_buf.b_blkno); 793 xs->error = XS_DRIVER_STUFFUP; 794 goto bad; 795 } 796 } 797 798 if (xs != NULL) 799 xs->error = XS_NOERROR; 800 801 pend = 0; 802 TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { 803 if (wu == wup) { 804 /* wu on pendq, remove */ 805 TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link); 806 pend = 1; 807 808 if (wu->swu_collider) { 809 if (wu->swu_ios_failed) 810 /* toss all ccbs and recreate */ 811 sr_raid_recreate_wu(wu->swu_collider); 812 813 /* restart deferred wu */ 814 wu->swu_collider->swu_state = 815 SR_WU_INPROGRESS; 816 TAILQ_REMOVE(&sd->sd_wu_defq, 817 wu->swu_collider, swu_link); 818 if (sr_failio(wu->swu_collider) == 0) 819 sr_raid_startwu(wu->swu_collider); 820 } 821 break; 822 } 823 } 824 825 if (!pend) 826 printf("%s: wu: %p not on pending queue\n", 827 DEVNAME(sc), wu); 828 829 if (wu->swu_flags & SR_WUF_REBUILD) { 830 if (wu->swu_xs->flags & SCSI_DATA_OUT) { 831 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 832 wakeup(wu); 833 } 834 } else { 835 if (xs != NULL) 836 sr_scsi_done(sd, xs); 837 else 838 scsi_io_put(&sd->sd_iopool, wu); 839 } 840 841 if (sd->sd_sync && sd->sd_wu_pending == 0) 842 wakeup(sd); 843 } 844 845 retry: 846 splx(s); 847 return; 848 bad: 849 xs->error = XS_DRIVER_STUFFUP; 850 if (wu->swu_flags & SR_WUF_REBUILD) { 851 wu->swu_flags |= SR_WUF_REBUILDIOCOMP; 852 wakeup(wu); 853 } else { 854 sr_scsi_done(sd, xs); 855 } 856 857 splx(s); 858 } 859 860 int 861 sr_raid6_addio(struct sr_workunit *wu, int dsk, daddr64_t blk, daddr64_t len, 862 void *data, int flag, int ccbflag, void *pbuf, void *qbuf, int gn) 863 { 864 struct sr_discipline *sd = wu->swu_dis; 865 struct sr_ccb *ccb; 866 struct sr_raid6_opaque *pqbuf; 867 868 ccb = sr_ccb_get(sd); 869 if (!ccb) 870 return (-1); 871 872 /* allocate temporary buffer */ 873 if (data == NULL) { 874 data = sr_get_block(sd, len); 875 if (data == NULL) 876 return (-1); 877 } 878 879 DNPRINTF(0, "%sio: %d.%llx %llx %p:%p\n", 880 flag & SCSI_DATA_IN ? "read" : "write", 881 dsk, blk, len, pbuf, qbuf); 882 883 ccb->ccb_flag = ccbflag; 884 if (flag & SCSI_POLL) { 885 ccb->ccb_buf.b_flags = 0; 886 ccb->ccb_buf.b_iodone = NULL; 887 } else { 888 ccb->ccb_buf.b_flags = B_CALL; 889 ccb->ccb_buf.b_iodone = sr_raid6_intr; 890 } 891 if (flag & SCSI_DATA_IN) 892 ccb->ccb_buf.b_flags |= B_READ; 893 else 894 ccb->ccb_buf.b_flags |= B_WRITE; 895 896 /* add offset for metadata */ 897 ccb->ccb_buf.b_flags |= B_PHYS; 898 ccb->ccb_buf.b_blkno = blk; 899 ccb->ccb_buf.b_bcount = len; 900 ccb->ccb_buf.b_bufsize = len; 901 ccb->ccb_buf.b_resid = len; 902 ccb->ccb_buf.b_data = data; 903 ccb->ccb_buf.b_error = 0; 904 ccb->ccb_buf.b_proc = curproc; 905 ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; 906 ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn; 907 ccb->ccb_buf.b_bq = NULL; 908 if ((ccb->ccb_buf.b_flags & B_READ) == 0) 909 ccb->ccb_buf.b_vp->v_numoutput++; 910 911 ccb->ccb_wu = wu; 912 ccb->ccb_target = dsk; 913 if (pbuf || qbuf) { 914 if (qbuf && gf_premul(gn)) 915 return (-1); 916 917 pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_ZERO | M_NOWAIT); 918 if (pqbuf == NULL) { 919 sr_ccb_put(ccb); 920 return (-1); 921 } 922 pqbuf->pbuf = pbuf; 923 pqbuf->qbuf = qbuf; 924 pqbuf->gn = gn; 925 ccb->ccb_opaque = pqbuf; 926 } 927 928 LIST_INIT(&ccb->ccb_buf.b_dep); 929 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 930 931 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " 932 "b_blkno: %x b_flags 0x%0x b_data %p\n", 933 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 934 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, 935 ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); 936 937 wu->swu_io_count++; 938 939 return (0); 940 } 941 942 /* Perform RAID6 parity calculation. 943 * P=xor parity, Q=GF256 parity, D=data, gn=disk# */ 944 void 945 sr_raid6_xorp(void *p, void *d, int len) 946 { 947 uint32_t *pbuf = p, *data = d; 948 949 len >>= 2; 950 while (len--) 951 *pbuf++ ^= *data++; 952 } 953 954 void 955 sr_raid6_xorq(void *q, void *d, int len, int gn) 956 { 957 uint32_t *qbuf = q, *data = d, x; 958 uint8_t *gn_map = gf_map[gn]; 959 960 len >>= 2; 961 while (len--) { 962 x = *data++; 963 *qbuf++ ^= (((uint32_t)gn_map[x & 0xff]) | 964 ((uint32_t)gn_map[(x >> 8) & 0xff] << 8) | 965 ((uint32_t)gn_map[(x >> 16) & 0xff] << 16) | 966 ((uint32_t)gn_map[(x >> 24) & 0xff] << 24)); 967 } 968 } 969 970 /* Create GF256 log/pow tables: polynomial = 0x11D */ 971 void 972 gf_init(void) 973 { 974 int i; 975 uint8_t p = 1; 976 977 /* use 2N pow table to avoid using % in multiply */ 978 for (i=0; i<256; i++) { 979 gf_log[p] = i; 980 gf_pow[i] = gf_pow[i+255] = p; 981 p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); 982 } 983 gf_log[0] = 512; 984 } 985 986 uint8_t 987 gf_inv(uint8_t a) 988 { 989 return gf_pow[255 - gf_log[a]]; 990 } 991 992 uint8_t 993 gf_mul(uint8_t a, uint8_t b) 994 { 995 return gf_pow[gf_log[a] + gf_log[b]]; 996 } 997 998 /* Precalculate multiplication tables for drive gn */ 999 int 1000 gf_premul(uint8_t gn) 1001 { 1002 int i; 1003 1004 if (gf_map[gn] != NULL) 1005 return (0); 1006 1007 if ((gf_map[gn] = malloc(256, M_DEVBUF, M_ZERO | M_NOWAIT)) == NULL) 1008 return (-1); 1009 1010 for (i=0; i<256; i++) 1011 gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]]; 1012 return (0); 1013 } 1014