/* $OpenBSD: softraid_raid6.c,v 1.6 2009/08/26 20:14:44 jordan Exp $ */
/*
 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us>
 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "bio.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/disk.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <sys/fcntl.h>
#include <sys/disklabel.h>
#include <sys/mount.h>
#include <sys/sensors.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/uio.h>

#include <scsi/scsi_all.h>
#include <scsi/scsiconf.h>
#include <scsi/scsi_disk.h>

#include <dev/softraidvar.h>
#include <dev/rndvar.h>

uint8_t	*gf_map[256];
uint8_t	 gf_pow[768];
int	 gf_log[256];

/* RAID 6 functions. */
int	sr_raid6_alloc_resources(struct sr_discipline *);
int	sr_raid6_free_resources(struct sr_discipline *);
int	sr_raid6_rw(struct sr_workunit *);
int	sr_raid6_openings(struct sr_discipline *);
void	sr_raid6_intr(struct buf *);
void	sr_raid6_recreate_wu(struct sr_workunit *);
void	sr_raid6_set_chunk_state(struct sr_discipline *, int, int);
void	sr_raid6_set_vol_state(struct sr_discipline *);

void	sr_raid6_xorp(void *, void *, int);
void	sr_raid6_xorq(void *, void *, int, int);
int	sr_raid6_addio(struct sr_workunit *wu, int, daddr64_t, daddr64_t,
	    void *, int, int, void *, void *, int);
void	sr_dump(void *, int);
void	sr_raid6_scrub(struct sr_discipline *);
int	sr_failio(struct sr_workunit *);

void	*sr_get_block(struct sr_discipline *, int);
void	sr_put_block(struct sr_discipline *, void *);

void	gf_init(void);
uint8_t	gf_inv(uint8_t);
int	gf_premul(uint8_t);

#define SR_NOFAIL	0x00
#define SR_FAILX	(1L << 0)
#define SR_FAILY	(1L << 1)
#define SR_FAILP	(1L << 2)
#define SR_FAILQ	(1L << 3)

struct sr_raid6_opaque {
	int	 gn;
	void	*pbuf;
	void	*qbuf;
};

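/*
 * Background sketch (editorial comment, not part of the original driver):
 * every RAID 6 stripe keeps two parity strips computed over GF(2^8) with
 * polynomial 0x11D and generator 2:
 *	P = xor of all data strips Di
 *	Q = xor of gf_pow[i] * Di	(gf_pow[i] = 2^i, i = chunk index)
 * so any two missing strips of a stripe can be reconstructed.  Tiny worked
 * example with two data bytes D0 = 0x12, D1 = 0x34:
 *	P = 0x12 ^ 0x34 = 0x26
 *	Q = 1*0x12 ^ 2*0x34 = 0x12 ^ 0x68 = 0x7a
 * and if D1 and P are lost, D1 = gf_inv(2) * (Q ^ 1*0x12) = 0x34.
 */
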
/* discipline initialisation. */
void
sr_raid6_discipline_init(struct sr_discipline *sd)
{
	/* Initialize GF256 tables */
	gf_init();

	/* fill out discipline members. */
	sd->sd_max_ccb_per_wu = max(6, 2 * sd->sd_meta->ssdi.ssd_chunk_no); /* only if stripsize <= MAXPHYS */
	sd->sd_max_wu = SR_RAID6_NOWU;
	sd->sd_rebuild = 0;

	/* setup discipline pointers. */
	sd->sd_alloc_resources = sr_raid6_alloc_resources;
	sd->sd_free_resources = sr_raid6_free_resources;
	sd->sd_start_discipline = NULL;
	sd->sd_scsi_inquiry = sr_raid_inquiry;
	sd->sd_scsi_read_cap = sr_raid_read_cap;
	sd->sd_scsi_tur = sr_raid_tur;
	sd->sd_scsi_req_sense = sr_raid_request_sense;
	sd->sd_scsi_start_stop = sr_raid_start_stop;
	sd->sd_scsi_sync = sr_raid_sync;
	sd->sd_scsi_rw = sr_raid6_rw;
	sd->sd_set_chunk_state = sr_raid6_set_chunk_state;
	sd->sd_set_vol_state = sr_raid6_set_vol_state;
	sd->sd_openings = sr_raid6_openings;
}

int
sr_raid6_openings(struct sr_discipline *sd)
{
	return (sd->sd_max_wu >> 1); /* 2 wu's per IO */
}

int
sr_raid6_alloc_resources(struct sr_discipline *sd)
{
	int rv = EINVAL;

	if (!sd)
		return (rv);

	DNPRINTF(SR_D_DIS, "%s: sr_raid6_alloc_resources\n",
	    DEVNAME(sd->sd_sc));

	if (sr_wu_alloc(sd))
		goto bad;
	if (sr_ccb_alloc(sd))
		goto bad;

	/* setup runtime values */
	sd->mds.mdd_raid6.sr6_strip_bits =
	    sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size);
	if (sd->mds.mdd_raid6.sr6_strip_bits == -1)
		goto bad;

	rv = 0;
bad:
	return (rv);
}

int
sr_raid6_free_resources(struct sr_discipline *sd)
{
	int rv = EINVAL;

	if (!sd)
		return (rv);

	DNPRINTF(SR_D_DIS, "%s: sr_raid6_free_resources\n",
	    DEVNAME(sd->sd_sc));

	sr_wu_free(sd);
	sr_ccb_free(sd);

	rv = 0;
	return (rv);
}

void
sr_raid6_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
	int old_state, s;

	/* XXX this is for RAID 0 */
	DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state);

	/* ok to go to splbio since this only happens in error path */
	s = splbio();
	old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status;

	/* multiple IOs to the same chunk that fail will come through here */
	if (old_state == new_state)
		goto done;

	switch (old_state) {
	case BIOC_SDONLINE:
		switch (new_state) {
		case BIOC_SDOFFLINE:
		case BIOC_SDSCRUB:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDOFFLINE:
		if (new_state == BIOC_SDREBUILD) {
			;
		} else
			goto die;
		break;

	case BIOC_SDSCRUB:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SDREBUILD:
		switch (new_state) {
		case BIOC_SDONLINE:
		case BIOC_SDOFFLINE:
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		splx(s); /* XXX */
		panic("%s: %s: %s: invalid chunk state transition "
		    "%d -> %d\n", DEVNAME(sd->sd_sc),
		    sd->sd_meta->ssd_devname,
		    sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state;
	sd->sd_set_vol_state(sd);

	sd->sd_must_flush = 1;
	workq_add_task(NULL, 0, sr_meta_save_callback, sd, NULL);
done:
	splx(s);
}

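/*
 * Editorial note (added comment): the volume state below is derived from the
 * per-chunk states.  With two parity chunks, RAID 6 remains usable (degraded)
 * with up to two chunks missing; once fewer than chunk_no - 2 chunks are
 * online the volume is marked offline.
 */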
void
sr_raid6_set_vol_state(struct sr_discipline *sd)
{
	int states[SR_MAX_STATES];
	int new_state, i, s, nd;
	int old_state = sd->sd_vol_status;

	/* XXX this is for RAID 0 */

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname);

	nd = sd->sd_meta->ssdi.ssd_chunk_no;

	for (i = 0; i < SR_MAX_STATES; i++)
		states[i] = 0;

	for (i = 0; i < nd; i++) {
		s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status;
		if (s >= SR_MAX_STATES)
			panic("%s: %s: %s: invalid chunk state",
			    DEVNAME(sd->sd_sc),
			    sd->sd_meta->ssd_devname,
			    sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname);
		states[s]++;
	}

	if (states[BIOC_SDONLINE] == nd)
		new_state = BIOC_SVONLINE;
	else if (states[BIOC_SDONLINE] < nd - 2)
		new_state = BIOC_SVOFFLINE;
	else if (states[BIOC_SDSCRUB] != 0)
		new_state = BIOC_SVSCRUB;
	else if (states[BIOC_SDREBUILD] != 0)
		new_state = BIOC_SVREBUILD;
	else if (states[BIOC_SDONLINE] < nd)
		new_state = BIOC_SVDEGRADED;
	else {
		printf("old_state = %d, ", old_state);
		for (i = 0; i < nd; i++)
			printf("%d = %d, ", i,
			    sd->sd_vol.sv_chunks[i]->src_meta.scm_status);
		panic("invalid new_state");
	}

	DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state %d -> %d\n",
	    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
	    old_state, new_state);

	switch (old_state) {
	case BIOC_SVONLINE:
		switch (new_state) {
		case BIOC_SVONLINE: /* can go to same state */
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* happens on boot */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVOFFLINE:
		/* XXX this might be a little too much */
		goto die;

	case BIOC_SVSCRUB:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVSCRUB: /* can go to same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVBUILDING:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVBUILDING: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVREBUILD:
		switch (new_state) {
		case BIOC_SVONLINE:
		case BIOC_SVOFFLINE:
		case BIOC_SVDEGRADED:
		case BIOC_SVREBUILD: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	case BIOC_SVDEGRADED:
		switch (new_state) {
		case BIOC_SVOFFLINE:
		case BIOC_SVREBUILD:
		case BIOC_SVDEGRADED: /* can go to the same state */
			break;
		default:
			goto die;
		}
		break;

	default:
die:
		panic("%s: %s: invalid volume state transition %d -> %d\n",
		    DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname,
		    old_state, new_state);
		/* NOTREACHED */
	}

	sd->sd_vol_status = new_state;
}

/* modes:
 *   readq: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, qbuf, NULL, 0);
 *   readp: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, pbuf, NULL, 0);
 *   readx: sr_raid6_addio(i, lba, length, NULL, SCSI_DATA_IN,
 *		SR_CCBF_FREEBUF, pbuf, qbuf, gf_pow[i]);
 */

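/*
 * Recovery algebra used by the read path below (editorial summary, added
 * for clarity).  Writing gi = gf_pow[i] and letting the sums run over the
 * surviving data strips Dz:
 *	single data failure:	Dx = P ^ xor Dz
 *	data + P failure:	Dx = gf_inv(gx) * (Q ^ xor gz*Dz)
 *	double data failure:	with A = P ^ xor Dz = Dx ^ Dy and
 *				B = Q ^ xor gz*Dz = gx*Dx ^ gy*Dy,
 *		Dx = gf_inv(gx ^ gy)*B ^ gf_inv(gf_pow[255 + x - y] ^ 1)*A
 * which is where the gf_inv() coefficients passed to the fake write work
 * units come from.
 */
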
int
sr_raid6_rw(struct sr_workunit *wu)
{
	struct sr_workunit *wu_w = NULL;
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;
	struct sr_chunk *scp;
	int s, fail, i;
	daddr64_t blk, lbaoffs, strip_no, chunk, qchunk, pchunk, fchunk;
	daddr64_t strip_size, no_chunk, lba, chunk_offs, phys_offs;
	daddr64_t strip_bits, length, strip_offs, datalen;
	void *pbuf, *data, *qbuf;

	/* blk and scsi error will be handled by sr_validate_io */
	if (sr_validate_io(wu, &blk, "sr_raid6_rw"))
		goto bad;

	strip_size = sd->sd_meta->ssdi.ssd_strip_size;
	strip_bits = sd->mds.mdd_raid6.sr6_strip_bits;
	no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 2;

	data = xs->data;
	datalen = xs->datalen;
	lbaoffs = blk << DEV_BSHIFT;

	if (xs->flags & SCSI_DATA_OUT)
		/* create write workunit */
		if ((wu_w = sr_wu_get(sd, 0)) == NULL) {
			printf("%s: can't get wu_w\n", DEVNAME(sd->sd_sc));
			goto bad;
		}

	wu->swu_blk_start = 0;
	while (datalen != 0) {
		strip_no = lbaoffs >> strip_bits;
		strip_offs = lbaoffs & (strip_size - 1);
		chunk_offs = (strip_no / no_chunk) << strip_bits;
		phys_offs = chunk_offs + strip_offs +
		    ((SR_META_OFFSET + SR_META_SIZE) << DEV_BSHIFT);

		/* get size remaining in this stripe */
		length = MIN(strip_size - strip_offs, datalen);

		/* map disk offset to parity/data drive */
		chunk = strip_no % no_chunk;

		qchunk = (no_chunk + 1) -
		    ((strip_no / no_chunk) % (no_chunk+2));
		if (qchunk == 0)
			pchunk = no_chunk + 1;
		else
			pchunk = qchunk - 1;
		if (chunk >= pchunk)
			chunk++;
		if (chunk >= qchunk)
			chunk++;

		lba = phys_offs >> DEV_BSHIFT;

		/* XXX big hammer.. exclude I/O from entire stripe */
		if (wu->swu_blk_start == 0)
			wu->swu_blk_start = chunk_offs >> DEV_BSHIFT;
		wu->swu_blk_end = ((chunk_offs +
		    (no_chunk << strip_bits)) >> DEV_BSHIFT) - 1;

		fail = 0;
		fchunk = -1;

		/* Get disk-fail flags */
		for (i = 0; i < no_chunk+2; i++) {
			scp = sd->sd_vol.sv_chunks[i];
			switch (scp->src_meta.scm_status) {
			case BIOC_SDOFFLINE:
			case BIOC_SDREBUILD:
			case BIOC_SDHOTSPARE:
				if (i == qchunk)
					fail |= SR_FAILQ;
				else if (i == pchunk)
					fail |= SR_FAILP;
				else if (i == chunk)
					fail |= SR_FAILX;
				else {
					/* dual data-disk failure */
					fail |= SR_FAILY;
					fchunk = i;
				}
				break;
			}
		}
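		/*
		 * Editorial summary of the cases handled below (comment added
		 * for clarity, not in the original source): with no failure
		 * on the target chunk the read goes straight to disk; if the
		 * data chunk and P are gone the strip is rebuilt from Q; if
		 * two data chunks are gone both P and Q are needed; a single
		 * data-chunk failure (with or without Q) is rebuilt from P
		 * alone, as in RAID 5.
		 */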
		if (xs->flags & SCSI_DATA_IN) {
			if (!(fail & SR_FAILX)) {
				/* drive is good. issue single read request */
				if (sr_raid6_addio(wu, chunk, lba, length,
				    data, xs->flags, 0, NULL, NULL, 0))
					goto bad;
			} else if (fail & SR_FAILP) {
				/* Dx, P failed */
				printf("Disk %llx offline, "
				    "regenerating Dx+P\n", chunk);

				qbuf = sr_get_block(sd, length);
				if (qbuf == NULL)
					goto bad;

				/* Calculate: Dx*gx = Q^(Dz*gz)
				 *   Q:  sr_raid6_xorp(data, --, length);
				 *   Dz: sr_raid6_xorq(data, --, length,
				 *       gf_pow[i]);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk) {
						/* Read Q */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, qbuf,
						    NULL, 0))
							goto bad;
					} else if (i != chunk && i != pchunk) {
						/* Read Dz * gz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, NULL,
						    qbuf, gf_pow[i]))
							goto bad;
					}
				}

				/* run fake wu when read i/o is complete */
				if (wu_w == NULL &&
				    (wu_w = sr_wu_get(sd, 0)) == NULL)
					goto bad;

				wu_w->swu_flags |= SR_WUF_FAIL;
				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
				    SR_CCBF_FREEBUF, NULL, data,
				    gf_inv(gf_pow[chunk])))
					goto bad;
			} else if (fail & SR_FAILY) {
				/* Dx, Dy failed */
				printf("Disk %llx & %llx offline, "
				    "regenerating Dx+Dy\n", chunk, fchunk);

				qbuf = sr_get_block(sd, length);
				if (qbuf == NULL)
					goto bad;
				pbuf = sr_get_block(sd, length);
				if (pbuf == NULL)
					goto bad;

				/* Calculate: Dx*gx^Dy*gy = Q^(Dz*gz);
				 *	      Dx^Dy = P^Dz
				 *   Q:  sr_raid6_xorp(qbuf, --, length);
				 *   P:  sr_raid6_xorp(pbuf, --, length);
				 *   Dz: sr_raid6_xorp(pbuf, --, length);
				 *	 sr_raid6_xorq(qbuf, --, length,
				 *	 gf_pow[i]);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i == qchunk) {
						/* read Q */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, qbuf,
						    NULL, 0))
							goto bad;
					} else if (i == pchunk) {
						/* read P */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, pbuf,
						    NULL, 0))
							goto bad;
					} else if (i != chunk) {
						/* read Dz * gz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, pbuf,
						    qbuf, gf_pow[i]))
							goto bad;
					}
				}

				/* run fake wu when read i/o is complete */
				if (wu_w == NULL &&
				    (wu_w = sr_wu_get(sd, 0)) == NULL)
					goto bad;

				wu_w->swu_flags |= SR_WUF_FAIL;
				if (sr_raid6_addio(wu_w, 0, 0, length, pbuf, 0,
				    SR_CCBF_FREEBUF, NULL, data,
				    gf_inv(gf_pow[255+chunk-fchunk] ^ 1)))
					goto bad;
				if (sr_raid6_addio(wu_w, 0, 0, length, qbuf, 0,
				    SR_CCBF_FREEBUF, NULL, data,
				    gf_inv(gf_pow[chunk] ^ gf_pow[fchunk])))
					goto bad;
			} else {
				/* Two cases: single disk (Dx) or (Dx+Q)
				 *   Dx = Dz ^ P (same as RAID5)
				 */
				printf("Disk %llx offline, "
				    "regenerating Dx%s\n", chunk,
				    fail & SR_FAILQ ? "+Q" : " single");

				/* Calculate: Dx = P^Dz
				 *   P:  sr_raid6_xorp(data, ---, length);
				 *   Dz: sr_raid6_xorp(data, ---, length);
				 */
				memset(data, 0, length);
				for (i = 0; i < no_chunk+2; i++) {
					if (i != chunk && i != qchunk) {
						/* Read Dz */
						if (sr_raid6_addio(wu, i, lba,
						    length, NULL, SCSI_DATA_IN,
						    SR_CCBF_FREEBUF, data,
						    NULL, 0))
							goto bad;
					}
				}

				/* data will contain correct value on
				   completion */
			}
		} else {
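			/*
			 * Editorial note on the write path below (comment
			 * added for clarity): pbuf/qbuf start out holding the
			 * new data (and gn * new data).  The reads queued
			 * here XOR in the old data and the old P/Q, so by the
			 * time the deferred write wu runs, pbuf holds
			 * P' ^ Dn' ^ Dn and qbuf holds Q' ^ gn*(Dn' ^ Dn),
			 * i.e. the updated parity strips.
			 */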
			/* XXX handle writes to failed/offline disk? */
			if (fail & (SR_FAILX|SR_FAILQ|SR_FAILP))
				goto bad;

			/*
			 * initialize pbuf with contents of new data to be
			 * written. This will be XORed with old data and old
			 * parity in the intr routine. The result in pbuf
			 * is the new parity data.
			 */
			qbuf = sr_get_block(sd, length);
			if (qbuf == NULL)
				goto bad;

			pbuf = sr_get_block(sd, length);
			if (pbuf == NULL)
				goto bad;

			/* Calculate P = Dn; Q = gn * Dn */
			if (gf_premul(gf_pow[chunk]))
				goto bad;
			sr_raid6_xorp(pbuf, data, length);
			sr_raid6_xorq(qbuf, data, length, gf_pow[chunk]);

			/* Read old data: P ^= Dn' ; Q ^= (gn * Dn') */
			if (sr_raid6_addio(wu, chunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, qbuf,
			    gf_pow[chunk]))
				goto bad;

			/* Read old xor-parity: P ^= P' */
			if (sr_raid6_addio(wu, pchunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, pbuf, NULL, 0))
				goto bad;

			/* Read old q-parity: Q ^= Q' */
			if (sr_raid6_addio(wu, qchunk, lba, length, NULL,
			    SCSI_DATA_IN, SR_CCBF_FREEBUF, qbuf, NULL, 0))
				goto bad;

			/* write new data */
			if (sr_raid6_addio(wu_w, chunk, lba, length, data,
			    xs->flags, 0, NULL, NULL, 0))
				goto bad;

			/* write new xor-parity */
			if (sr_raid6_addio(wu_w, pchunk, lba, length, pbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;

			/* write new q-parity */
			if (sr_raid6_addio(wu_w, qchunk, lba, length, qbuf,
			    xs->flags, SR_CCBF_FREEBUF, NULL, NULL, 0))
				goto bad;
		}

		/* advance to next block */
		lbaoffs += length;
		datalen -= length;
		data += length;
	}

	s = splbio();
	if (wu_w) {
		/* collide write request with reads */
		wu_w->swu_blk_start = wu->swu_blk_start;
		wu_w->swu_blk_end = wu->swu_blk_end;

		/*
		 * put xs block in write request (scsi_done not called till
		 * write completes)
		 */
		wu_w->swu_xs = wu->swu_xs;
		wu->swu_xs = NULL;

		wu_w->swu_state = SR_WU_DEFERRED;
		wu->swu_collider = wu_w;
		TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
	}

	/* rebuild io, let rebuild routine deal with it */
	if (wu->swu_flags & SR_WUF_REBUILD)
		goto queued;

	/* current io failed, restart */
	if (wu->swu_state == SR_WU_RESTART)
		goto start;

	/* deferred io failed, don't restart */
	if (wu->swu_state == SR_WU_REQUEUE)
		goto queued;

	if (sr_check_io_collision(wu))
		goto queued;

start:
	sr_raid_startwu(wu);
queued:
	splx(s);
	return (0);
bad:
	/* wu is unwound by sr_wu_put */
	if (wu_w)
		sr_wu_put(wu_w);
	return (1);
}

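/*
 * Editorial note (added comment): a work unit flagged SR_WUF_FAIL never
 * touches disk.  sr_failio() below feeds its ccbs straight into
 * sr_raid6_intr(), which applies the pbuf/qbuf XOR accumulation recorded in
 * each ccb's sr_raid6_opaque, so the parity/recovery arithmetic runs as if
 * the "I/O" had completed.
 */
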
/* Handle failure I/O completion */
int
sr_failio(struct sr_workunit *wu)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct sr_ccb *ccb;

	if (!(wu->swu_flags & SR_WUF_FAIL))
		return (0);

	/* Wu is a 'fake'.. don't do real I/O just intr */
	TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
	TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link)
		sr_raid6_intr(&ccb->ccb_buf);
	return (1);
}

void
sr_raid6_intr(struct buf *bp)
{
	struct sr_ccb *ccb = (struct sr_ccb *)bp;
	struct sr_workunit *wu = ccb->ccb_wu, *wup;
	struct sr_discipline *sd = wu->swu_dis;
	struct scsi_xfer *xs = wu->swu_xs;
	struct sr_softc *sc = sd->sd_sc;
	struct sr_raid6_opaque *pq = ccb->ccb_opaque;
	int s, pend;

	DNPRINTF(SR_D_INTR, "%s: sr_intr bp %p xs %p\n",
	    DEVNAME(sc), bp, xs);

	DNPRINTF(SR_D_INTR, "%s: sr_intr: b_bcount: %d b_resid: %d"
	    " b_flags: 0x%0x block: %lld target: %d\n", DEVNAME(sc),
	    ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_resid, ccb->ccb_buf.b_flags,
	    ccb->ccb_buf.b_blkno, ccb->ccb_target);

	s = splbio();

	if (ccb->ccb_buf.b_flags & B_ERROR) {
		DNPRINTF(SR_D_INTR, "%s: i/o error on block %lld target: %d\n",
		    DEVNAME(sc), ccb->ccb_buf.b_blkno, ccb->ccb_target);
		printf("io error: disk %x\n", ccb->ccb_target);
		wu->swu_ios_failed++;
		ccb->ccb_state = SR_CCB_FAILED;
		if (ccb->ccb_target != -1)
			sd->sd_set_chunk_state(sd, ccb->ccb_target,
			    BIOC_SDOFFLINE);
		else
			panic("%s: invalid target on wu: %p", DEVNAME(sc), wu);
	} else {
		ccb->ccb_state = SR_CCB_OK;
		wu->swu_ios_succeeded++;

		/* XOR data to result */
		if (pq) {
			if (pq->pbuf)
				/* Calculate xor-parity */
				sr_raid6_xorp(pq->pbuf, ccb->ccb_buf.b_data,
				    ccb->ccb_buf.b_bcount);
			if (pq->qbuf)
				/* Calculate q-parity */
				sr_raid6_xorq(pq->qbuf, ccb->ccb_buf.b_data,
				    ccb->ccb_buf.b_bcount, pq->gn);
			free(pq, M_DEVBUF);
			ccb->ccb_opaque = NULL;
		}
	}

	/* free allocated data buffer */
	if (ccb->ccb_flag & SR_CCBF_FREEBUF) {
		sr_put_block(sd, ccb->ccb_buf.b_data);
		ccb->ccb_buf.b_data = NULL;
	}
	wu->swu_ios_complete++;

	DNPRINTF(SR_D_INTR, "%s: sr_intr: comp: %d count: %d failed: %d\n",
	    DEVNAME(sc), wu->swu_ios_complete, wu->swu_io_count,
	    wu->swu_ios_failed);

	if (wu->swu_ios_complete >= wu->swu_io_count) {

		/* if all ios failed, retry reads and give up on writes */
		if (wu->swu_ios_failed == wu->swu_ios_complete) {
			if (xs->flags & SCSI_DATA_IN) {
				printf("%s: retrying read on block %lld\n",
				    DEVNAME(sc), ccb->ccb_buf.b_blkno);
				sr_ccb_put(ccb);
				TAILQ_INIT(&wu->swu_ccb);
				wu->swu_state = SR_WU_RESTART;
				if (sd->sd_scsi_rw(wu))
					goto bad;
				else
					goto retry;
			} else {
				printf("%s: permanently fail write on block "
				    "%lld\n", DEVNAME(sc),
				    ccb->ccb_buf.b_blkno);
				xs->error = XS_DRIVER_STUFFUP;
				goto bad;
			}
		}

		if (xs != NULL) {
			xs->error = XS_NOERROR;
			xs->resid = 0;
			xs->flags |= ITSDONE;
		}

		pend = 0;
		TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
			if (wu == wup) {
				/* wu on pendq, remove */
				TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
				pend = 1;

				if (wu->swu_collider) {
					if (wu->swu_ios_failed)
						/* toss all ccbs and recreate */
						sr_raid6_recreate_wu(wu->swu_collider);

					/* restart deferred wu */
					wu->swu_collider->swu_state =
					    SR_WU_INPROGRESS;
					TAILQ_REMOVE(&sd->sd_wu_defq,
					    wu->swu_collider, swu_link);
					if (sr_failio(wu->swu_collider) == 0)
						sr_raid_startwu(wu->swu_collider);
				}
				break;
			}
		}

		if (!pend)
			printf("%s: wu: %p not on pending queue\n",
			    DEVNAME(sc), wu);

		if (wu->swu_flags & SR_WUF_REBUILD) {
			if (wu->swu_xs->flags & SCSI_DATA_OUT) {
				wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
				wakeup(wu);
			}
		} else {
			/* do not change the order of these 2 functions */
			sr_wu_put(wu);
			if (xs != NULL)
				scsi_done(xs);
		}

		if (sd->sd_sync && sd->sd_wu_pending == 0)
			wakeup(sd);
	}

retry:
	splx(s);
	return;
bad:
	xs->error = XS_DRIVER_STUFFUP;
	xs->flags |= ITSDONE;
	if (wu->swu_flags & SR_WUF_REBUILD) {
		wu->swu_flags |= SR_WUF_REBUILDIOCOMP;
		wakeup(wu);
	} else {
		/* do not change the order of these 2 functions */
		sr_wu_put(wu);
		scsi_done(xs);
	}

	splx(s);
}

void
sr_raid6_recreate_wu(struct sr_workunit *wu)
{
	struct sr_discipline *sd = wu->swu_dis;
	struct sr_workunit *wup = wu;
	struct sr_ccb *ccb;

	do {
		DNPRINTF(SR_D_INTR, "%s: sr_raid6_recreate_wu: %p\n",
		    DEVNAME(sd->sd_sc), wup);

		/* toss all ccbs */
		while ((ccb = TAILQ_FIRST(&wup->swu_ccb)) != NULL) {
			TAILQ_REMOVE(&wup->swu_ccb, ccb, ccb_link);
			sr_ccb_put(ccb);
		}
		TAILQ_INIT(&wup->swu_ccb);

		/* recreate ccbs */
		wup->swu_state = SR_WU_REQUEUE;
		if (sd->sd_scsi_rw(wup))
			panic("could not requeue io");

		wup = wup->swu_collider;
	} while (wup);
}

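/*
 * Editorial note (added comment): sr_raid6_addio() queues one chunk I/O on a
 * work unit.  When pbuf and/or qbuf are given, the completed buffer is not
 * handed back to the caller directly; sr_raid6_intr() XORs it into pbuf and
 * XORs gn times it (via the gf_map lookup table) into qbuf, which is how the
 * parity expressions described above are accumulated.
 */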
"read" : "write", 918 dsk, blk, len, pbuf, qbuf); 919 920 ccb->ccb_flag = ccbflag; 921 if (flag & SCSI_POLL) { 922 ccb->ccb_buf.b_flags = 0; 923 ccb->ccb_buf.b_iodone = NULL; 924 } else { 925 ccb->ccb_buf.b_flags = B_CALL; 926 ccb->ccb_buf.b_iodone = sr_raid6_intr; 927 } 928 if (flag & SCSI_DATA_IN) 929 ccb->ccb_buf.b_flags |= B_READ; 930 else 931 ccb->ccb_buf.b_flags |= B_WRITE; 932 933 /* add offset for metadata */ 934 ccb->ccb_buf.b_flags |= B_PHYS; 935 ccb->ccb_buf.b_blkno = blk; 936 ccb->ccb_buf.b_bcount = len; 937 ccb->ccb_buf.b_bufsize = len; 938 ccb->ccb_buf.b_resid = len; 939 ccb->ccb_buf.b_data = data; 940 ccb->ccb_buf.b_error = 0; 941 ccb->ccb_buf.b_proc = curproc; 942 ccb->ccb_buf.b_dev = sd->sd_vol.sv_chunks[dsk]->src_dev_mm; 943 ccb->ccb_buf.b_vp = sd->sd_vol.sv_chunks[dsk]->src_vn; 944 if ((ccb->ccb_buf.b_flags & B_READ) == 0) 945 ccb->ccb_buf.b_vp->v_numoutput++; 946 947 ccb->ccb_wu = wu; 948 ccb->ccb_target = dsk; 949 if (pbuf || qbuf) { 950 if (qbuf && gf_premul(gn)) 951 return (-1); 952 953 pqbuf = malloc(sizeof(struct sr_raid6_opaque), M_DEVBUF, M_CANFAIL); 954 if (pqbuf == NULL) { 955 sr_ccb_put(ccb); 956 return (-1); 957 } 958 pqbuf->pbuf = pbuf; 959 pqbuf->qbuf = qbuf; 960 pqbuf->gn = gn; 961 ccb->ccb_opaque = pqbuf; 962 } 963 964 LIST_INIT(&ccb->ccb_buf.b_dep); 965 TAILQ_INSERT_TAIL(&wu->swu_ccb, ccb, ccb_link); 966 967 DNPRINTF(SR_D_DIS, "%s: %s: sr_raid6: b_bcount: %d " 968 "b_blkno: %x b_flags 0x%0x b_data %p\n", 969 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 970 ccb->ccb_buf.b_bcount, ccb->ccb_buf.b_blkno, 971 ccb->ccb_buf.b_flags, ccb->ccb_buf.b_data); 972 973 wu->swu_io_count++; 974 975 return (0); 976 } 977 978 /* Perform RAID6 parity calculation. 979 * P=xor parity, Q=GF256 parity, D=data, gn=disk# */ 980 void 981 sr_raid6_xorp(void *p, void *d, int len) 982 { 983 uint8_t *pbuf = p, *data = d; 984 985 while (len--) 986 pbuf[len] ^= data[len]; 987 } 988 989 void 990 sr_raid6_xorq(void *q, void *d, int len, int gn) 991 { 992 uint8_t *qbuf = q, *data = d; 993 uint8_t *gn_map = gf_map[gn]; 994 995 /* Have to do this a byte at a time */ 996 /* Faster multiply.. gn is always constant */ 997 while (len--) 998 qbuf[len] ^= gn_map[data[len]]; 999 } 1000 1001 /* Create GF256 log/pow tables: polynomial = 0x11D */ 1002 void 1003 gf_init(void) 1004 { 1005 int i; 1006 uint8_t p = 1; 1007 1008 /* use 2N pow table to avoid using % in multiply */ 1009 for (i=0; i<256; i++) { 1010 gf_log[p] = i; 1011 gf_pow[i] = gf_pow[i+255] = p; 1012 p = ((p << 1) ^ ((p & 0x80) ? 0x1D : 0x00)); 1013 } 1014 gf_log[0] = 512; 1015 } 1016 1017 uint8_t 1018 gf_inv(uint8_t a) 1019 { 1020 return gf_pow[255 - gf_log[a]]; 1021 } 1022 1023 /* Precalculate multiplication tables for drive gn */ 1024 int 1025 gf_premul(uint8_t gn) 1026 { 1027 int i; 1028 1029 if (gf_map[gn] != NULL) 1030 return (0); 1031 1032 if ((gf_map[gn] = malloc(256, M_DEVBUF, M_CANFAIL)) == NULL) 1033 return (-1); 1034 1035 for (i=0; i<256; i++) 1036 gf_map[gn][i] = gf_pow[gf_log[i] + gf_log[gn]]; 1037 return (0); 1038 } 1039