1 /* $OpenBSD: softraid_raid5.c,v 1.26 2016/05/31 15:19:12 jsing Exp $ */ 2 /* 3 * Copyright (c) 2014 Joel Sing <jsing@openbsd.org> 4 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 5 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20 #include "bio.h" 21 22 #include <sys/param.h> 23 #include <sys/systm.h> 24 #include <sys/buf.h> 25 #include <sys/device.h> 26 #include <sys/ioctl.h> 27 #include <sys/malloc.h> 28 #include <sys/kernel.h> 29 #include <sys/disk.h> 30 #include <sys/rwlock.h> 31 #include <sys/queue.h> 32 #include <sys/fcntl.h> 33 #include <sys/mount.h> 34 #include <sys/sensors.h> 35 #include <sys/stat.h> 36 #include <sys/task.h> 37 #include <sys/pool.h> 38 #include <sys/conf.h> 39 #include <sys/uio.h> 40 41 #include <scsi/scsi_all.h> 42 #include <scsi/scsiconf.h> 43 #include <scsi/scsi_disk.h> 44 45 #include <dev/softraidvar.h> 46 47 /* RAID 5 functions. */ 48 int sr_raid5_create(struct sr_discipline *, struct bioc_createraid *, 49 int, int64_t); 50 int sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *, 51 int, void *); 52 int sr_raid5_init(struct sr_discipline *); 53 int sr_raid5_rw(struct sr_workunit *); 54 int sr_raid5_openings(struct sr_discipline *); 55 void sr_raid5_intr(struct buf *); 56 int sr_raid5_wu_done(struct sr_workunit *); 57 void sr_raid5_set_chunk_state(struct sr_discipline *, int, int); 58 void sr_raid5_set_vol_state(struct sr_discipline *); 59 60 int sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long, 61 void *, int, int, void *); 62 int sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long, 63 void *); 64 int sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int, 65 daddr_t, long, void *, int, int); 66 void sr_raid5_xor(void *, void *, int); 67 68 void sr_raid5_rebuild(struct sr_discipline *); 69 void sr_raid5_scrub(struct sr_discipline *); 70 71 /* discipline initialisation. */ 72 void 73 sr_raid5_discipline_init(struct sr_discipline *sd) 74 { 75 /* Fill out discipline members. */ 76 sd->sd_type = SR_MD_RAID5; 77 strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name)); 78 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE | 79 SR_CAP_REBUILD | SR_CAP_REDUNDANT; 80 sd->sd_max_ccb_per_wu = 4; /* only if stripsize <= MAXPHYS */ 81 sd->sd_max_wu = SR_RAID5_NOWU + 2; /* Two for scrub/rebuild. */ 82 83 /* Setup discipline specific function pointers. */ 84 sd->sd_assemble = sr_raid5_assemble; 85 sd->sd_create = sr_raid5_create; 86 sd->sd_openings = sr_raid5_openings; 87 sd->sd_rebuild = sr_raid5_rebuild; 88 sd->sd_scsi_rw = sr_raid5_rw; 89 sd->sd_scsi_intr = sr_raid5_intr; 90 sd->sd_scsi_wu_done = sr_raid5_wu_done; 91 sd->sd_set_chunk_state = sr_raid5_set_chunk_state; 92 sd->sd_set_vol_state = sr_raid5_set_vol_state; 93 } 94 95 int 96 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc, 97 int no_chunk, int64_t coerced_size) 98 { 99 if (no_chunk < 3) { 100 sr_error(sd->sd_sc, "%s requires three or more chunks", 101 sd->sd_name); 102 return EINVAL; 103 } 104 105 /* 106 * XXX add variable strip size later even though MAXPHYS is really 107 * the clever value, users like to tinker with that type of stuff. 108 */ 109 sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS; 110 sd->sd_meta->ssdi.ssd_size = (coerced_size & 111 ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >> 112 DEV_BSHIFT) - 1)) * (no_chunk - 1); 113 114 return sr_raid5_init(sd); 115 } 116 117 int 118 sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc, 119 int no_chunk, void *data) 120 { 121 return sr_raid5_init(sd); 122 } 123 124 int 125 sr_raid5_init(struct sr_discipline *sd) 126 { 127 /* Initialise runtime values. */ 128 sd->mds.mdd_raid5.sr5_strip_bits = 129 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 130 if (sd->mds.mdd_raid5.sr5_strip_bits == -1) { 131 sr_error(sd->sd_sc, "invalid strip size"); 132 return EINVAL; 133 } 134 135 return 0; 136 } 137 138 int 139 sr_raid5_openings(struct sr_discipline *sd) 140 { 141 /* Two work units per I/O, two for rebuild/scrub. */ 142 return ((sd->sd_max_wu - 2) >> 1); 143 } 144 145 void 146 sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 147 { 148 int old_state, s; 149 150 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 151 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 152 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 153 154 /* ok to go to splbio since this only happens in error path */ 155 s = splbio(); 156 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 157 158 /* multiple IOs to the same chunk that fail will come through here */ 159 if (old_state == new_state) 160 goto done; 161 162 switch (old_state) { 163 case BIOC_SDONLINE: 164 switch (new_state) { 165 case BIOC_SDOFFLINE: 166 case BIOC_SDSCRUB: 167 break; 168 default: 169 goto die; 170 } 171 break; 172 173 case BIOC_SDOFFLINE: 174 if (new_state == BIOC_SDREBUILD) { 175 ; 176 } else 177 goto die; 178 break; 179 180 case BIOC_SDSCRUB: 181 switch (new_state) { 182 case BIOC_SDONLINE: 183 case BIOC_SDOFFLINE: 184 break; 185 default: 186 goto die; 187 } 188 break; 189 190 case BIOC_SDREBUILD: 191 switch (new_state) { 192 case BIOC_SDONLINE: 193 case BIOC_SDOFFLINE: 194 break; 195 default: 196 goto die; 197 } 198 break; 199 200 default: 201 die: 202 splx(s); /* XXX */ 203 panic("%s: %s: %s: invalid chunk state transition " 204 "%d -> %d", DEVNAME(sd->sd_sc), 205 sd->sd_meta->ssd_devname, 206 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 207 old_state, new_state); 208 /* NOTREACHED */ 209 } 210 211 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 212 sd->sd_set_vol_state(sd); 213 214 sd->sd_must_flush = 1; 215 task_add(systq, &sd->sd_meta_save_task); 216 done: 217 splx(s); 218 } 219 220 void 221 sr_raid5_set_vol_state(struct sr_discipline *sd) 222 { 223 int states[SR_MAX_STATES]; 224 int new_state, i, s, nd; 225 int old_state = sd->sd_vol_status; 226 227 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 228 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 229 230 nd = sd->sd_meta->ssdi.ssd_chunk_no; 231 232 for (i = 0; i < SR_MAX_STATES; i++) 233 states[i] = 0; 234 235 for (i = 0; i < nd; i++) { 236 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 237 if (s >= SR_MAX_STATES) 238 panic("%s: %s: %s: invalid chunk state", 239 DEVNAME(sd->sd_sc), 240 sd->sd_meta->ssd_devname, 241 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 242 states[s]++; 243 } 244 245 if (states[BIOC_SDONLINE] == nd) 246 new_state = BIOC_SVONLINE; 247 else if (states[BIOC_SDONLINE] < nd - 1) 248 new_state = BIOC_SVOFFLINE; 249 else if (states[BIOC_SDSCRUB] != 0) 250 new_state = BIOC_SVSCRUB; 251 else if (states[BIOC_SDREBUILD] != 0) 252 new_state = BIOC_SVREBUILD; 253 else if (states[BIOC_SDONLINE] == nd - 1) 254 new_state = BIOC_SVDEGRADED; 255 else { 256 #ifdef SR_DEBUG 257 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state " 258 "was %d\n", DEVNAME(sd->sd_sc), old_state); 259 for (i = 0; i < nd; i++) 260 DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n", 261 DEVNAME(sd->sd_sc), i, 262 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 263 #endif 264 panic("invalid volume state"); 265 } 266 267 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n", 268 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 269 old_state, new_state); 270 271 switch (old_state) { 272 case BIOC_SVONLINE: 273 switch (new_state) { 274 case BIOC_SVONLINE: /* can go to same state */ 275 case BIOC_SVOFFLINE: 276 case BIOC_SVDEGRADED: 277 case BIOC_SVREBUILD: /* happens on boot */ 278 break; 279 default: 280 goto die; 281 } 282 break; 283 284 case BIOC_SVOFFLINE: 285 /* XXX this might be a little too much */ 286 goto die; 287 288 case BIOC_SVDEGRADED: 289 switch (new_state) { 290 case BIOC_SVOFFLINE: 291 case BIOC_SVREBUILD: 292 case BIOC_SVDEGRADED: /* can go to the same state */ 293 break; 294 default: 295 goto die; 296 } 297 break; 298 299 case BIOC_SVBUILDING: 300 switch (new_state) { 301 case BIOC_SVONLINE: 302 case BIOC_SVOFFLINE: 303 case BIOC_SVBUILDING: /* can go to the same state */ 304 break; 305 default: 306 goto die; 307 } 308 break; 309 310 case BIOC_SVSCRUB: 311 switch (new_state) { 312 case BIOC_SVONLINE: 313 case BIOC_SVOFFLINE: 314 case BIOC_SVDEGRADED: 315 case BIOC_SVSCRUB: /* can go to same state */ 316 break; 317 default: 318 goto die; 319 } 320 break; 321 322 case BIOC_SVREBUILD: 323 switch (new_state) { 324 case BIOC_SVONLINE: 325 case BIOC_SVOFFLINE: 326 case BIOC_SVDEGRADED: 327 case BIOC_SVREBUILD: /* can go to the same state */ 328 break; 329 default: 330 goto die; 331 } 332 break; 333 334 default: 335 die: 336 panic("%s: %s: invalid volume state transition %d -> %d", 337 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 338 old_state, new_state); 339 /* NOTREACHED */ 340 } 341 342 sd->sd_vol_status = new_state; 343 } 344 345 static inline int 346 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk) 347 { 348 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 349 case BIOC_SDONLINE: 350 case BIOC_SDSCRUB: 351 return 1; 352 default: 353 return 0; 354 } 355 } 356 357 static inline int 358 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk) 359 { 360 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 361 case BIOC_SDREBUILD: 362 return 1; 363 default: 364 return 0; 365 } 366 } 367 368 int 369 sr_raid5_rw(struct sr_workunit *wu) 370 { 371 struct sr_workunit *wu_r = NULL; 372 struct sr_discipline *sd = wu->swu_dis; 373 struct scsi_xfer *xs = wu->swu_xs; 374 struct sr_chunk *scp; 375 daddr_t blkno, lba; 376 int64_t chunk_offs, lbaoffs, offset, strip_offs; 377 int64_t strip_bits, strip_no, strip_size; 378 int64_t chunk, no_chunk; 379 int64_t parity, row_size; 380 long length, datalen; 381 void *data; 382 int s; 383 384 /* blkno and scsi error will be handled by sr_validate_io */ 385 if (sr_validate_io(wu, &blkno, "sr_raid5_rw")) 386 goto bad; 387 388 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n", 389 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 390 (xs->flags & SCSI_DATA_IN) ? "read" : "write", 391 (long long)blkno, xs->datalen); 392 393 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 394 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 395 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 396 row_size = (no_chunk << strip_bits) >> DEV_BSHIFT; 397 398 data = xs->data; 399 datalen = xs->datalen; 400 lbaoffs = blkno << DEV_BSHIFT; 401 402 if (xs->flags & SCSI_DATA_OUT) { 403 if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){ 404 printf("%s: %s failed to get read work unit", 405 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 406 goto bad; 407 } 408 wu_r->swu_state = SR_WU_INPROGRESS; 409 wu_r->swu_flags |= SR_WUF_DISCIPLINE; 410 } 411 412 wu->swu_blk_start = 0; 413 while (datalen != 0) { 414 strip_no = lbaoffs >> strip_bits; 415 strip_offs = lbaoffs & (strip_size - 1); 416 chunk_offs = (strip_no / no_chunk) << strip_bits; 417 offset = chunk_offs + strip_offs; 418 419 /* get size remaining in this stripe */ 420 length = MIN(strip_size - strip_offs, datalen); 421 422 /* 423 * Map disk offset to data and parity chunks, using a left 424 * asymmetric algorithm for the parity assignment. 425 */ 426 chunk = strip_no % no_chunk; 427 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 428 if (chunk >= parity) 429 chunk++; 430 431 lba = offset >> DEV_BSHIFT; 432 433 /* XXX big hammer.. exclude I/O from entire stripe */ 434 if (wu->swu_blk_start == 0) 435 wu->swu_blk_start = (strip_no / no_chunk) * row_size; 436 wu->swu_blk_end = (strip_no / no_chunk) * row_size + 437 (row_size - 1); 438 439 scp = sd->sd_vol.sv_chunks[chunk]; 440 if (xs->flags & SCSI_DATA_IN) { 441 switch (scp->src_meta.scm_status) { 442 case BIOC_SDONLINE: 443 case BIOC_SDSCRUB: 444 /* 445 * Chunk is online, issue a single read 446 * request. 447 */ 448 if (sr_raid5_addio(wu, chunk, lba, length, 449 data, xs->flags, 0, NULL)) 450 goto bad; 451 break; 452 case BIOC_SDOFFLINE: 453 case BIOC_SDREBUILD: 454 case BIOC_SDHOTSPARE: 455 if (sr_raid5_regenerate(wu, chunk, lba, 456 length, data)) 457 goto bad; 458 break; 459 default: 460 printf("%s: is offline, can't read\n", 461 DEVNAME(sd->sd_sc)); 462 goto bad; 463 } 464 } else { 465 if (sr_raid5_write(wu, wu_r, chunk, parity, lba, 466 length, data, xs->flags, 0)) 467 goto bad; 468 } 469 470 /* advance to next block */ 471 lbaoffs += length; 472 datalen -= length; 473 data += length; 474 } 475 476 s = splbio(); 477 if (wu_r) { 478 if (wu_r->swu_io_count > 0) { 479 /* collide write request with reads */ 480 wu_r->swu_blk_start = wu->swu_blk_start; 481 wu_r->swu_blk_end = wu->swu_blk_end; 482 483 wu->swu_state = SR_WU_DEFERRED; 484 wu_r->swu_collider = wu; 485 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link); 486 487 wu = wu_r; 488 } else { 489 sr_scsi_wu_put(sd, wu_r); 490 } 491 } 492 splx(s); 493 494 sr_schedule_wu(wu); 495 496 return (0); 497 498 bad: 499 /* wu is unwound by sr_wu_put */ 500 if (wu_r) 501 sr_scsi_wu_put(sd, wu_r); 502 return (1); 503 } 504 505 int 506 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno, 507 long len, void *data) 508 { 509 struct sr_discipline *sd = wu->swu_dis; 510 int i; 511 512 /* 513 * Regenerate a block on a RAID 5 volume by xoring the data and parity 514 * from all of the remaining online chunks. This requires the parity 515 * to already be correct. 516 */ 517 518 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, " 519 "regenerating block %llu\n", 520 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno); 521 522 memset(data, 0, len); 523 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 524 if (i == chunk) 525 continue; 526 if (!sr_raid5_chunk_online(sd, i)) 527 goto bad; 528 if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN, 529 0, data)) 530 goto bad; 531 } 532 return (0); 533 534 bad: 535 return (1); 536 } 537 538 int 539 sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk, 540 int parity, daddr_t blkno, long len, void *data, int xsflags, 541 int ccbflags) 542 { 543 struct sr_discipline *sd = wu->swu_dis; 544 struct scsi_xfer *xs = wu->swu_xs; 545 void *xorbuf; 546 int chunk_online, chunk_rebuild; 547 int parity_online, parity_rebuild; 548 int other_offline = 0, other_rebuild = 0; 549 int i; 550 551 /* 552 * Perform a write to a RAID 5 volume. This write routine does not 553 * require the parity to already be correct and will operate on a 554 * uninitialised volume. 555 * 556 * There are four possible cases: 557 * 558 * 1) All data chunks and parity are online. In this case we read the 559 * data from all data chunks, except the one we are writing to, in 560 * order to calculate and write the new parity. 561 * 562 * 2) The parity chunk is offline. In this case we only need to write 563 * to the data chunk. No parity calculation is required. 564 * 565 * 3) The data chunk is offline. In this case we read the data from all 566 * online chunks in order to calculate and write the new parity. 567 * This is the same as (1) except we do not write the data chunk. 568 * 569 * 4) A different data chunk is offline. The new parity is calculated 570 * by taking the existing parity, xoring the original data and 571 * xoring in the new data. This requires that the parity already be 572 * correct, which it will be if any of the data chunks has 573 * previously been written. 574 * 575 * There is an additional complication introduced by a chunk that is 576 * being rebuilt. If this is the data or parity chunk, then we want 577 * to write to it as per normal. If it is another data chunk then we 578 * need to presume that it has not yet been regenerated and use the 579 * same method as detailed in (4) above. 580 */ 581 582 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i " 583 "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 584 chunk, parity, (unsigned long long)blkno); 585 586 chunk_online = sr_raid5_chunk_online(sd, chunk); 587 chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk); 588 parity_online = sr_raid5_chunk_online(sd, parity); 589 parity_rebuild = sr_raid5_chunk_rebuild(sd, parity); 590 591 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 592 if (i == chunk || i == parity) 593 continue; 594 if (sr_raid5_chunk_rebuild(sd, i)) 595 other_rebuild = 1; 596 else if (!sr_raid5_chunk_online(sd, i)) 597 other_offline = 1; 598 } 599 600 DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, " 601 "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 602 chunk_online, parity_online, other_offline); 603 604 if (!parity_online && !parity_rebuild) 605 goto data_write; 606 607 xorbuf = sr_block_get(sd, len); 608 if (xorbuf == NULL) 609 goto bad; 610 memcpy(xorbuf, data, len); 611 612 if (other_offline || other_rebuild) { 613 614 /* 615 * XXX - If we can guarantee that this LBA has been scrubbed 616 * then we can also take this faster path. 617 */ 618 619 /* Read in existing data and existing parity. */ 620 if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL, 621 SCSI_DATA_IN, 0, xorbuf)) 622 goto bad; 623 if (sr_raid5_addio(wu_r, parity, blkno, len, NULL, 624 SCSI_DATA_IN, 0, xorbuf)) 625 goto bad; 626 627 } else { 628 629 /* Read in existing data from all other chunks. */ 630 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 631 if (i == chunk || i == parity) 632 continue; 633 if (sr_raid5_addio(wu_r, i, blkno, len, NULL, 634 SCSI_DATA_IN, 0, xorbuf)) 635 goto bad; 636 } 637 638 } 639 640 /* Write new parity. */ 641 if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags, 642 SR_CCBF_FREEBUF, NULL)) 643 goto bad; 644 645 data_write: 646 /* Write new data. */ 647 if (chunk_online || chunk_rebuild) 648 if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags, 649 0, NULL)) 650 goto bad; 651 652 return (0); 653 654 bad: 655 return (1); 656 } 657 658 void 659 sr_raid5_intr(struct buf *bp) 660 { 661 struct sr_ccb *ccb = (struct sr_ccb *)bp; 662 struct sr_workunit *wu = ccb->ccb_wu; 663 struct sr_discipline *sd = wu->swu_dis; 664 int s; 665 666 DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n", 667 DEVNAME(sd->sd_sc), bp, wu->swu_xs); 668 669 s = splbio(); 670 sr_ccb_done(ccb); 671 672 /* XXX - Should this be done via the taskq? */ 673 674 /* XOR data to result. */ 675 if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque) 676 sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data, 677 ccb->ccb_buf.b_bcount); 678 679 /* Free allocated data buffer. */ 680 if (ccb->ccb_flags & SR_CCBF_FREEBUF) { 681 sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount); 682 ccb->ccb_buf.b_data = NULL; 683 } 684 685 sr_wu_done(wu); 686 splx(s); 687 } 688 689 int 690 sr_raid5_wu_done(struct sr_workunit *wu) 691 { 692 struct sr_discipline *sd = wu->swu_dis; 693 struct scsi_xfer *xs = wu->swu_xs; 694 695 /* XXX - we have no way of propagating errors... */ 696 if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD)) 697 return SR_WU_OK; 698 699 /* XXX - This is insufficient for RAID 5. */ 700 if (wu->swu_ios_succeeded > 0) { 701 xs->error = XS_NOERROR; 702 return SR_WU_OK; 703 } 704 705 if (xs->flags & SCSI_DATA_IN) { 706 printf("%s: retrying read on block %lld\n", 707 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 708 sr_wu_release_ccbs(wu); 709 wu->swu_state = SR_WU_RESTART; 710 if (sd->sd_scsi_rw(wu) == 0) 711 return SR_WU_RESTART; 712 } else { 713 /* XXX - retry write if we just went from online to degraded. */ 714 printf("%s: permanently fail write on block %lld\n", 715 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 716 } 717 718 wu->swu_state = SR_WU_FAILED; 719 xs->error = XS_DRIVER_STUFFUP; 720 721 return SR_WU_FAILED; 722 } 723 724 int 725 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno, 726 long len, void *data, int xsflags, int ccbflags, void *xorbuf) 727 { 728 struct sr_discipline *sd = wu->swu_dis; 729 struct sr_ccb *ccb; 730 731 DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld " 732 "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write", 733 chunk, (long long)blkno, len, xorbuf ? "X0R" : "-"); 734 735 /* Allocate temporary buffer. */ 736 if (data == NULL) { 737 data = sr_block_get(sd, len); 738 if (data == NULL) 739 return (-1); 740 ccbflags |= SR_CCBF_FREEBUF; 741 } 742 743 ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags); 744 if (ccb == NULL) { 745 if (ccbflags & SR_CCBF_FREEBUF) 746 sr_block_put(sd, data, len); 747 return (-1); 748 } 749 ccb->ccb_opaque = xorbuf; 750 sr_wu_enqueue_ccb(wu, ccb); 751 752 return (0); 753 } 754 755 void 756 sr_raid5_xor(void *a, void *b, int len) 757 { 758 uint32_t *xa = a, *xb = b; 759 760 len >>= 2; 761 while (len--) 762 *xa++ ^= *xb++; 763 } 764 765 void 766 sr_raid5_rebuild(struct sr_discipline *sd) 767 { 768 int64_t strip_no, strip_size, strip_bits, i, restart; 769 int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size; 770 struct sr_workunit *wu_r, *wu_w; 771 int s, slept, percent = 0, old_percent = -1; 772 int rebuild_chunk = -1; 773 void *xorbuf; 774 775 /* Find the rebuild chunk. */ 776 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 777 if (sr_raid5_chunk_rebuild(sd, i)) { 778 rebuild_chunk = i; 779 break; 780 } 781 } 782 if (rebuild_chunk == -1) 783 goto bad; 784 785 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 786 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 787 chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1; 788 chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count; 789 chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits; 790 row_size = (chunk_count << strip_bits) >> DEV_BSHIFT; 791 792 DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, " 793 "chunk count = %lld, chunk size = %lld, chunk strips = %lld, " 794 "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 795 sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips, 796 row_size); 797 798 restart = sd->sd_meta->ssd_rebuild / row_size; 799 if (restart > chunk_strips) { 800 printf("%s: bogus rebuild restart offset, starting from 0\n", 801 DEVNAME(sd->sd_sc)); 802 restart = 0; 803 } 804 if (restart != 0) { 805 percent = sr_rebuild_percent(sd); 806 printf("%s: resuming rebuild on %s at %d%%\n", 807 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent); 808 } 809 810 for (strip_no = restart; strip_no < chunk_strips; strip_no++) { 811 chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no; 812 813 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, " 814 "chunk lba = %lld\n", DEVNAME(sd->sd_sc), 815 sd->sd_meta->ssd_devname, strip_no, chunk_lba); 816 817 wu_w = sr_scsi_wu_get(sd, 0); 818 wu_r = sr_scsi_wu_get(sd, 0); 819 820 xorbuf = sr_block_get(sd, strip_size); 821 if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba, 822 strip_size, xorbuf)) 823 goto bad; 824 if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size, 825 xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL)) 826 goto bad; 827 828 /* Collide write work unit with read work unit. */ 829 wu_r->swu_state = SR_WU_INPROGRESS; 830 wu_r->swu_flags |= SR_WUF_REBUILD; 831 wu_w->swu_state = SR_WU_DEFERRED; 832 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 833 wu_r->swu_collider = wu_w; 834 835 /* Block I/O to this strip while we rebuild it. */ 836 wu_r->swu_blk_start = (strip_no / chunk_count) * row_size; 837 wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1; 838 wu_w->swu_blk_start = wu_r->swu_blk_start; 839 wu_w->swu_blk_end = wu_r->swu_blk_end; 840 841 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, " 842 "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc), 843 sd->sd_meta->ssd_devname, 844 wu_r->swu_blk_start, wu_r->swu_blk_end); 845 846 s = splbio(); 847 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 848 splx(s); 849 850 sr_schedule_wu(wu_r); 851 852 slept = 0; 853 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 854 tsleep(wu_w, PRIBIO, "sr_rebuild", 0); 855 slept = 1; 856 } 857 if (!slept) 858 tsleep(sd->sd_sc, PWAIT, "sr_yield", 1); 859 860 sr_scsi_wu_put(sd, wu_r); 861 sr_scsi_wu_put(sd, wu_w); 862 863 sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count; 864 865 percent = sr_rebuild_percent(sd); 866 if (percent != old_percent && strip_no != chunk_strips - 1) { 867 if (sr_meta_save(sd, SR_META_DIRTY)) 868 printf("%s: could not save metadata to %s\n", 869 DEVNAME(sd->sd_sc), 870 sd->sd_meta->ssd_devname); 871 old_percent = percent; 872 } 873 874 if (sd->sd_reb_abort) 875 goto abort; 876 } 877 878 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc), 879 sd->sd_meta->ssd_devname); 880 881 /* all done */ 882 sd->sd_meta->ssd_rebuild = 0; 883 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 884 if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status == 885 BIOC_SDREBUILD) { 886 sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE); 887 break; 888 } 889 } 890 891 return; 892 893 abort: 894 if (sr_meta_save(sd, SR_META_DIRTY)) 895 printf("%s: could not save metadata to %s\n", 896 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 897 bad: 898 return; 899 } 900 901 #if 0 902 void 903 sr_raid5_scrub(struct sr_discipline *sd) 904 { 905 int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits; 906 int64_t i; 907 struct sr_workunit *wu_r, *wu_w; 908 int s, slept; 909 void *xorbuf; 910 911 wu_w = sr_scsi_wu_get(sd, 0); 912 wu_r = sr_scsi_wu_get(sd, 0); 913 914 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 915 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 916 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 917 max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits; 918 919 for (strip_no = 0; strip_no < max_strip; strip_no++) { 920 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 921 922 xorbuf = sr_block_get(sd, strip_size); 923 for (i = 0; i <= no_chunk; i++) { 924 if (i != parity) 925 sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size, 926 NULL, SCSI_DATA_IN, 0, xorbuf); 927 } 928 sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf, 929 SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL); 930 931 wu_r->swu_flags |= SR_WUF_REBUILD; 932 933 /* Collide wu_w with wu_r */ 934 wu_w->swu_state = SR_WU_DEFERRED; 935 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 936 wu_r->swu_collider = wu_w; 937 938 s = splbio(); 939 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 940 splx(s); 941 942 wu_r->swu_state = SR_WU_INPROGRESS; 943 sr_schedule_wu(wu_r); 944 945 slept = 0; 946 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 947 tsleep(wu_w, PRIBIO, "sr_scrub", 0); 948 slept = 1; 949 } 950 if (!slept) 951 tsleep(sd->sd_sc, PWAIT, "sr_yield", 1); 952 } 953 done: 954 return; 955 } 956 #endif 957