1 /* $OpenBSD: softraid_raid5.c,v 1.32 2021/05/16 15:12:37 deraadt Exp $ */ 2 /* 3 * Copyright (c) 2014 Joel Sing <jsing@openbsd.org> 4 * Copyright (c) 2009 Marco Peereboom <marco@peereboom.us> 5 * Copyright (c) 2009 Jordan Hargrave <jordan@openbsd.org> 6 * 7 * Permission to use, copy, modify, and distribute this software for any 8 * purpose with or without fee is hereby granted, provided that the above 9 * copyright notice and this permission notice appear in all copies. 10 * 11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 */ 19 20 #include "bio.h" 21 22 #include <sys/param.h> 23 #include <sys/systm.h> 24 #include <sys/buf.h> 25 #include <sys/device.h> 26 #include <sys/ioctl.h> 27 #include <sys/malloc.h> 28 #include <sys/kernel.h> 29 #include <sys/disk.h> 30 #include <sys/rwlock.h> 31 #include <sys/queue.h> 32 #include <sys/fcntl.h> 33 #include <sys/mount.h> 34 #include <sys/sensors.h> 35 #include <sys/stat.h> 36 #include <sys/task.h> 37 #include <sys/pool.h> 38 #include <sys/conf.h> 39 #include <sys/uio.h> 40 41 #include <scsi/scsi_all.h> 42 #include <scsi/scsiconf.h> 43 #include <scsi/scsi_disk.h> 44 45 #include <dev/softraidvar.h> 46 47 /* RAID 5 functions. */ 48 int sr_raid5_create(struct sr_discipline *, struct bioc_createraid *, 49 int, int64_t); 50 int sr_raid5_assemble(struct sr_discipline *, struct bioc_createraid *, 51 int, void *); 52 int sr_raid5_init(struct sr_discipline *); 53 int sr_raid5_rw(struct sr_workunit *); 54 int sr_raid5_openings(struct sr_discipline *); 55 void sr_raid5_intr(struct buf *); 56 int sr_raid5_wu_done(struct sr_workunit *); 57 void sr_raid5_set_chunk_state(struct sr_discipline *, int, int); 58 void sr_raid5_set_vol_state(struct sr_discipline *); 59 60 int sr_raid5_addio(struct sr_workunit *wu, int, daddr_t, long, 61 void *, int, int, void *); 62 int sr_raid5_regenerate(struct sr_workunit *, int, daddr_t, long, 63 void *); 64 int sr_raid5_write(struct sr_workunit *, struct sr_workunit *, int, int, 65 daddr_t, long, void *, int, int); 66 void sr_raid5_xor(void *, void *, int); 67 68 void sr_raid5_rebuild(struct sr_discipline *); 69 void sr_raid5_scrub(struct sr_discipline *); 70 71 /* discipline initialisation. */ 72 void 73 sr_raid5_discipline_init(struct sr_discipline *sd) 74 { 75 /* Fill out discipline members. */ 76 sd->sd_type = SR_MD_RAID5; 77 strlcpy(sd->sd_name, "RAID 5", sizeof(sd->sd_name)); 78 sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE | 79 SR_CAP_REBUILD | SR_CAP_REDUNDANT; 80 sd->sd_max_wu = SR_RAID5_NOWU + 2; /* Two for scrub/rebuild. */ 81 82 /* Setup discipline specific function pointers. */ 83 sd->sd_assemble = sr_raid5_assemble; 84 sd->sd_create = sr_raid5_create; 85 sd->sd_openings = sr_raid5_openings; 86 sd->sd_rebuild = sr_raid5_rebuild; 87 sd->sd_scsi_rw = sr_raid5_rw; 88 sd->sd_scsi_intr = sr_raid5_intr; 89 sd->sd_scsi_wu_done = sr_raid5_wu_done; 90 sd->sd_set_chunk_state = sr_raid5_set_chunk_state; 91 sd->sd_set_vol_state = sr_raid5_set_vol_state; 92 } 93 94 int 95 sr_raid5_create(struct sr_discipline *sd, struct bioc_createraid *bc, 96 int no_chunk, int64_t coerced_size) 97 { 98 if (no_chunk < 3) { 99 sr_error(sd->sd_sc, "%s requires three or more chunks", 100 sd->sd_name); 101 return EINVAL; 102 } 103 104 /* 105 * XXX add variable strip size later even though MAXPHYS is really 106 * the clever value, users like to tinker with that type of stuff. 107 */ 108 sd->sd_meta->ssdi.ssd_strip_size = MAXPHYS; 109 sd->sd_meta->ssdi.ssd_size = (coerced_size & 110 ~(((u_int64_t)sd->sd_meta->ssdi.ssd_strip_size >> 111 DEV_BSHIFT) - 1)) * (no_chunk - 1); 112 113 return sr_raid5_init(sd); 114 } 115 116 int 117 sr_raid5_assemble(struct sr_discipline *sd, struct bioc_createraid *bc, 118 int no_chunk, void *data) 119 { 120 return sr_raid5_init(sd); 121 } 122 123 int 124 sr_raid5_init(struct sr_discipline *sd) 125 { 126 /* Initialise runtime values. */ 127 sd->mds.mdd_raid5.sr5_strip_bits = 128 sr_validate_stripsize(sd->sd_meta->ssdi.ssd_strip_size); 129 if (sd->mds.mdd_raid5.sr5_strip_bits == -1) { 130 sr_error(sd->sd_sc, "invalid strip size"); 131 return EINVAL; 132 } 133 134 sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no; 135 136 return 0; 137 } 138 139 int 140 sr_raid5_openings(struct sr_discipline *sd) 141 { 142 /* Two work units per I/O, two for rebuild/scrub. */ 143 return ((sd->sd_max_wu - 2) >> 1); 144 } 145 146 void 147 sr_raid5_set_chunk_state(struct sr_discipline *sd, int c, int new_state) 148 { 149 int old_state, s; 150 151 DNPRINTF(SR_D_STATE, "%s: %s: %s: sr_raid_set_chunk_state %d -> %d\n", 152 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 153 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, c, new_state); 154 155 /* ok to go to splbio since this only happens in error path */ 156 s = splbio(); 157 old_state = sd->sd_vol.sv_chunks[c]->src_meta.scm_status; 158 159 /* multiple IOs to the same chunk that fail will come through here */ 160 if (old_state == new_state) 161 goto done; 162 163 switch (old_state) { 164 case BIOC_SDONLINE: 165 switch (new_state) { 166 case BIOC_SDOFFLINE: 167 case BIOC_SDSCRUB: 168 break; 169 default: 170 goto die; 171 } 172 break; 173 174 case BIOC_SDOFFLINE: 175 if (new_state == BIOC_SDREBUILD) { 176 ; 177 } else 178 goto die; 179 break; 180 181 case BIOC_SDSCRUB: 182 switch (new_state) { 183 case BIOC_SDONLINE: 184 case BIOC_SDOFFLINE: 185 break; 186 default: 187 goto die; 188 } 189 break; 190 191 case BIOC_SDREBUILD: 192 switch (new_state) { 193 case BIOC_SDONLINE: 194 case BIOC_SDOFFLINE: 195 break; 196 default: 197 goto die; 198 } 199 break; 200 201 default: 202 die: 203 splx(s); /* XXX */ 204 panic("%s: %s: %s: invalid chunk state transition %d -> %d", 205 DEVNAME(sd->sd_sc), 206 sd->sd_meta->ssd_devname, 207 sd->sd_vol.sv_chunks[c]->src_meta.scmi.scm_devname, 208 old_state, new_state); 209 /* NOTREACHED */ 210 } 211 212 sd->sd_vol.sv_chunks[c]->src_meta.scm_status = new_state; 213 sd->sd_set_vol_state(sd); 214 215 sd->sd_must_flush = 1; 216 task_add(systq, &sd->sd_meta_save_task); 217 done: 218 splx(s); 219 } 220 221 void 222 sr_raid5_set_vol_state(struct sr_discipline *sd) 223 { 224 int states[SR_MAX_STATES]; 225 int new_state, i, s, nd; 226 int old_state = sd->sd_vol_status; 227 228 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid_set_vol_state\n", 229 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 230 231 nd = sd->sd_meta->ssdi.ssd_chunk_no; 232 233 for (i = 0; i < SR_MAX_STATES; i++) 234 states[i] = 0; 235 236 for (i = 0; i < nd; i++) { 237 s = sd->sd_vol.sv_chunks[i]->src_meta.scm_status; 238 if (s >= SR_MAX_STATES) 239 panic("%s: %s: %s: invalid chunk state", 240 DEVNAME(sd->sd_sc), 241 sd->sd_meta->ssd_devname, 242 sd->sd_vol.sv_chunks[i]->src_meta.scmi.scm_devname); 243 states[s]++; 244 } 245 246 if (states[BIOC_SDONLINE] == nd) 247 new_state = BIOC_SVONLINE; 248 else if (states[BIOC_SDONLINE] < nd - 1) 249 new_state = BIOC_SVOFFLINE; 250 else if (states[BIOC_SDSCRUB] != 0) 251 new_state = BIOC_SVSCRUB; 252 else if (states[BIOC_SDREBUILD] != 0) 253 new_state = BIOC_SVREBUILD; 254 else if (states[BIOC_SDONLINE] == nd - 1) 255 new_state = BIOC_SVDEGRADED; 256 else { 257 #ifdef SR_DEBUG 258 DNPRINTF(SR_D_STATE, "%s: invalid volume state, old state " 259 "was %d\n", DEVNAME(sd->sd_sc), old_state); 260 for (i = 0; i < nd; i++) 261 DNPRINTF(SR_D_STATE, "%s: chunk %d status = %d\n", 262 DEVNAME(sd->sd_sc), i, 263 sd->sd_vol.sv_chunks[i]->src_meta.scm_status); 264 #endif 265 panic("invalid volume state"); 266 } 267 268 DNPRINTF(SR_D_STATE, "%s: %s: sr_raid5_set_vol_state %d -> %d\n", 269 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 270 old_state, new_state); 271 272 switch (old_state) { 273 case BIOC_SVONLINE: 274 switch (new_state) { 275 case BIOC_SVONLINE: /* can go to same state */ 276 case BIOC_SVOFFLINE: 277 case BIOC_SVDEGRADED: 278 case BIOC_SVREBUILD: /* happens on boot */ 279 break; 280 default: 281 goto die; 282 } 283 break; 284 285 case BIOC_SVOFFLINE: 286 /* XXX this might be a little too much */ 287 goto die; 288 289 case BIOC_SVDEGRADED: 290 switch (new_state) { 291 case BIOC_SVOFFLINE: 292 case BIOC_SVREBUILD: 293 case BIOC_SVDEGRADED: /* can go to the same state */ 294 break; 295 default: 296 goto die; 297 } 298 break; 299 300 case BIOC_SVBUILDING: 301 switch (new_state) { 302 case BIOC_SVONLINE: 303 case BIOC_SVOFFLINE: 304 case BIOC_SVBUILDING: /* can go to the same state */ 305 break; 306 default: 307 goto die; 308 } 309 break; 310 311 case BIOC_SVSCRUB: 312 switch (new_state) { 313 case BIOC_SVONLINE: 314 case BIOC_SVOFFLINE: 315 case BIOC_SVDEGRADED: 316 case BIOC_SVSCRUB: /* can go to same state */ 317 break; 318 default: 319 goto die; 320 } 321 break; 322 323 case BIOC_SVREBUILD: 324 switch (new_state) { 325 case BIOC_SVONLINE: 326 case BIOC_SVOFFLINE: 327 case BIOC_SVDEGRADED: 328 case BIOC_SVREBUILD: /* can go to the same state */ 329 break; 330 default: 331 goto die; 332 } 333 break; 334 335 default: 336 die: 337 panic("%s: %s: invalid volume state transition %d -> %d", 338 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 339 old_state, new_state); 340 /* NOTREACHED */ 341 } 342 343 sd->sd_vol_status = new_state; 344 } 345 346 static inline int 347 sr_raid5_chunk_online(struct sr_discipline *sd, int chunk) 348 { 349 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 350 case BIOC_SDONLINE: 351 case BIOC_SDSCRUB: 352 return 1; 353 default: 354 return 0; 355 } 356 } 357 358 static inline int 359 sr_raid5_chunk_rebuild(struct sr_discipline *sd, int chunk) 360 { 361 switch (sd->sd_vol.sv_chunks[chunk]->src_meta.scm_status) { 362 case BIOC_SDREBUILD: 363 return 1; 364 default: 365 return 0; 366 } 367 } 368 369 int 370 sr_raid5_rw(struct sr_workunit *wu) 371 { 372 struct sr_workunit *wu_r = NULL; 373 struct sr_discipline *sd = wu->swu_dis; 374 struct scsi_xfer *xs = wu->swu_xs; 375 struct sr_chunk *scp; 376 daddr_t blkno, lba; 377 int64_t chunk_offs, lbaoffs, offset, strip_offs; 378 int64_t strip_bits, strip_no, strip_size; 379 int64_t chunk, no_chunk; 380 int64_t parity, row_size; 381 long length, datalen; 382 void *data; 383 int s; 384 385 /* blkno and scsi error will be handled by sr_validate_io */ 386 if (sr_validate_io(wu, &blkno, "sr_raid5_rw")) 387 goto bad; 388 389 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_rw %s: blkno %lld size %d\n", 390 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 391 (xs->flags & SCSI_DATA_IN) ? "read" : "write", 392 (long long)blkno, xs->datalen); 393 394 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 395 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 396 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 397 row_size = (no_chunk << strip_bits) >> DEV_BSHIFT; 398 399 data = xs->data; 400 datalen = xs->datalen; 401 lbaoffs = blkno << DEV_BSHIFT; 402 403 if (xs->flags & SCSI_DATA_OUT) { 404 if ((wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL){ 405 printf("%s: %s failed to get read work unit", 406 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 407 goto bad; 408 } 409 wu_r->swu_state = SR_WU_INPROGRESS; 410 wu_r->swu_flags |= SR_WUF_DISCIPLINE; 411 } 412 413 wu->swu_blk_start = 0; 414 while (datalen != 0) { 415 strip_no = lbaoffs >> strip_bits; 416 strip_offs = lbaoffs & (strip_size - 1); 417 chunk_offs = (strip_no / no_chunk) << strip_bits; 418 offset = chunk_offs + strip_offs; 419 420 /* get size remaining in this stripe */ 421 length = MIN(strip_size - strip_offs, datalen); 422 423 /* 424 * Map disk offset to data and parity chunks, using a left 425 * asymmetric algorithm for the parity assignment. 426 */ 427 chunk = strip_no % no_chunk; 428 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 429 if (chunk >= parity) 430 chunk++; 431 432 lba = offset >> DEV_BSHIFT; 433 434 /* XXX big hammer.. exclude I/O from entire stripe */ 435 if (wu->swu_blk_start == 0) 436 wu->swu_blk_start = (strip_no / no_chunk) * row_size; 437 wu->swu_blk_end = (strip_no / no_chunk) * row_size + 438 (row_size - 1); 439 440 scp = sd->sd_vol.sv_chunks[chunk]; 441 if (xs->flags & SCSI_DATA_IN) { 442 switch (scp->src_meta.scm_status) { 443 case BIOC_SDONLINE: 444 case BIOC_SDSCRUB: 445 /* 446 * Chunk is online, issue a single read 447 * request. 448 */ 449 if (sr_raid5_addio(wu, chunk, lba, length, 450 data, xs->flags, 0, NULL)) 451 goto bad; 452 break; 453 case BIOC_SDOFFLINE: 454 case BIOC_SDREBUILD: 455 case BIOC_SDHOTSPARE: 456 if (sr_raid5_regenerate(wu, chunk, lba, 457 length, data)) 458 goto bad; 459 break; 460 default: 461 printf("%s: is offline, can't read\n", 462 DEVNAME(sd->sd_sc)); 463 goto bad; 464 } 465 } else { 466 if (sr_raid5_write(wu, wu_r, chunk, parity, lba, 467 length, data, xs->flags, 0)) 468 goto bad; 469 } 470 471 /* advance to next block */ 472 lbaoffs += length; 473 datalen -= length; 474 data += length; 475 } 476 477 s = splbio(); 478 if (wu_r) { 479 if (wu_r->swu_io_count > 0) { 480 /* collide write request with reads */ 481 wu_r->swu_blk_start = wu->swu_blk_start; 482 wu_r->swu_blk_end = wu->swu_blk_end; 483 484 wu->swu_state = SR_WU_DEFERRED; 485 wu_r->swu_collider = wu; 486 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link); 487 488 wu = wu_r; 489 } else { 490 sr_scsi_wu_put(sd, wu_r); 491 } 492 } 493 splx(s); 494 495 sr_schedule_wu(wu); 496 497 return (0); 498 499 bad: 500 /* wu is unwound by sr_wu_put */ 501 if (wu_r) 502 sr_scsi_wu_put(sd, wu_r); 503 return (1); 504 } 505 506 int 507 sr_raid5_regenerate(struct sr_workunit *wu, int chunk, daddr_t blkno, 508 long len, void *data) 509 { 510 struct sr_discipline *sd = wu->swu_dis; 511 int i; 512 513 /* 514 * Regenerate a block on a RAID 5 volume by xoring the data and parity 515 * from all of the remaining online chunks. This requires the parity 516 * to already be correct. 517 */ 518 519 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_regenerate chunk %d offline, " 520 "regenerating block %llu\n", 521 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, chunk, blkno); 522 523 memset(data, 0, len); 524 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 525 if (i == chunk) 526 continue; 527 if (!sr_raid5_chunk_online(sd, i)) 528 goto bad; 529 if (sr_raid5_addio(wu, i, blkno, len, NULL, SCSI_DATA_IN, 530 0, data)) 531 goto bad; 532 } 533 return (0); 534 535 bad: 536 return (1); 537 } 538 539 int 540 sr_raid5_write(struct sr_workunit *wu, struct sr_workunit *wu_r, int chunk, 541 int parity, daddr_t blkno, long len, void *data, int xsflags, 542 int ccbflags) 543 { 544 struct sr_discipline *sd = wu->swu_dis; 545 struct scsi_xfer *xs = wu->swu_xs; 546 void *xorbuf; 547 int chunk_online, chunk_rebuild; 548 int parity_online, parity_rebuild; 549 int other_offline = 0, other_rebuild = 0; 550 int i; 551 552 /* 553 * Perform a write to a RAID 5 volume. This write routine does not 554 * require the parity to already be correct and will operate on a 555 * uninitialised volume. 556 * 557 * There are four possible cases: 558 * 559 * 1) All data chunks and parity are online. In this case we read the 560 * data from all data chunks, except the one we are writing to, in 561 * order to calculate and write the new parity. 562 * 563 * 2) The parity chunk is offline. In this case we only need to write 564 * to the data chunk. No parity calculation is required. 565 * 566 * 3) The data chunk is offline. In this case we read the data from all 567 * online chunks in order to calculate and write the new parity. 568 * This is the same as (1) except we do not write the data chunk. 569 * 570 * 4) A different data chunk is offline. The new parity is calculated 571 * by taking the existing parity, xoring the original data and 572 * xoring in the new data. This requires that the parity already be 573 * correct, which it will be if any of the data chunks has 574 * previously been written. 575 * 576 * There is an additional complication introduced by a chunk that is 577 * being rebuilt. If this is the data or parity chunk, then we want 578 * to write to it as per normal. If it is another data chunk then we 579 * need to presume that it has not yet been regenerated and use the 580 * same method as detailed in (4) above. 581 */ 582 583 DNPRINTF(SR_D_DIS, "%s: %s sr_raid5_write chunk %i parity %i " 584 "blkno %llu\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 585 chunk, parity, (unsigned long long)blkno); 586 587 chunk_online = sr_raid5_chunk_online(sd, chunk); 588 chunk_rebuild = sr_raid5_chunk_rebuild(sd, chunk); 589 parity_online = sr_raid5_chunk_online(sd, parity); 590 parity_rebuild = sr_raid5_chunk_rebuild(sd, parity); 591 592 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 593 if (i == chunk || i == parity) 594 continue; 595 if (sr_raid5_chunk_rebuild(sd, i)) 596 other_rebuild = 1; 597 else if (!sr_raid5_chunk_online(sd, i)) 598 other_offline = 1; 599 } 600 601 DNPRINTF(SR_D_DIS, "%s: %s chunk online %d, parity online %d, " 602 "other offline %d\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 603 chunk_online, parity_online, other_offline); 604 605 if (!parity_online && !parity_rebuild) 606 goto data_write; 607 608 xorbuf = sr_block_get(sd, len); 609 if (xorbuf == NULL) 610 goto bad; 611 memcpy(xorbuf, data, len); 612 613 if (other_offline || other_rebuild) { 614 615 /* 616 * XXX - If we can guarantee that this LBA has been scrubbed 617 * then we can also take this faster path. 618 */ 619 620 /* Read in existing data and existing parity. */ 621 if (sr_raid5_addio(wu_r, chunk, blkno, len, NULL, 622 SCSI_DATA_IN, 0, xorbuf)) 623 goto bad; 624 if (sr_raid5_addio(wu_r, parity, blkno, len, NULL, 625 SCSI_DATA_IN, 0, xorbuf)) 626 goto bad; 627 628 } else { 629 630 /* Read in existing data from all other chunks. */ 631 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 632 if (i == chunk || i == parity) 633 continue; 634 if (sr_raid5_addio(wu_r, i, blkno, len, NULL, 635 SCSI_DATA_IN, 0, xorbuf)) 636 goto bad; 637 } 638 639 } 640 641 /* Write new parity. */ 642 if (sr_raid5_addio(wu, parity, blkno, len, xorbuf, xs->flags, 643 SR_CCBF_FREEBUF, NULL)) 644 goto bad; 645 646 data_write: 647 /* Write new data. */ 648 if (chunk_online || chunk_rebuild) 649 if (sr_raid5_addio(wu, chunk, blkno, len, data, xs->flags, 650 0, NULL)) 651 goto bad; 652 653 return (0); 654 655 bad: 656 return (1); 657 } 658 659 void 660 sr_raid5_intr(struct buf *bp) 661 { 662 struct sr_ccb *ccb = (struct sr_ccb *)bp; 663 struct sr_workunit *wu = ccb->ccb_wu; 664 struct sr_discipline *sd = wu->swu_dis; 665 int s; 666 667 DNPRINTF(SR_D_INTR, "%s: sr_raid5_intr bp %p xs %p\n", 668 DEVNAME(sd->sd_sc), bp, wu->swu_xs); 669 670 s = splbio(); 671 sr_ccb_done(ccb); 672 673 /* XXX - Should this be done via the taskq? */ 674 675 /* XOR data to result. */ 676 if (ccb->ccb_state == SR_CCB_OK && ccb->ccb_opaque) 677 sr_raid5_xor(ccb->ccb_opaque, ccb->ccb_buf.b_data, 678 ccb->ccb_buf.b_bcount); 679 680 /* Free allocated data buffer. */ 681 if (ccb->ccb_flags & SR_CCBF_FREEBUF) { 682 sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount); 683 ccb->ccb_buf.b_data = NULL; 684 } 685 686 sr_wu_done(wu); 687 splx(s); 688 } 689 690 int 691 sr_raid5_wu_done(struct sr_workunit *wu) 692 { 693 struct sr_discipline *sd = wu->swu_dis; 694 struct scsi_xfer *xs = wu->swu_xs; 695 696 /* XXX - we have no way of propagating errors... */ 697 if (wu->swu_flags & (SR_WUF_DISCIPLINE | SR_WUF_REBUILD)) 698 return SR_WU_OK; 699 700 /* XXX - This is insufficient for RAID 5. */ 701 if (wu->swu_ios_succeeded > 0) { 702 xs->error = XS_NOERROR; 703 return SR_WU_OK; 704 } 705 706 if (xs->flags & SCSI_DATA_IN) { 707 printf("%s: retrying read on block %lld\n", 708 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 709 sr_wu_release_ccbs(wu); 710 wu->swu_state = SR_WU_RESTART; 711 if (sd->sd_scsi_rw(wu) == 0) 712 return SR_WU_RESTART; 713 } else { 714 /* XXX - retry write if we just went from online to degraded. */ 715 printf("%s: permanently fail write on block %lld\n", 716 sd->sd_meta->ssd_devname, (long long)wu->swu_blk_start); 717 } 718 719 wu->swu_state = SR_WU_FAILED; 720 xs->error = XS_DRIVER_STUFFUP; 721 722 return SR_WU_FAILED; 723 } 724 725 int 726 sr_raid5_addio(struct sr_workunit *wu, int chunk, daddr_t blkno, 727 long len, void *data, int xsflags, int ccbflags, void *xorbuf) 728 { 729 struct sr_discipline *sd = wu->swu_dis; 730 struct sr_ccb *ccb; 731 732 DNPRINTF(SR_D_DIS, "sr_raid5_addio: %s chunk %d block %lld " 733 "length %ld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write", 734 chunk, (long long)blkno, len, xorbuf ? "X0R" : "-"); 735 736 /* Allocate temporary buffer. */ 737 if (data == NULL) { 738 data = sr_block_get(sd, len); 739 if (data == NULL) 740 return (-1); 741 ccbflags |= SR_CCBF_FREEBUF; 742 } 743 744 ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags, ccbflags); 745 if (ccb == NULL) { 746 if (ccbflags & SR_CCBF_FREEBUF) 747 sr_block_put(sd, data, len); 748 return (-1); 749 } 750 ccb->ccb_opaque = xorbuf; 751 sr_wu_enqueue_ccb(wu, ccb); 752 753 return (0); 754 } 755 756 void 757 sr_raid5_xor(void *a, void *b, int len) 758 { 759 uint32_t *xa = a, *xb = b; 760 761 len >>= 2; 762 while (len--) 763 *xa++ ^= *xb++; 764 } 765 766 void 767 sr_raid5_rebuild(struct sr_discipline *sd) 768 { 769 int64_t strip_no, strip_size, strip_bits, i, restart; 770 int64_t chunk_count, chunk_strips, chunk_lba, chunk_size, row_size; 771 struct sr_workunit *wu_r, *wu_w; 772 int s, slept, percent = 0, old_percent = -1; 773 int rebuild_chunk = -1; 774 void *xorbuf; 775 776 /* Find the rebuild chunk. */ 777 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 778 if (sr_raid5_chunk_rebuild(sd, i)) { 779 rebuild_chunk = i; 780 break; 781 } 782 } 783 if (rebuild_chunk == -1) 784 goto bad; 785 786 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 787 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 788 chunk_count = sd->sd_meta->ssdi.ssd_chunk_no - 1; 789 chunk_size = sd->sd_meta->ssdi.ssd_size / chunk_count; 790 chunk_strips = (chunk_size << DEV_BSHIFT) >> strip_bits; 791 row_size = (chunk_count << strip_bits) >> DEV_BSHIFT; 792 793 DNPRINTF(SR_D_REBUILD, "%s: %s sr_raid5_rebuild volume size = %lld, " 794 "chunk count = %lld, chunk size = %lld, chunk strips = %lld, " 795 "row size = %lld\n", DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, 796 sd->sd_meta->ssdi.ssd_size, chunk_count, chunk_size, chunk_strips, 797 row_size); 798 799 restart = sd->sd_meta->ssd_rebuild / row_size; 800 if (restart > chunk_strips) { 801 printf("%s: bogus rebuild restart offset, starting from 0\n", 802 DEVNAME(sd->sd_sc)); 803 restart = 0; 804 } 805 if (restart != 0) { 806 percent = sr_rebuild_percent(sd); 807 printf("%s: resuming rebuild on %s at %d%%\n", 808 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, percent); 809 } 810 811 for (strip_no = restart; strip_no < chunk_strips; strip_no++) { 812 chunk_lba = (strip_size >> DEV_BSHIFT) * strip_no; 813 814 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild strip %lld, " 815 "chunk lba = %lld\n", DEVNAME(sd->sd_sc), 816 sd->sd_meta->ssd_devname, strip_no, chunk_lba); 817 818 wu_w = sr_scsi_wu_get(sd, 0); 819 wu_r = sr_scsi_wu_get(sd, 0); 820 821 xorbuf = sr_block_get(sd, strip_size); 822 if (xorbuf == NULL) 823 goto bad; 824 if (sr_raid5_regenerate(wu_r, rebuild_chunk, chunk_lba, 825 strip_size, xorbuf)) 826 goto bad; 827 if (sr_raid5_addio(wu_w, rebuild_chunk, chunk_lba, strip_size, 828 xorbuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL)) 829 goto bad; 830 831 /* Collide write work unit with read work unit. */ 832 wu_r->swu_state = SR_WU_INPROGRESS; 833 wu_r->swu_flags |= SR_WUF_REBUILD; 834 wu_w->swu_state = SR_WU_DEFERRED; 835 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 836 wu_r->swu_collider = wu_w; 837 838 /* Block I/O to this strip while we rebuild it. */ 839 wu_r->swu_blk_start = (strip_no / chunk_count) * row_size; 840 wu_r->swu_blk_end = wu_r->swu_blk_start + row_size - 1; 841 wu_w->swu_blk_start = wu_r->swu_blk_start; 842 wu_w->swu_blk_end = wu_r->swu_blk_end; 843 844 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild swu_blk_start = %lld, " 845 "swu_blk_end = %lld\n", DEVNAME(sd->sd_sc), 846 sd->sd_meta->ssd_devname, 847 wu_r->swu_blk_start, wu_r->swu_blk_end); 848 849 s = splbio(); 850 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 851 splx(s); 852 853 sr_schedule_wu(wu_r); 854 855 slept = 0; 856 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 857 tsleep_nsec(wu_w, PRIBIO, "sr_rebuild", INFSLP); 858 slept = 1; 859 } 860 if (!slept) { 861 tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield", 862 MSEC_TO_NSEC(1)); 863 } 864 865 sr_scsi_wu_put(sd, wu_r); 866 sr_scsi_wu_put(sd, wu_w); 867 868 sd->sd_meta->ssd_rebuild = chunk_lba * chunk_count; 869 870 percent = sr_rebuild_percent(sd); 871 if (percent != old_percent && strip_no != chunk_strips - 1) { 872 if (sr_meta_save(sd, SR_META_DIRTY)) 873 printf("%s: could not save metadata to %s\n", 874 DEVNAME(sd->sd_sc), 875 sd->sd_meta->ssd_devname); 876 old_percent = percent; 877 } 878 879 if (sd->sd_reb_abort) 880 goto abort; 881 } 882 883 DNPRINTF(SR_D_REBUILD, "%s: %s rebuild complete\n", DEVNAME(sd->sd_sc), 884 sd->sd_meta->ssd_devname); 885 886 /* all done */ 887 sd->sd_meta->ssd_rebuild = 0; 888 for (i = 0; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) { 889 if (sd->sd_vol.sv_chunks[i]->src_meta.scm_status == 890 BIOC_SDREBUILD) { 891 sd->sd_set_chunk_state(sd, i, BIOC_SDONLINE); 892 break; 893 } 894 } 895 896 return; 897 898 abort: 899 if (sr_meta_save(sd, SR_META_DIRTY)) 900 printf("%s: could not save metadata to %s\n", 901 DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname); 902 bad: 903 return; 904 } 905 906 #if 0 907 void 908 sr_raid5_scrub(struct sr_discipline *sd) 909 { 910 int64_t strip_no, strip_size, no_chunk, parity, max_strip, strip_bits; 911 int64_t i; 912 struct sr_workunit *wu_r, *wu_w; 913 int s, slept; 914 void *xorbuf; 915 916 wu_w = sr_scsi_wu_get(sd, 0); 917 wu_r = sr_scsi_wu_get(sd, 0); 918 919 no_chunk = sd->sd_meta->ssdi.ssd_chunk_no - 1; 920 strip_size = sd->sd_meta->ssdi.ssd_strip_size; 921 strip_bits = sd->mds.mdd_raid5.sr5_strip_bits; 922 max_strip = sd->sd_meta->ssdi.ssd_size >> strip_bits; 923 924 for (strip_no = 0; strip_no < max_strip; strip_no++) { 925 parity = no_chunk - ((strip_no / no_chunk) % (no_chunk + 1)); 926 927 xorbuf = sr_block_get(sd, strip_size); 928 for (i = 0; i <= no_chunk; i++) { 929 if (i != parity) 930 sr_raid5_addio(wu_r, i, 0xBADCAFE, strip_size, 931 NULL, SCSI_DATA_IN, 0, xorbuf); 932 } 933 sr_raid5_addio(wu_w, parity, 0xBADCAFE, strip_size, xorbuf, 934 SCSI_DATA_OUT, SR_CCBF_FREEBUF, NULL); 935 936 wu_r->swu_flags |= SR_WUF_REBUILD; 937 938 /* Collide wu_w with wu_r */ 939 wu_w->swu_state = SR_WU_DEFERRED; 940 wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP; 941 wu_r->swu_collider = wu_w; 942 943 s = splbio(); 944 TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link); 945 splx(s); 946 947 wu_r->swu_state = SR_WU_INPROGRESS; 948 sr_schedule_wu(wu_r); 949 950 slept = 0; 951 while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) { 952 tsleep_nsec(wu_w, PRIBIO, "sr_scrub", INFSLP); 953 slept = 1; 954 } 955 if (!slept) { 956 tsleep_nsec(sd->sd_sc, PWAIT, "sr_yield", 957 MSEC_TO_NSEC(1)); 958 } 959 } 960 } 961 #endif 962