1 /*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/conf.h> 33 #include <sys/errno.h> 34 #include <sys/kernel.h> 35 #include <sys/kthread.h> 36 #include <sys/libkern.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mutex.h> 40 #include <sys/systm.h> 41 42 #include <geom/geom.h> 43 #include <geom/vinum/geom_vinum_var.h> 44 #include <geom/vinum/geom_vinum_raid5.h> 45 #include <geom/vinum/geom_vinum.h> 46 47 int gv_raid5_parity(struct gv_raid5_packet *); 48 int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); 49 50 struct gv_raid5_bit * 51 gv_new_raid5_bit(void) 52 { 53 struct gv_raid5_bit *r; 54 r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); 55 KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); 56 return (r); 57 } 58 59 struct gv_raid5_packet * 60 gv_new_raid5_packet(void) 61 { 62 struct gv_raid5_packet *wp; 63 64 wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); 65 KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); 66 wp->state = SETUP; 67 wp->type = JUNK; 68 TAILQ_INIT(&wp->bits); 69 70 return (wp); 71 } 72 73 void 74 gv_free_raid5_packet(struct gv_raid5_packet *wp) 75 { 76 struct gv_raid5_bit *r, *r2; 77 78 /* Remove all the bits from this work packet. */ 79 TAILQ_FOREACH_SAFE(r, &wp->bits, list, r2) { 80 TAILQ_REMOVE(&wp->bits, r, list); 81 if (r->malloc) 82 g_free(r->buf); 83 if (r->bio != NULL) 84 g_destroy_bio(r->bio); 85 g_free(r); 86 } 87 88 if (wp->bufmalloc == 1) 89 g_free(wp->buf); 90 g_free(wp); 91 } 92 93 /* 94 * Check if the stripe that the work packet wants is already being used by 95 * some other work packet. 96 */ 97 int 98 gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) 99 { 100 struct gv_raid5_packet *wpa; 101 102 TAILQ_FOREACH(wpa, &sc->worklist, list) { 103 if (wpa->lockbase == wp->lockbase) { 104 if (wpa->bio == wp->bio) 105 return (0); 106 return (1); 107 } 108 } 109 return (0); 110 } 111 112 /* 113 * The "worker" thread that runs through the worklist and fires off the 114 * "subrequests" needed to fulfill a RAID5 read or write request. 115 */ 116 void 117 gv_raid5_worker(void *arg) 118 { 119 struct bio *bp; 120 struct g_geom *gp; 121 struct gv_plex *p; 122 struct gv_raid5_packet *wp, *wpt; 123 struct gv_raid5_bit *rbp, *rbpt; 124 int error, restart; 125 126 gp = arg; 127 p = gp->softc; 128 129 mtx_lock(&p->worklist_mtx); 130 for (;;) { 131 restart = 0; 132 TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { 133 /* This request packet is already being processed. */ 134 if (wp->state == IO) 135 continue; 136 /* This request packet is ready for processing. */ 137 if (wp->state == VALID) { 138 /* Couldn't get the lock, try again. */ 139 if ((wp->lockbase != -1) && 140 gv_stripe_active(wp, p)) 141 continue; 142 143 wp->state = IO; 144 mtx_unlock(&p->worklist_mtx); 145 TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) 146 g_io_request(rbp->bio, rbp->consumer); 147 mtx_lock(&p->worklist_mtx); 148 continue; 149 } 150 if (wp->state == FINISH) { 151 bp = wp->bio; 152 bp->bio_completed += wp->length; 153 /* 154 * Deliver the original request if we have 155 * finished. 156 */ 157 if (bp->bio_completed == bp->bio_length) { 158 mtx_unlock(&p->worklist_mtx); 159 g_io_deliver(bp, 0); 160 mtx_lock(&p->worklist_mtx); 161 } 162 TAILQ_REMOVE(&p->worklist, wp, list); 163 gv_free_raid5_packet(wp); 164 restart++; 165 /*break;*/ 166 } 167 } 168 if (!restart) { 169 /* Self-destruct. */ 170 if (p->flags & GV_PLEX_THREAD_DIE) 171 break; 172 error = msleep(p, &p->worklist_mtx, PRIBIO, "-", 173 hz/100); 174 } 175 } 176 mtx_unlock(&p->worklist_mtx); 177 178 g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); 179 180 /* Signal our plex that we are dead. */ 181 p->flags |= GV_PLEX_THREAD_DEAD; 182 wakeup(p); 183 kthread_exit(0); 184 } 185 186 /* Final bio transaction to write out the parity data. */ 187 int 188 gv_raid5_parity(struct gv_raid5_packet *wp) 189 { 190 struct bio *bp; 191 192 bp = g_new_bio(); 193 if (bp == NULL) 194 return (ENOMEM); 195 196 wp->type = ISPARITY; 197 bp->bio_cmd = BIO_WRITE; 198 bp->bio_data = wp->buf; 199 bp->bio_offset = wp->offset; 200 bp->bio_length = wp->length; 201 bp->bio_done = gv_raid5_done; 202 bp->bio_caller1 = wp; 203 bp->bio_caller2 = NULL; 204 g_io_request(bp, wp->parity); 205 206 return (0); 207 } 208 209 /* We end up here after each subrequest. */ 210 void 211 gv_raid5_done(struct bio *bp) 212 { 213 struct bio *obp; 214 struct g_geom *gp; 215 struct gv_plex *p; 216 struct gv_raid5_packet *wp; 217 struct gv_raid5_bit *rbp; 218 off_t i; 219 int error; 220 221 wp = bp->bio_caller1; 222 rbp = bp->bio_caller2; 223 obp = wp->bio; 224 gp = bp->bio_from->geom; 225 p = gp->softc; 226 227 /* One less active subrequest. */ 228 wp->active--; 229 230 switch (obp->bio_cmd) { 231 case BIO_READ: 232 /* Degraded reads need to handle parity data. */ 233 if (wp->type == DEGRADED) { 234 for (i = 0; i < wp->length; i++) 235 wp->buf[i] ^= bp->bio_data[i]; 236 237 /* When we're finished copy back the data we want. */ 238 if (wp->active == 0) 239 bcopy(wp->buf, wp->data, wp->length); 240 } 241 242 break; 243 244 case BIO_WRITE: 245 /* Handle the parity data, if needed. */ 246 if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 247 for (i = 0; i < wp->length; i++) 248 wp->buf[i] ^= bp->bio_data[i]; 249 250 /* Write out the parity data we calculated. */ 251 if (wp->active == 0) { 252 wp->active++; 253 error = gv_raid5_parity(wp); 254 } 255 } 256 break; 257 } 258 259 /* This request group is done. */ 260 if (wp->active == 0) 261 wp->state = FINISH; 262 } 263 264 /* Build a request group to perform (part of) a RAID5 request. */ 265 int 266 gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 267 long bcount, off_t boff) 268 { 269 struct g_geom *gp; 270 struct gv_plex *p; 271 struct gv_raid5_bit *rbp; 272 struct gv_sd *broken, *original, *parity, *s; 273 int i, psdno, sdno; 274 off_t len_left, real_off, stripeend, stripeoff, stripestart; 275 276 gp = bp->bio_to->geom; 277 p = gp->softc; 278 279 if (p == NULL || LIST_EMPTY(&p->subdisks)) 280 return (ENXIO); 281 282 /* We are optimistic and assume that this request will be OK. */ 283 wp->type = NORMAL; 284 original = parity = broken = NULL; 285 286 /* The number of the subdisk containing the parity stripe. */ 287 psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 288 p->sdcount; 289 KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 290 291 /* Offset of the start address from the start of the stripe. */ 292 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 293 KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 294 295 /* The number of the subdisk where the stripe resides. */ 296 sdno = stripeoff / p->stripesize; 297 KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 298 299 /* At or past parity subdisk. */ 300 if (sdno >= psdno) 301 sdno++; 302 303 /* The offset of the stripe on this subdisk. */ 304 stripestart = (boff - stripeoff) / (p->sdcount - 1); 305 KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 306 307 stripeoff %= p->stripesize; 308 309 /* The offset of the request on this subdisk. */ 310 real_off = stripestart + stripeoff; 311 312 stripeend = stripestart + p->stripesize; 313 len_left = stripeend - real_off; 314 KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 315 316 /* Find the right subdisks. */ 317 i = 0; 318 LIST_FOREACH(s, &p->subdisks, in_plex) { 319 if (i == sdno) 320 original = s; 321 if (i == psdno) 322 parity = s; 323 if (s->state != GV_SD_UP) 324 broken = s; 325 i++; 326 } 327 328 if ((original == NULL) || (parity == NULL)) 329 return (ENXIO); 330 331 /* Our data stripe is missing. */ 332 if (original->state != GV_SD_UP) 333 wp->type = DEGRADED; 334 /* Our parity stripe is missing. */ 335 if (parity->state != GV_SD_UP) { 336 /* We cannot take another failure if we're already degraded. */ 337 if (wp->type != NORMAL) 338 return (ENXIO); 339 else 340 wp->type = NOPARITY; 341 } 342 343 /* 344 * A combined write is necessary when the original data subdisk and the 345 * parity subdisk are both up, but one of the other subdisks isn't. 346 */ 347 if ((broken != NULL) && (broken != parity) && (broken != original)) 348 wp->type = COMBINED; 349 350 wp->offset = real_off; 351 wp->length = (bcount <= len_left) ? bcount : len_left; 352 wp->data = addr; 353 wp->original = original->consumer; 354 wp->parity = parity->consumer; 355 wp->lockbase = stripestart; 356 357 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 358 359 switch (bp->bio_cmd) { 360 case BIO_READ: 361 /* 362 * For a degraded read we need to read in all stripes except 363 * the broken one plus the parity stripe and then recalculate 364 * the desired data. 365 */ 366 if (wp->type == DEGRADED) { 367 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 368 if (wp->buf == NULL) 369 return (ENOMEM); 370 wp->bufmalloc = 1; 371 LIST_FOREACH(s, &p->subdisks, in_plex) { 372 /* Skip the broken subdisk. */ 373 if (s == broken) 374 continue; 375 rbp = gv_new_raid5_bit(); 376 rbp->consumer = s->consumer; 377 rbp->bio = g_new_bio(); 378 if (rbp->bio == NULL) 379 return (ENOMEM); 380 rbp->buf = g_malloc(wp->length, 381 M_NOWAIT | M_ZERO); 382 if (rbp->buf == NULL) 383 return (ENOMEM); 384 rbp->malloc = 1; 385 rbp->bio->bio_cmd = BIO_READ; 386 rbp->bio->bio_offset = wp->offset; 387 rbp->bio->bio_length = wp->length; 388 rbp->bio->bio_data = rbp->buf; 389 rbp->bio->bio_done = gv_raid5_done; 390 rbp->bio->bio_caller1 = wp; 391 rbp->bio->bio_caller2 = rbp; 392 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 393 wp->active++; 394 wp->rqcount++; 395 } 396 397 /* A normal read can be fulfilled with the original subdisk. */ 398 } else { 399 rbp = gv_new_raid5_bit(); 400 rbp->consumer = wp->original; 401 rbp->bio = g_new_bio(); 402 if (rbp->bio == NULL) 403 return (ENOMEM); 404 rbp->bio->bio_cmd = BIO_READ; 405 rbp->bio->bio_offset = wp->offset; 406 rbp->bio->bio_length = wp->length; 407 rbp->buf = addr; 408 rbp->bio->bio_data = rbp->buf; 409 rbp->bio->bio_done = gv_raid5_done; 410 rbp->bio->bio_caller1 = wp; 411 rbp->bio->bio_caller2 = rbp; 412 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 413 wp->active++; 414 wp->rqcount++; 415 } 416 if (wp->type != COMBINED) 417 wp->lockbase = -1; 418 break; 419 420 case BIO_WRITE: 421 /* 422 * A degraded write means we cannot write to the original data 423 * subdisk. Thus we need to read in all valid stripes, 424 * recalculate the parity from the original data, and then 425 * write the parity stripe back out. 426 */ 427 if (wp->type == DEGRADED) { 428 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 429 if (wp->buf == NULL) 430 return (ENOMEM); 431 wp->bufmalloc = 1; 432 433 /* Copy the original data. */ 434 bcopy(wp->data, wp->buf, wp->length); 435 436 LIST_FOREACH(s, &p->subdisks, in_plex) { 437 /* Skip the broken and the parity subdisk. */ 438 if ((s == broken) || 439 (s->consumer == wp->parity)) 440 continue; 441 442 rbp = gv_new_raid5_bit(); 443 rbp->consumer = s->consumer; 444 rbp->bio = g_new_bio(); 445 if (rbp->bio == NULL) 446 return (ENOMEM); 447 rbp->buf = g_malloc(wp->length, 448 M_NOWAIT | M_ZERO); 449 if (rbp->buf == NULL) 450 return (ENOMEM); 451 rbp->malloc = 1; 452 rbp->bio->bio_cmd = BIO_READ; 453 rbp->bio->bio_data = rbp->buf; 454 rbp->bio->bio_offset = wp->offset; 455 rbp->bio->bio_length = wp->length; 456 rbp->bio->bio_done = gv_raid5_done; 457 rbp->bio->bio_caller1 = wp; 458 rbp->bio->bio_caller2 = rbp; 459 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 460 wp->active++; 461 wp->rqcount++; 462 } 463 464 /* 465 * When we don't have the parity stripe we just write out the 466 * data. 467 */ 468 } else if (wp->type == NOPARITY) { 469 rbp = gv_new_raid5_bit(); 470 rbp->consumer = wp->original; 471 rbp->bio = g_new_bio(); 472 if (rbp->bio == NULL) 473 return (ENOMEM); 474 rbp->bio->bio_cmd = BIO_WRITE; 475 rbp->bio->bio_offset = wp->offset; 476 rbp->bio->bio_length = wp->length; 477 rbp->bio->bio_data = addr; 478 rbp->bio->bio_done = gv_raid5_done; 479 rbp->bio->bio_caller1 = wp; 480 rbp->bio->bio_caller2 = rbp; 481 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 482 wp->active++; 483 wp->rqcount++; 484 485 /* 486 * A combined write means that our data subdisk and the parity 487 * subdisks are both up, but another subdisk isn't. We need to 488 * read all valid stripes including the parity to recalculate 489 * the data of the stripe that is missing. Then we write our 490 * original data, and together with the other data stripes 491 * recalculate the parity again. 492 */ 493 } else if (wp->type == COMBINED) { 494 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 495 if (wp->buf == NULL) 496 return (ENOMEM); 497 wp->bufmalloc = 1; 498 499 /* Get the data from all subdisks. */ 500 LIST_FOREACH(s, &p->subdisks, in_plex) { 501 /* Skip the broken subdisk. */ 502 if (s == broken) 503 continue; 504 505 rbp = gv_new_raid5_bit(); 506 rbp->consumer = s->consumer; 507 rbp->bio = g_new_bio(); 508 if (rbp->bio == NULL) 509 return (ENOMEM); 510 rbp->bio->bio_cmd = BIO_READ; 511 rbp->buf = g_malloc(wp->length, 512 M_NOWAIT | M_ZERO); 513 if (rbp->buf == NULL) 514 return (ENOMEM); 515 rbp->malloc = 1; 516 rbp->bio->bio_data = rbp->buf; 517 rbp->bio->bio_offset = wp->offset; 518 rbp->bio->bio_length = wp->length; 519 rbp->bio->bio_done = gv_raid5_done; 520 rbp->bio->bio_caller1 = wp; 521 rbp->bio->bio_caller2 = rbp; 522 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 523 wp->active++; 524 wp->rqcount++; 525 } 526 527 /* Write the original data. */ 528 rbp = gv_new_raid5_bit(); 529 rbp->consumer = wp->original; 530 rbp->buf = addr; 531 rbp->bio = g_new_bio(); 532 if (rbp->bio == NULL) 533 return (ENOMEM); 534 rbp->bio->bio_cmd = BIO_WRITE; 535 rbp->bio->bio_data = rbp->buf; 536 rbp->bio->bio_offset = wp->offset; 537 rbp->bio->bio_length = wp->length; 538 rbp->bio->bio_done = gv_raid5_done; 539 rbp->bio->bio_caller1 = wp; 540 rbp->bio->bio_caller2 = rbp; 541 /* 542 * Insert at the tail, because we want to read the old 543 * data first. 544 */ 545 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 546 wp->active++; 547 wp->rqcount++; 548 549 /* Get the rest of the data again. */ 550 LIST_FOREACH(s, &p->subdisks, in_plex) { 551 /* 552 * Skip the broken subdisk, the parity, and the 553 * one we just wrote. 554 */ 555 if ((s == broken) || 556 (s->consumer == wp->parity) || 557 (s->consumer == wp->original)) 558 continue; 559 rbp = gv_new_raid5_bit(); 560 rbp->consumer = s->consumer; 561 rbp->bio = g_new_bio(); 562 if (rbp->bio == NULL) 563 return (ENOMEM); 564 rbp->bio->bio_cmd = BIO_READ; 565 rbp->buf = g_malloc(wp->length, 566 M_NOWAIT | M_ZERO); 567 if (rbp->buf == NULL) 568 return (ENOMEM); 569 rbp->malloc = 1; 570 rbp->bio->bio_data = rbp->buf; 571 rbp->bio->bio_offset = wp->offset; 572 rbp->bio->bio_length = wp->length; 573 rbp->bio->bio_done = gv_raid5_done; 574 rbp->bio->bio_caller1 = wp; 575 rbp->bio->bio_caller2 = rbp; 576 /* 577 * Again, insert at the tail to keep correct 578 * order. 579 */ 580 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 581 wp->active++; 582 wp->rqcount++; 583 } 584 585 586 /* 587 * A normal write request goes to the original subdisk, then we 588 * read in all other stripes, recalculate the parity and write 589 * out the parity again. 590 */ 591 } else { 592 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 593 if (wp->buf == NULL) 594 return (ENOMEM); 595 wp->bufmalloc = 1; 596 LIST_FOREACH(s, &p->subdisks, in_plex) { 597 /* Skip the parity stripe. */ 598 if (s->consumer == wp->parity) 599 continue; 600 601 rbp = gv_new_raid5_bit(); 602 rbp->consumer = s->consumer; 603 rbp->bio = g_new_bio(); 604 if (rbp->bio == NULL) 605 return (ENOMEM); 606 /* 607 * The data for the original stripe is written, 608 * the others need to be read in for the parity 609 * calculation. 610 */ 611 if (s->consumer == wp->original) { 612 rbp->bio->bio_cmd = BIO_WRITE; 613 rbp->buf = addr; 614 } else { 615 rbp->bio->bio_cmd = BIO_READ; 616 rbp->buf = g_malloc(wp->length, 617 M_NOWAIT | M_ZERO); 618 if (rbp->buf == NULL) 619 return (ENOMEM); 620 rbp->malloc = 1; 621 } 622 rbp->bio->bio_data = rbp->buf; 623 rbp->bio->bio_offset = wp->offset; 624 rbp->bio->bio_length = wp->length; 625 rbp->bio->bio_done = gv_raid5_done; 626 rbp->bio->bio_caller1 = wp; 627 rbp->bio->bio_caller2 = rbp; 628 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 629 wp->active++; 630 wp->rqcount++; 631 } 632 } 633 break; 634 default: 635 return (EINVAL); 636 } 637 638 wp->state = VALID; 639 return (0); 640 } 641