1 /*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

int	gv_raid5_parity(struct gv_raid5_packet *);
int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);

/*
 * Allocate a new RAID5 "bit" (one subrequest of a work packet), zeroed.
 *
 * NOTE(review): g_malloc() with M_NOWAIT may return NULL; the KASSERT
 * only fires on INVARIANTS kernels, so on production kernels callers
 * effectively rely on the allocation succeeding.
 */
struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
	struct gv_raid5_bit *r;

	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
	return (r);
}

/*
 * Allocate and initialize a new RAID5 work packet in its SETUP state with
 * an empty subrequest queue.  Same M_NOWAIT/KASSERT caveat as
 * gv_new_raid5_bit() applies.
 */
struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
	struct gv_raid5_packet *wp;

	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
	wp->state = SETUP;
	wp->type = JUNK;
	TAILQ_INIT(&wp->bits);

	return (wp);
}

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.  Returns 1 if the stripe is held by a different
 * bio, 0 if it is free or held by this very bio.
 *
 * The worklist is walked without taking a lock here; the caller
 * (gv_raid5_worker) holds p->worklist_mtx when calling this.
 */
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
{
	struct gv_raid5_packet *wpa;

	TAILQ_FOREACH(wpa, &sc->worklist, list) {
		if (wpa->lockbase == wp->lockbase) {
			if (wpa->bio == wp->bio)
				return (0);
			return (1);
		}
	}
	return (0);
}

/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
95 */ 96 void 97 gv_raid5_worker(void *arg) 98 { 99 struct bio *bp; 100 struct g_geom *gp; 101 struct gv_plex *p; 102 struct gv_raid5_packet *wp, *wpt; 103 struct gv_raid5_bit *rbp, *rbpt; 104 int error, restart; 105 106 gp = arg; 107 p = gp->softc; 108 109 mtx_lock(&p->worklist_mtx); 110 for (;;) { 111 restart = 0; 112 TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { 113 /* This request packet is already being processed. */ 114 if (wp->state == IO) 115 continue; 116 /* This request packet is ready for processing. */ 117 if (wp->state == VALID) { 118 /* Couldn't get the lock, try again. */ 119 if ((wp->lockbase != -1) && 120 gv_stripe_active(wp, p)) 121 continue; 122 123 wp->state = IO; 124 mtx_unlock(&p->worklist_mtx); 125 TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) 126 g_io_request(rbp->bio, rbp->consumer); 127 mtx_lock(&p->worklist_mtx); 128 continue; 129 } 130 if (wp->state == FINISH) { 131 bp = wp->bio; 132 bp->bio_completed += wp->length; 133 /* 134 * Deliver the original request if we have 135 * finished. 136 */ 137 if (bp->bio_completed == bp->bio_length) { 138 mtx_unlock(&p->worklist_mtx); 139 g_io_deliver(bp, 0); 140 mtx_lock(&p->worklist_mtx); 141 } 142 TAILQ_REMOVE(&p->worklist, wp, list); 143 if (wp->bufmalloc == 1) 144 g_free(wp->buf); 145 g_free(wp); 146 restart++; 147 /*break;*/ 148 } 149 } 150 if (!restart) { 151 /* Self-destruct. */ 152 if (p->flags & GV_PLEX_THREAD_DIE) 153 break; 154 error = msleep(p, &p->worklist_mtx, PRIBIO, "-", 155 hz/100); 156 } 157 } 158 mtx_unlock(&p->worklist_mtx); 159 160 g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); 161 162 /* Signal our plex that we are dead. */ 163 p->flags |= GV_PLEX_THREAD_DEAD; 164 wakeup(p); 165 kthread_exit(0); 166 } 167 168 /* Final bio transaction to write out the parity data. 
*/ 169 int 170 gv_raid5_parity(struct gv_raid5_packet *wp) 171 { 172 struct bio *bp; 173 174 bp = g_new_bio(); 175 if (bp == NULL) 176 return (ENOMEM); 177 178 wp->type = ISPARITY; 179 bp->bio_cmd = BIO_WRITE; 180 bp->bio_data = wp->buf; 181 bp->bio_offset = wp->offset; 182 bp->bio_length = wp->length; 183 bp->bio_done = gv_raid5_done; 184 bp->bio_caller1 = wp; 185 bp->bio_caller2 = NULL; 186 g_io_request(bp, wp->parity); 187 188 return (0); 189 } 190 191 /* We end up here after each subrequest. */ 192 void 193 gv_raid5_done(struct bio *bp) 194 { 195 struct bio *obp; 196 struct g_geom *gp; 197 struct gv_plex *p; 198 struct gv_raid5_packet *wp; 199 struct gv_raid5_bit *rbp; 200 off_t i; 201 int error; 202 203 wp = bp->bio_caller1; 204 rbp = bp->bio_caller2; 205 obp = wp->bio; 206 gp = bp->bio_from->geom; 207 p = gp->softc; 208 209 /* One less active subrequest. */ 210 wp->active--; 211 212 switch (obp->bio_cmd) { 213 case BIO_READ: 214 /* Degraded reads need to handle parity data. */ 215 if (wp->type == DEGRADED) { 216 for (i = 0; i < wp->length; i++) 217 wp->buf[i] ^= bp->bio_data[i]; 218 219 /* When we're finished copy back the data we want. */ 220 if (wp->active == 0) 221 bcopy(wp->buf, wp->data, wp->length); 222 } 223 224 break; 225 226 case BIO_WRITE: 227 /* Handle the parity data, if needed. */ 228 if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 229 for (i = 0; i < wp->length; i++) 230 wp->buf[i] ^= bp->bio_data[i]; 231 232 /* Write out the parity data we calculated. */ 233 if (wp->active == 0) { 234 wp->active++; 235 error = gv_raid5_parity(wp); 236 } 237 } 238 break; 239 } 240 241 g_destroy_bio(bp); 242 243 if (rbp != NULL) { 244 if (rbp->malloc == 1) 245 g_free(rbp->buf); 246 TAILQ_REMOVE(&wp->bits, rbp, list); 247 g_free(rbp); 248 } 249 250 /* This request group is done. */ 251 if (wp->active == 0) 252 wp->state = FINISH; 253 } 254 255 /* Build a request group to perform (part of) a RAID5 request. 
*/ 256 int 257 gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 258 long bcount, off_t boff) 259 { 260 struct g_geom *gp; 261 struct gv_plex *p; 262 struct gv_raid5_bit *rbp; 263 struct gv_sd *broken, *original, *parity, *s; 264 int i, psdno, sdno; 265 off_t len_left, real_off, stripeend, stripeoff, stripestart; 266 267 gp = bp->bio_to->geom; 268 p = gp->softc; 269 270 if (p == NULL || LIST_EMPTY(&p->subdisks)) 271 return (ENXIO); 272 273 /* We are optimistic and assume that this request will be OK. */ 274 wp->type = NORMAL; 275 original = parity = broken = NULL; 276 277 /* The number of the subdisk containing the parity stripe. */ 278 psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 279 p->sdcount; 280 KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 281 282 /* Offset of the start address from the start of the stripe. */ 283 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 284 KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 285 286 /* The number of the subdisk where the stripe resides. */ 287 sdno = stripeoff / p->stripesize; 288 KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 289 290 /* At or past parity subdisk. */ 291 if (sdno >= psdno) 292 sdno++; 293 294 /* The offset of the stripe on this subdisk. */ 295 stripestart = (boff - stripeoff) / (p->sdcount - 1); 296 KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 297 298 stripeoff %= p->stripesize; 299 300 /* The offset of the request on this subdisk. */ 301 real_off = stripestart + stripeoff; 302 303 stripeend = stripestart + p->stripesize; 304 len_left = stripeend - real_off; 305 KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 306 307 /* Find the right subdisks. 
*/ 308 i = 0; 309 LIST_FOREACH(s, &p->subdisks, in_plex) { 310 if (i == sdno) 311 original = s; 312 if (i == psdno) 313 parity = s; 314 if (s->state != GV_SD_UP) 315 broken = s; 316 i++; 317 } 318 319 if ((original == NULL) || (parity == NULL)) 320 return (ENXIO); 321 322 /* Our data stripe is missing. */ 323 if (original->state != GV_SD_UP) 324 wp->type = DEGRADED; 325 /* Our parity stripe is missing. */ 326 if (parity->state != GV_SD_UP) { 327 /* We cannot take another failure if we're already degraded. */ 328 if (wp->type != NORMAL) 329 return (ENXIO); 330 else 331 wp->type = NOPARITY; 332 } 333 334 /* 335 * A combined write is necessary when the original data subdisk and the 336 * parity subdisk are both up, but one of the other subdisks isn't. 337 */ 338 if ((broken != NULL) && (broken != parity) && (broken != original)) 339 wp->type = COMBINED; 340 341 wp->offset = real_off; 342 wp->length = (bcount <= len_left) ? bcount : len_left; 343 wp->data = addr; 344 wp->original = original->consumer; 345 wp->parity = parity->consumer; 346 wp->lockbase = stripestart; 347 348 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 349 350 switch (bp->bio_cmd) { 351 case BIO_READ: 352 /* 353 * For a degraded read we need to read in all stripes except 354 * the broken one plus the parity stripe and then recalculate 355 * the desired data. 356 */ 357 if (wp->type == DEGRADED) { 358 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 359 if (wp->buf == NULL) 360 return (ENOMEM); 361 wp->bufmalloc = 1; 362 LIST_FOREACH(s, &p->subdisks, in_plex) { 363 /* Skip the broken subdisk. 
*/ 364 if (s == broken) 365 continue; 366 rbp = gv_new_raid5_bit(); 367 rbp->consumer = s->consumer; 368 rbp->bio = g_new_bio(); 369 if (rbp->bio == NULL) 370 return (ENOMEM); 371 rbp->buf = g_malloc(wp->length, 372 M_NOWAIT | M_ZERO); 373 if (rbp->buf == NULL) 374 return (ENOMEM); 375 rbp->malloc = 1; 376 rbp->bio->bio_cmd = BIO_READ; 377 rbp->bio->bio_offset = wp->offset; 378 rbp->bio->bio_length = wp->length; 379 rbp->bio->bio_data = rbp->buf; 380 rbp->bio->bio_done = gv_raid5_done; 381 rbp->bio->bio_caller1 = wp; 382 rbp->bio->bio_caller2 = rbp; 383 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 384 wp->active++; 385 wp->rqcount++; 386 } 387 388 /* A normal read can be fulfilled with the original subdisk. */ 389 } else { 390 rbp = gv_new_raid5_bit(); 391 rbp->consumer = wp->original; 392 rbp->bio = g_new_bio(); 393 if (rbp->bio == NULL) 394 return (ENOMEM); 395 rbp->bio->bio_cmd = BIO_READ; 396 rbp->bio->bio_offset = wp->offset; 397 rbp->bio->bio_length = wp->length; 398 rbp->buf = addr; 399 rbp->bio->bio_data = rbp->buf; 400 rbp->bio->bio_done = gv_raid5_done; 401 rbp->bio->bio_caller1 = wp; 402 rbp->bio->bio_caller2 = rbp; 403 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 404 wp->active++; 405 wp->rqcount++; 406 } 407 if (wp->type != COMBINED) 408 wp->lockbase = -1; 409 break; 410 411 case BIO_WRITE: 412 /* 413 * A degraded write means we cannot write to the original data 414 * subdisk. Thus we need to read in all valid stripes, 415 * recalculate the parity from the original data, and then 416 * write the parity stripe back out. 417 */ 418 if (wp->type == DEGRADED) { 419 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 420 if (wp->buf == NULL) 421 return (ENOMEM); 422 wp->bufmalloc = 1; 423 424 /* Copy the original data. */ 425 bcopy(wp->data, wp->buf, wp->length); 426 427 LIST_FOREACH(s, &p->subdisks, in_plex) { 428 /* Skip the broken and the parity subdisk. 
*/ 429 if ((s == broken) || 430 (s->consumer == wp->parity)) 431 continue; 432 433 rbp = gv_new_raid5_bit(); 434 rbp->consumer = s->consumer; 435 rbp->bio = g_new_bio(); 436 if (rbp->bio == NULL) 437 return (ENOMEM); 438 rbp->buf = g_malloc(wp->length, 439 M_NOWAIT | M_ZERO); 440 if (rbp->buf == NULL) 441 return (ENOMEM); 442 rbp->malloc = 1; 443 rbp->bio->bio_cmd = BIO_READ; 444 rbp->bio->bio_data = rbp->buf; 445 rbp->bio->bio_offset = wp->offset; 446 rbp->bio->bio_length = wp->length; 447 rbp->bio->bio_done = gv_raid5_done; 448 rbp->bio->bio_caller1 = wp; 449 rbp->bio->bio_caller2 = rbp; 450 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 451 wp->active++; 452 wp->rqcount++; 453 } 454 455 /* 456 * When we don't have the parity stripe we just write out the 457 * data. 458 */ 459 } else if (wp->type == NOPARITY) { 460 rbp = gv_new_raid5_bit(); 461 rbp->consumer = wp->original; 462 rbp->bio = g_new_bio(); 463 if (rbp->bio == NULL) 464 return (ENOMEM); 465 rbp->bio->bio_cmd = BIO_WRITE; 466 rbp->bio->bio_offset = wp->offset; 467 rbp->bio->bio_length = wp->length; 468 rbp->bio->bio_data = addr; 469 rbp->bio->bio_done = gv_raid5_done; 470 rbp->bio->bio_caller1 = wp; 471 rbp->bio->bio_caller2 = rbp; 472 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 473 wp->active++; 474 wp->rqcount++; 475 476 /* 477 * A combined write means that our data subdisk and the parity 478 * subdisks are both up, but another subdisk isn't. We need to 479 * read all valid stripes including the parity to recalculate 480 * the data of the stripe that is missing. Then we write our 481 * original data, and together with the other data stripes 482 * recalculate the parity again. 483 */ 484 } else if (wp->type == COMBINED) { 485 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 486 if (wp->buf == NULL) 487 return (ENOMEM); 488 wp->bufmalloc = 1; 489 490 /* Get the data from all subdisks. */ 491 LIST_FOREACH(s, &p->subdisks, in_plex) { 492 /* Skip the broken subdisk. 
*/ 493 if (s == broken) 494 continue; 495 496 rbp = gv_new_raid5_bit(); 497 rbp->consumer = s->consumer; 498 rbp->bio = g_new_bio(); 499 if (rbp->bio == NULL) 500 return (ENOMEM); 501 rbp->bio->bio_cmd = BIO_READ; 502 rbp->buf = g_malloc(wp->length, 503 M_NOWAIT | M_ZERO); 504 if (rbp->buf == NULL) 505 return (ENOMEM); 506 rbp->malloc = 1; 507 rbp->bio->bio_data = rbp->buf; 508 rbp->bio->bio_offset = wp->offset; 509 rbp->bio->bio_length = wp->length; 510 rbp->bio->bio_done = gv_raid5_done; 511 rbp->bio->bio_caller1 = wp; 512 rbp->bio->bio_caller2 = rbp; 513 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 514 wp->active++; 515 wp->rqcount++; 516 } 517 518 /* Write the original data. */ 519 rbp = gv_new_raid5_bit(); 520 rbp->consumer = wp->original; 521 rbp->buf = addr; 522 rbp->bio = g_new_bio(); 523 if (rbp->bio == NULL) 524 return (ENOMEM); 525 rbp->bio->bio_cmd = BIO_WRITE; 526 rbp->bio->bio_data = rbp->buf; 527 rbp->bio->bio_offset = wp->offset; 528 rbp->bio->bio_length = wp->length; 529 rbp->bio->bio_done = gv_raid5_done; 530 rbp->bio->bio_caller1 = wp; 531 rbp->bio->bio_caller2 = rbp; 532 /* 533 * Insert at the tail, because we want to read the old 534 * data first. 535 */ 536 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 537 wp->active++; 538 wp->rqcount++; 539 540 /* Get the rest of the data again. */ 541 LIST_FOREACH(s, &p->subdisks, in_plex) { 542 /* 543 * Skip the broken subdisk, the parity, and the 544 * one we just wrote. 
545 */ 546 if ((s == broken) || 547 (s->consumer == wp->parity) || 548 (s->consumer == wp->original)) 549 continue; 550 rbp = gv_new_raid5_bit(); 551 rbp->consumer = s->consumer; 552 rbp->bio = g_new_bio(); 553 if (rbp->bio == NULL) 554 return (ENOMEM); 555 rbp->bio->bio_cmd = BIO_READ; 556 rbp->buf = g_malloc(wp->length, 557 M_NOWAIT | M_ZERO); 558 if (rbp->buf == NULL) 559 return (ENOMEM); 560 rbp->malloc = 1; 561 rbp->bio->bio_data = rbp->buf; 562 rbp->bio->bio_offset = wp->offset; 563 rbp->bio->bio_length = wp->length; 564 rbp->bio->bio_done = gv_raid5_done; 565 rbp->bio->bio_caller1 = wp; 566 rbp->bio->bio_caller2 = rbp; 567 /* 568 * Again, insert at the tail to keep correct 569 * order. 570 */ 571 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 572 wp->active++; 573 wp->rqcount++; 574 } 575 576 577 /* 578 * A normal write request goes to the original subdisk, then we 579 * read in all other stripes, recalculate the parity and write 580 * out the parity again. 581 */ 582 } else { 583 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 584 if (wp->buf == NULL) 585 return (ENOMEM); 586 wp->bufmalloc = 1; 587 LIST_FOREACH(s, &p->subdisks, in_plex) { 588 /* Skip the parity stripe. */ 589 if (s->consumer == wp->parity) 590 continue; 591 592 rbp = gv_new_raid5_bit(); 593 rbp->consumer = s->consumer; 594 rbp->bio = g_new_bio(); 595 if (rbp->bio == NULL) 596 return (ENOMEM); 597 /* 598 * The data for the original stripe is written, 599 * the others need to be read in for the parity 600 * calculation. 
601 */ 602 if (s->consumer == wp->original) { 603 rbp->bio->bio_cmd = BIO_WRITE; 604 rbp->buf = addr; 605 } else { 606 rbp->bio->bio_cmd = BIO_READ; 607 rbp->buf = g_malloc(wp->length, 608 M_NOWAIT | M_ZERO); 609 if (rbp->buf == NULL) 610 return (ENOMEM); 611 rbp->malloc = 1; 612 } 613 rbp->bio->bio_data = rbp->buf; 614 rbp->bio->bio_offset = wp->offset; 615 rbp->bio->bio_length = wp->length; 616 rbp->bio->bio_done = gv_raid5_done; 617 rbp->bio->bio_caller1 = wp; 618 rbp->bio->bio_caller2 = rbp; 619 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 620 wp->active++; 621 wp->rqcount++; 622 } 623 } 624 break; 625 default: 626 return (EINVAL); 627 } 628 629 wp->state = VALID; 630 return (0); 631 } 632