/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

/* Forward declarations for functions defined later in this file. */
int	gv_raid5_parity(struct gv_raid5_packet *);
int	gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *);

/*
 * Allocate a zeroed raid5 "bit" (one subrequest of a raid5 work packet).
 *
 * NOTE(review): g_malloc() with M_NOWAIT can return NULL, and the KASSERT
 * only fires on INVARIANTS kernels, so a production build may hand a NULL
 * pointer back to the caller — verify callers tolerate this.
 */
struct gv_raid5_bit *
gv_new_raid5_bit(void)
{
	struct gv_raid5_bit *r;
	r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO);
	KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r"));
	return (r);
}

/*
 * Allocate a zeroed raid5 work packet, initialized to the SETUP state
 * with an empty subrequest (bits) queue.
 *
 * NOTE(review): same M_NOWAIT/KASSERT caveat as gv_new_raid5_bit().
 */
struct gv_raid5_packet *
gv_new_raid5_packet(void)
{
	struct gv_raid5_packet *wp;

	wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO);
	KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp"));
	wp->state = SETUP;
	wp->type = JUNK;
	TAILQ_INIT(&wp->bits);

	return (wp);
}

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.
 *
 * Returns 1 when a different packet (serving another original bio) holds
 * the same lockbase; returns 0 when the stripe is free or the colliding
 * packet belongs to the same bio.
 *
 * NOTE(review): walks sc->worklist without taking a lock itself, so it is
 * presumably called with the plex worklist mutex held — confirm at the
 * call sites.
 */
int
gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc)
{
	struct gv_raid5_packet *wpa;

	TAILQ_FOREACH(wpa, &sc->worklist, list) {
		if (wpa->lockbase == wp->lockbase) {
			/* The same bio may own several packets on a stripe. */
			if (wpa->bio == wp->bio)
				return (0);
			return (1);
		}
	}
	return (0);
}

/*
 * The "worker" thread that runs through the worklist and fires off the
 * "subrequests" needed to fulfill a RAID5 read or write request.
 */
void
gv_raid5_worker(void *arg)
{
	struct bio *bp;
	struct g_geom *gp;
	struct gv_plex *p;
	struct gv_raid5_packet *wp, *wpt;
	struct gv_raid5_bit *rbp, *rbpt;
	int error, restart;

	gp = arg;		/* The plex geom this worker serves. */
	p = gp->softc;

	mtx_lock(&p->worklist_mtx);
	for (;;) {
		restart = 0;
		g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan");
		TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) {
			/* This request packet is already being processed. */
			if (wp->state == IO)
				continue;
			/* This request packet is ready for processing. */
			if (wp->state == VALID) {
				/* Couldn't get the lock, try again. */
				if ((wp->lockbase != -1) &&
				    gv_stripe_active(wp, p))
					continue;

				/*
				 * Fire off all subrequests of this packet.
				 * The worklist mutex is dropped around
				 * g_io_request().
				 *
				 * NOTE(review): dropping the mutex inside
				 * TAILQ_FOREACH_SAFE means the saved next
				 * pointer (wpt) could go stale if another
				 * context modifies the worklist meanwhile —
				 * verify that cannot happen here.
				 */
				wp->state = IO;
				mtx_unlock(&p->worklist_mtx);
				TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt)
					g_io_request(rbp->bio, rbp->consumer);
				mtx_lock(&p->worklist_mtx);
				continue;
			}
			if (wp->state == FINISH) {
				bp = wp->bio;
				bp->bio_completed += wp->length;
				/*
				 * Deliver the original request if we have
				 * finished.
				 */
				if (bp->bio_completed == bp->bio_length) {
					mtx_unlock(&p->worklist_mtx);
					g_io_deliver(bp, 0);
					mtx_lock(&p->worklist_mtx);
				}
				/* Tear down the finished work packet. */
				TAILQ_REMOVE(&p->worklist, wp, list);
				if (wp->bufmalloc == 1)
					g_free(wp->buf);
				g_free(wp);
				restart++;
				/*break;*/
			}
		}
		if (!restart) {
			/* Self-destruct. */
			if (p->flags & GV_PLEX_THREAD_DIE)
				break;
			g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep");
			/*
			 * Poll again after at most hz/100 ticks, or sooner
			 * when wakeup(p) is called.  The msleep() return
			 * value (e.g. EWOULDBLOCK on timeout) is not
			 * checked.
			 */
			error = msleep(p, &p->worklist_mtx, PRIBIO, "-",
			    hz/100);
		}
	}
	mtx_unlock(&p->worklist_mtx);

	g_trace(G_T_TOPOLOGY, "gv_raid5_worker die");

	/* Signal our plex that we are dead. */
	p->flags |= GV_PLEX_THREAD_DEAD;
	wakeup(p);
	kthread_exit(0);
}

/* Final bio transaction to write out the parity data.
*/ 171 int 172 gv_raid5_parity(struct gv_raid5_packet *wp) 173 { 174 struct bio *bp; 175 176 bp = g_new_bio(); 177 if (bp == NULL) 178 return (ENOMEM); 179 180 wp->type = ISPARITY; 181 bp->bio_cmd = BIO_WRITE; 182 bp->bio_data = wp->buf; 183 bp->bio_offset = wp->offset; 184 bp->bio_length = wp->length; 185 bp->bio_done = gv_raid5_done; 186 bp->bio_caller1 = wp; 187 bp->bio_caller2 = NULL; 188 g_io_request(bp, wp->parity); 189 190 return (0); 191 } 192 193 /* We end up here after each subrequest. */ 194 void 195 gv_raid5_done(struct bio *bp) 196 { 197 struct bio *obp; 198 struct g_geom *gp; 199 struct gv_plex *p; 200 struct gv_raid5_packet *wp; 201 struct gv_raid5_bit *rbp; 202 off_t i; 203 int error; 204 205 wp = bp->bio_caller1; 206 rbp = bp->bio_caller2; 207 obp = wp->bio; 208 gp = bp->bio_from->geom; 209 p = gp->softc; 210 211 /* One less active subrequest. */ 212 wp->active--; 213 214 switch (obp->bio_cmd) { 215 case BIO_READ: 216 /* Degraded reads need to handle parity data. */ 217 if (wp->type == DEGRADED) { 218 for (i = 0; i < wp->length; i++) 219 wp->buf[i] ^= bp->bio_data[i]; 220 221 /* When we're finished copy back the data we want. */ 222 if (wp->active == 0) 223 bcopy(wp->buf, wp->data, wp->length); 224 } 225 226 break; 227 228 case BIO_WRITE: 229 /* Handle the parity data, if needed. */ 230 if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 231 for (i = 0; i < wp->length; i++) 232 wp->buf[i] ^= bp->bio_data[i]; 233 234 /* Write out the parity data we calculated. */ 235 if (wp->active == 0) { 236 wp->active++; 237 error = gv_raid5_parity(wp); 238 } 239 } 240 break; 241 } 242 243 g_destroy_bio(bp); 244 245 if (rbp != NULL) { 246 if (rbp->malloc == 1) 247 g_free(rbp->buf); 248 TAILQ_REMOVE(&wp->bits, rbp, list); 249 g_free(rbp); 250 } 251 252 /* This request group is done. */ 253 if (wp->active == 0) 254 wp->state = FINISH; 255 } 256 257 /* Build a request group to perform (part of) a RAID5 request. 
*/ 258 int 259 gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 260 long bcount, off_t boff) 261 { 262 struct g_geom *gp; 263 struct gv_plex *p; 264 struct gv_raid5_bit *rbp; 265 struct gv_sd *broken, *original, *parity, *s; 266 int i, psdno, sdno; 267 off_t len_left, real_off, stripeend, stripeoff, stripestart; 268 269 gp = bp->bio_to->geom; 270 p = gp->softc; 271 272 if (p == NULL || LIST_EMPTY(&p->subdisks)) 273 return (ENXIO); 274 275 /* We are optimistic and assume that this request will be OK. */ 276 wp->type = NORMAL; 277 original = parity = broken = NULL; 278 279 /* The number of the subdisk containing the parity stripe. */ 280 psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 281 p->sdcount; 282 KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 283 284 /* Offset of the start address from the start of the stripe. */ 285 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 286 KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 287 288 /* The number of the subdisk where the stripe resides. */ 289 sdno = stripeoff / p->stripesize; 290 KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 291 292 /* At or past parity subdisk. */ 293 if (sdno >= psdno) 294 sdno++; 295 296 /* The offset of the stripe on this subdisk. */ 297 stripestart = (boff - stripeoff) / (p->sdcount - 1); 298 KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 299 300 stripeoff %= p->stripesize; 301 302 /* The offset of the request on this subdisk. */ 303 real_off = stripestart + stripeoff; 304 305 stripeend = stripestart + p->stripesize; 306 len_left = stripeend - real_off; 307 KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 308 309 /* Find the right subdisks. 
*/ 310 i = 0; 311 LIST_FOREACH(s, &p->subdisks, in_plex) { 312 if (i == sdno) 313 original = s; 314 if (i == psdno) 315 parity = s; 316 if (s->state != GV_SD_UP) 317 broken = s; 318 i++; 319 } 320 321 if ((original == NULL) || (parity == NULL)) 322 return (ENXIO); 323 324 /* Our data stripe is missing. */ 325 if (original->state != GV_SD_UP) 326 wp->type = DEGRADED; 327 /* Our parity stripe is missing. */ 328 if (parity->state != GV_SD_UP) { 329 /* We cannot take another failure if we're already degraded. */ 330 if (wp->type != NORMAL) 331 return (ENXIO); 332 else 333 wp->type = NOPARITY; 334 } 335 336 /* 337 * A combined write is necessary when the original data subdisk and the 338 * parity subdisk are both up, but one of the other subdisks isn't. 339 */ 340 if ((broken != NULL) && (broken != parity) && (broken != original)) 341 wp->type = COMBINED; 342 343 wp->offset = real_off; 344 wp->length = (bcount <= len_left) ? bcount : len_left; 345 wp->data = addr; 346 wp->original = original->consumer; 347 wp->parity = parity->consumer; 348 wp->lockbase = stripestart; 349 350 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 351 352 switch (bp->bio_cmd) { 353 case BIO_READ: 354 /* 355 * For a degraded read we need to read in all stripes except 356 * the broken one plus the parity stripe and then recalculate 357 * the desired data. 358 */ 359 if (wp->type == DEGRADED) { 360 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 361 if (wp->buf == NULL) 362 return (ENOMEM); 363 wp->bufmalloc = 1; 364 LIST_FOREACH(s, &p->subdisks, in_plex) { 365 /* Skip the broken subdisk. 
*/ 366 if (s == broken) 367 continue; 368 rbp = gv_new_raid5_bit(); 369 rbp->consumer = s->consumer; 370 rbp->bio = g_new_bio(); 371 if (rbp->bio == NULL) 372 return (ENOMEM); 373 rbp->buf = g_malloc(wp->length, 374 M_NOWAIT | M_ZERO); 375 if (rbp->buf == NULL) 376 return (ENOMEM); 377 rbp->malloc = 1; 378 rbp->bio->bio_cmd = BIO_READ; 379 rbp->bio->bio_offset = wp->offset; 380 rbp->bio->bio_length = wp->length; 381 rbp->bio->bio_data = rbp->buf; 382 rbp->bio->bio_done = gv_raid5_done; 383 rbp->bio->bio_caller1 = wp; 384 rbp->bio->bio_caller2 = rbp; 385 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 386 wp->active++; 387 wp->rqcount++; 388 } 389 390 /* A normal read can be fulfilled with the original subdisk. */ 391 } else { 392 rbp = gv_new_raid5_bit(); 393 rbp->consumer = wp->original; 394 rbp->bio = g_new_bio(); 395 if (rbp->bio == NULL) 396 return (ENOMEM); 397 rbp->bio->bio_cmd = BIO_READ; 398 rbp->bio->bio_offset = wp->offset; 399 rbp->bio->bio_length = wp->length; 400 rbp->buf = addr; 401 rbp->bio->bio_data = rbp->buf; 402 rbp->bio->bio_done = gv_raid5_done; 403 rbp->bio->bio_caller1 = wp; 404 rbp->bio->bio_caller2 = rbp; 405 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 406 wp->active++; 407 wp->rqcount++; 408 } 409 if (wp->type != COMBINED) 410 wp->lockbase = -1; 411 break; 412 413 case BIO_WRITE: 414 /* 415 * A degraded write means we cannot write to the original data 416 * subdisk. Thus we need to read in all valid stripes, 417 * recalculate the parity from the original data, and then 418 * write the parity stripe back out. 419 */ 420 if (wp->type == DEGRADED) { 421 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 422 if (wp->buf == NULL) 423 return (ENOMEM); 424 wp->bufmalloc = 1; 425 426 /* Copy the original data. */ 427 bcopy(wp->data, wp->buf, wp->length); 428 429 LIST_FOREACH(s, &p->subdisks, in_plex) { 430 /* Skip the broken and the parity subdisk. 
*/ 431 if ((s == broken) || 432 (s->consumer == wp->parity)) 433 continue; 434 435 rbp = gv_new_raid5_bit(); 436 rbp->consumer = s->consumer; 437 rbp->bio = g_new_bio(); 438 if (rbp->bio == NULL) 439 return (ENOMEM); 440 rbp->buf = g_malloc(wp->length, 441 M_NOWAIT | M_ZERO); 442 if (rbp->buf == NULL) 443 return (ENOMEM); 444 rbp->malloc = 1; 445 rbp->bio->bio_cmd = BIO_READ; 446 rbp->bio->bio_data = rbp->buf; 447 rbp->bio->bio_offset = wp->offset; 448 rbp->bio->bio_length = wp->length; 449 rbp->bio->bio_done = gv_raid5_done; 450 rbp->bio->bio_caller1 = wp; 451 rbp->bio->bio_caller2 = rbp; 452 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 453 wp->active++; 454 wp->rqcount++; 455 } 456 457 /* 458 * When we don't have the parity stripe we just write out the 459 * data. 460 */ 461 } else if (wp->type == NOPARITY) { 462 rbp = gv_new_raid5_bit(); 463 rbp->consumer = wp->original; 464 rbp->bio = g_new_bio(); 465 if (rbp->bio == NULL) 466 return (ENOMEM); 467 rbp->bio->bio_cmd = BIO_WRITE; 468 rbp->bio->bio_offset = wp->offset; 469 rbp->bio->bio_length = wp->length; 470 rbp->bio->bio_data = addr; 471 rbp->bio->bio_done = gv_raid5_done; 472 rbp->bio->bio_caller1 = wp; 473 rbp->bio->bio_caller2 = rbp; 474 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 475 wp->active++; 476 wp->rqcount++; 477 478 /* 479 * A combined write means that our data subdisk and the parity 480 * subdisks are both up, but another subdisk isn't. We need to 481 * read all valid stripes including the parity to recalculate 482 * the data of the stripe that is missing. Then we write our 483 * original data, and together with the other data stripes 484 * recalculate the parity again. 485 */ 486 } else if (wp->type == COMBINED) { 487 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 488 if (wp->buf == NULL) 489 return (ENOMEM); 490 wp->bufmalloc = 1; 491 492 /* Get the data from all subdisks. */ 493 LIST_FOREACH(s, &p->subdisks, in_plex) { 494 /* Skip the broken subdisk. 
*/ 495 if (s == broken) 496 continue; 497 498 rbp = gv_new_raid5_bit(); 499 rbp->consumer = s->consumer; 500 rbp->bio = g_new_bio(); 501 if (rbp->bio == NULL) 502 return (ENOMEM); 503 rbp->bio->bio_cmd = BIO_READ; 504 rbp->buf = g_malloc(wp->length, 505 M_NOWAIT | M_ZERO); 506 if (rbp->buf == NULL) 507 return (ENOMEM); 508 rbp->malloc = 1; 509 rbp->bio->bio_data = rbp->buf; 510 rbp->bio->bio_offset = wp->offset; 511 rbp->bio->bio_length = wp->length; 512 rbp->bio->bio_done = gv_raid5_done; 513 rbp->bio->bio_caller1 = wp; 514 rbp->bio->bio_caller2 = rbp; 515 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 516 wp->active++; 517 wp->rqcount++; 518 } 519 520 /* Write the original data. */ 521 rbp = gv_new_raid5_bit(); 522 rbp->consumer = wp->original; 523 rbp->buf = addr; 524 rbp->bio = g_new_bio(); 525 if (rbp->bio == NULL) 526 return (ENOMEM); 527 rbp->bio->bio_cmd = BIO_WRITE; 528 rbp->bio->bio_data = rbp->buf; 529 rbp->bio->bio_offset = wp->offset; 530 rbp->bio->bio_length = wp->length; 531 rbp->bio->bio_done = gv_raid5_done; 532 rbp->bio->bio_caller1 = wp; 533 rbp->bio->bio_caller2 = rbp; 534 /* 535 * Insert at the tail, because we want to read the old 536 * data first. 537 */ 538 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 539 wp->active++; 540 wp->rqcount++; 541 542 /* Get the rest of the data again. */ 543 LIST_FOREACH(s, &p->subdisks, in_plex) { 544 /* 545 * Skip the broken subdisk, the parity, and the 546 * one we just wrote. 
547 */ 548 if ((s == broken) || 549 (s->consumer == wp->parity) || 550 (s->consumer == wp->original)) 551 continue; 552 rbp = gv_new_raid5_bit(); 553 rbp->consumer = s->consumer; 554 rbp->bio = g_new_bio(); 555 if (rbp->bio == NULL) 556 return (ENOMEM); 557 rbp->bio->bio_cmd = BIO_READ; 558 rbp->buf = g_malloc(wp->length, 559 M_NOWAIT | M_ZERO); 560 if (rbp->buf == NULL) 561 return (ENOMEM); 562 rbp->malloc = 1; 563 rbp->bio->bio_data = rbp->buf; 564 rbp->bio->bio_offset = wp->offset; 565 rbp->bio->bio_length = wp->length; 566 rbp->bio->bio_done = gv_raid5_done; 567 rbp->bio->bio_caller1 = wp; 568 rbp->bio->bio_caller2 = rbp; 569 /* 570 * Again, insert at the tail to keep correct 571 * order. 572 */ 573 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 574 wp->active++; 575 wp->rqcount++; 576 } 577 578 579 /* 580 * A normal write request goes to the original subdisk, then we 581 * read in all other stripes, recalculate the parity and write 582 * out the parity again. 583 */ 584 } else { 585 wp->buf = g_malloc(wp->length, M_NOWAIT | M_ZERO); 586 if (wp->buf == NULL) 587 return (ENOMEM); 588 wp->bufmalloc = 1; 589 LIST_FOREACH(s, &p->subdisks, in_plex) { 590 /* Skip the parity stripe. */ 591 if (s->consumer == wp->parity) 592 continue; 593 594 rbp = gv_new_raid5_bit(); 595 rbp->consumer = s->consumer; 596 rbp->bio = g_new_bio(); 597 if (rbp->bio == NULL) 598 return (ENOMEM); 599 /* 600 * The data for the original stripe is written, 601 * the others need to be read in for the parity 602 * calculation. 
603 */ 604 if (s->consumer == wp->original) { 605 rbp->bio->bio_cmd = BIO_WRITE; 606 rbp->buf = addr; 607 } else { 608 rbp->bio->bio_cmd = BIO_READ; 609 rbp->buf = g_malloc(wp->length, 610 M_NOWAIT | M_ZERO); 611 if (rbp->buf == NULL) 612 return (ENOMEM); 613 rbp->malloc = 1; 614 } 615 rbp->bio->bio_data = rbp->buf; 616 rbp->bio->bio_offset = wp->offset; 617 rbp->bio->bio_length = wp->length; 618 rbp->bio->bio_done = gv_raid5_done; 619 rbp->bio->bio_caller1 = wp; 620 rbp->bio->bio_caller2 = rbp; 621 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 622 wp->active++; 623 wp->rqcount++; 624 } 625 } 626 break; 627 default: 628 return (EINVAL); 629 } 630 631 wp->state = VALID; 632 return (0); 633 } 634