/*-
 * Copyright (c) 2004 Lukas Ertl
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/conf.h> 33 #include <sys/errno.h> 34 #include <sys/kernel.h> 35 #include <sys/kthread.h> 36 #include <sys/libkern.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mutex.h> 40 #include <sys/systm.h> 41 42 #include <geom/geom.h> 43 #include <geom/vinum/geom_vinum_var.h> 44 #include <geom/vinum/geom_vinum_raid5.h> 45 #include <geom/vinum/geom_vinum.h> 46 47 int gv_raid5_parity(struct gv_raid5_packet *); 48 int gv_stripe_active(struct gv_raid5_packet *, struct gv_plex *); 49 50 struct gv_raid5_bit * 51 gv_new_raid5_bit(void) 52 { 53 struct gv_raid5_bit *r; 54 r = g_malloc(sizeof(*r), M_NOWAIT | M_ZERO); 55 KASSERT(r != NULL, ("gv_new_raid5_bit: NULL r")); 56 return (r); 57 } 58 59 struct gv_raid5_packet * 60 gv_new_raid5_packet(void) 61 { 62 struct gv_raid5_packet *wp; 63 64 wp = g_malloc(sizeof(*wp), M_NOWAIT | M_ZERO); 65 KASSERT(wp != NULL, ("gv_new_raid5_packet: NULL wp")); 66 wp->state = SETUP; 67 wp->type = JUNK; 68 TAILQ_INIT(&wp->bits); 69 70 return (wp); 71 } 72 73 /* 74 * Check if the stripe that the work packet wants is already being used by 75 * some other work packet. 76 */ 77 int 78 gv_stripe_active(struct gv_raid5_packet *wp, struct gv_plex *sc) 79 { 80 struct gv_raid5_packet *wpa; 81 82 TAILQ_FOREACH(wpa, &sc->worklist, list) { 83 if (wpa->lockbase == wp->lockbase) { 84 if (wpa->bio == wp->bio) 85 return (0); 86 return (1); 87 } 88 } 89 return (0); 90 } 91 92 /* 93 * The "worker" thread that runs through the worklist and fires off the 94 * "subrequests" needed to fulfill a RAID5 read or write request. 
95 */ 96 void 97 gv_raid5_worker(void *arg) 98 { 99 struct bio *bp; 100 struct g_geom *gp; 101 struct gv_plex *p; 102 struct gv_raid5_packet *wp, *wpt; 103 struct gv_raid5_bit *rbp, *rbpt; 104 int error, restart; 105 106 gp = arg; 107 p = gp->softc; 108 109 mtx_lock(&p->worklist_mtx); 110 for (;;) { 111 restart = 0; 112 g_trace(G_T_TOPOLOGY, "gv_raid5_worker scan"); 113 TAILQ_FOREACH_SAFE(wp, &p->worklist, list, wpt) { 114 /* This request packet is already being processed. */ 115 if (wp->state == IO) 116 continue; 117 /* This request packet is ready for processing. */ 118 if (wp->state == VALID) { 119 /* Couldn't get the lock, try again. */ 120 if ((wp->lockbase != -1) && 121 gv_stripe_active(wp, p)) 122 continue; 123 124 wp->state = IO; 125 mtx_unlock(&p->worklist_mtx); 126 TAILQ_FOREACH_SAFE(rbp, &wp->bits, list, rbpt) 127 g_io_request(rbp->bio, rbp->consumer); 128 mtx_lock(&p->worklist_mtx); 129 continue; 130 } 131 if (wp->state == FINISH) { 132 bp = wp->bio; 133 bp->bio_completed += wp->length; 134 /* 135 * Deliver the original request if we have 136 * finished. 137 */ 138 if (bp->bio_completed == bp->bio_length) { 139 mtx_unlock(&p->worklist_mtx); 140 g_io_deliver(bp, 0); 141 mtx_lock(&p->worklist_mtx); 142 } 143 TAILQ_REMOVE(&p->worklist, wp, list); 144 if (wp->bufmalloc == 1) 145 g_free(wp->buf); 146 g_free(wp); 147 restart++; 148 /*break;*/ 149 } 150 } 151 if (!restart) { 152 /* Self-destruct. */ 153 if (p->flags & GV_PLEX_THREAD_DIE) 154 break; 155 g_trace(G_T_TOPOLOGY, "gv_raid5_worker sleep"); 156 error = msleep(p, &p->worklist_mtx, PRIBIO, "-", 157 hz/100); 158 } 159 } 160 mtx_unlock(&p->worklist_mtx); 161 162 g_trace(G_T_TOPOLOGY, "gv_raid5_worker die"); 163 164 /* Signal our plex that we are dead. */ 165 p->flags |= GV_PLEX_THREAD_DEAD; 166 wakeup(p); 167 kthread_exit(0); 168 } 169 170 /* Final bio transaction to write out the parity data. 
*/ 171 int 172 gv_raid5_parity(struct gv_raid5_packet *wp) 173 { 174 struct bio *bp; 175 176 bp = g_new_bio(); 177 if (bp == NULL) 178 return (ENOMEM); 179 180 wp->type = ISPARITY; 181 bp->bio_cmd = BIO_WRITE; 182 bp->bio_data = wp->buf; 183 bp->bio_offset = wp->offset; 184 bp->bio_length = wp->length; 185 bp->bio_done = gv_raid5_done; 186 bp->bio_caller1 = wp; 187 bp->bio_caller2 = NULL; 188 g_io_request(bp, wp->parity); 189 190 return (0); 191 } 192 193 /* We end up here after each subrequest. */ 194 void 195 gv_raid5_done(struct bio *bp) 196 { 197 struct bio *obp; 198 struct g_geom *gp; 199 struct gv_plex *p; 200 struct gv_raid5_packet *wp; 201 struct gv_raid5_bit *rbp; 202 off_t i; 203 int error; 204 205 wp = bp->bio_caller1; 206 rbp = bp->bio_caller2; 207 obp = wp->bio; 208 gp = bp->bio_from->geom; 209 p = gp->softc; 210 211 /* One less active subrequest. */ 212 wp->active--; 213 214 switch (obp->bio_cmd) { 215 case BIO_READ: 216 /* Degraded reads need to handle parity data. */ 217 if (wp->type == DEGRADED) { 218 for (i = 0; i < wp->length; i++) 219 wp->buf[i] ^= bp->bio_data[i]; 220 221 /* When we're finished copy back the data we want. */ 222 if (wp->active == 0) 223 bcopy(wp->buf, wp->data, wp->length); 224 } 225 226 break; 227 228 case BIO_WRITE: 229 /* Handle the parity data, if needed. */ 230 if ((wp->type != NOPARITY) && (wp->type != ISPARITY)) { 231 for (i = 0; i < wp->length; i++) 232 wp->buf[i] ^= bp->bio_data[i]; 233 234 /* Write out the parity data we calculated. */ 235 if (wp->active == 0) { 236 wp->active++; 237 error = gv_raid5_parity(wp); 238 } 239 } 240 break; 241 } 242 243 g_destroy_bio(bp); 244 245 if (rbp != NULL) { 246 if (rbp->malloc == 1) 247 g_free(rbp->buf); 248 TAILQ_REMOVE(&wp->bits, rbp, list); 249 g_free(rbp); 250 } 251 252 /* This request group is done. */ 253 if (wp->active == 0) 254 wp->state = FINISH; 255 } 256 257 /* Build a request group to perform (part of) a RAID5 request. 
*/ 258 int 259 gv_build_raid5_req(struct gv_raid5_packet *wp, struct bio *bp, caddr_t addr, 260 long bcount, off_t boff) 261 { 262 struct g_geom *gp; 263 struct gv_plex *p; 264 struct gv_raid5_bit *rbp; 265 struct gv_sd *broken, *original, *parity, *s; 266 int i, psdno, sdno; 267 off_t len_left, real_off, stripeend, stripeoff, stripestart; 268 269 gp = bp->bio_to->geom; 270 p = gp->softc; 271 272 if (p == NULL || LIST_EMPTY(&p->subdisks)) 273 return (ENXIO); 274 275 /* We are optimistic and assume that this request will be OK. */ 276 wp->type = NORMAL; 277 original = parity = broken = NULL; 278 279 /* The number of the subdisk containing the parity stripe. */ 280 psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 281 p->sdcount; 282 KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0")); 283 284 /* Offset of the start address from the start of the stripe. */ 285 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 286 KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0")); 287 288 /* The number of the subdisk where the stripe resides. */ 289 sdno = stripeoff / p->stripesize; 290 KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0")); 291 292 /* At or past parity subdisk. */ 293 if (sdno >= psdno) 294 sdno++; 295 296 /* The offset of the stripe on this subdisk. */ 297 stripestart = (boff - stripeoff) / (p->sdcount - 1); 298 KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0")); 299 300 stripeoff %= p->stripesize; 301 302 /* The offset of the request on this subdisk. */ 303 real_off = stripestart + stripeoff; 304 305 stripeend = stripestart + p->stripesize; 306 len_left = stripeend - real_off; 307 KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0")); 308 309 /* Find the right subdisks. 
*/ 310 i = 0; 311 LIST_FOREACH(s, &p->subdisks, in_plex) { 312 if (i == sdno) 313 original = s; 314 if (i == psdno) 315 parity = s; 316 if (s->state != GV_SD_UP) 317 broken = s; 318 i++; 319 } 320 321 if ((original == NULL) || (parity == NULL)) 322 return (ENXIO); 323 324 /* Our data stripe is missing. */ 325 if (original->state != GV_SD_UP) 326 wp->type = DEGRADED; 327 /* Our parity stripe is missing. */ 328 if (parity->state != GV_SD_UP) { 329 /* We cannot take another failure if we're already degraded. */ 330 if (wp->type != NORMAL) 331 return (ENXIO); 332 else 333 wp->type = NOPARITY; 334 } 335 336 /* 337 * A combined write is necessary when the original data subdisk and the 338 * parity subdisk are both up, but one of the other subdisks isn't. 339 */ 340 if ((broken != NULL) && (broken != parity) && (broken != original)) 341 wp->type = COMBINED; 342 343 wp->offset = real_off; 344 wp->length = (bcount <= len_left) ? bcount : len_left; 345 wp->data = addr; 346 wp->original = original->consumer; 347 wp->parity = parity->consumer; 348 wp->lockbase = stripestart; 349 350 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 351 352 switch (bp->bio_cmd) { 353 case BIO_READ: 354 /* 355 * For a degraded read we need to read in all stripes except 356 * the broken one plus the parity stripe and then recalculate 357 * the desired data. 358 */ 359 if (wp->type == DEGRADED) { 360 wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 361 wp->bufmalloc = 1; 362 LIST_FOREACH(s, &p->subdisks, in_plex) { 363 /* Skip the broken subdisk. 
*/ 364 if (s == broken) 365 continue; 366 rbp = gv_new_raid5_bit(); 367 rbp->consumer = s->consumer; 368 rbp->bio = g_new_bio(); 369 if (rbp->bio == NULL) 370 return (ENOMEM); 371 rbp->buf = g_malloc(wp->length, 372 M_WAITOK | M_ZERO); 373 rbp->malloc = 1; 374 rbp->bio->bio_cmd = BIO_READ; 375 rbp->bio->bio_offset = wp->offset; 376 rbp->bio->bio_length = wp->length; 377 rbp->bio->bio_data = rbp->buf; 378 rbp->bio->bio_done = gv_raid5_done; 379 rbp->bio->bio_caller1 = wp; 380 rbp->bio->bio_caller2 = rbp; 381 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 382 wp->active++; 383 wp->rqcount++; 384 } 385 386 /* A normal read can be fulfilled with the original subdisk. */ 387 } else { 388 rbp = gv_new_raid5_bit(); 389 rbp->consumer = wp->original; 390 rbp->bio = g_new_bio(); 391 if (rbp->bio == NULL) 392 return (ENOMEM); 393 rbp->bio->bio_cmd = BIO_READ; 394 rbp->bio->bio_offset = wp->offset; 395 rbp->bio->bio_length = wp->length; 396 rbp->buf = addr; 397 rbp->bio->bio_data = rbp->buf; 398 rbp->bio->bio_done = gv_raid5_done; 399 rbp->bio->bio_caller1 = wp; 400 rbp->bio->bio_caller2 = rbp; 401 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 402 wp->active++; 403 wp->rqcount++; 404 } 405 if (wp->type != COMBINED) 406 wp->lockbase = -1; 407 break; 408 409 case BIO_WRITE: 410 /* 411 * A degraded write means we cannot write to the original data 412 * subdisk. Thus we need to read in all valid stripes, 413 * recalculate the parity from the original data, and then 414 * write the parity stripe back out. 415 */ 416 if (wp->type == DEGRADED) { 417 wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 418 wp->bufmalloc = 1; 419 420 /* Copy the original data. */ 421 bcopy(wp->data, wp->buf, wp->length); 422 423 LIST_FOREACH(s, &p->subdisks, in_plex) { 424 /* Skip the broken and the parity subdisk. 
*/ 425 if ((s == broken) || 426 (s->consumer == wp->parity)) 427 continue; 428 429 rbp = gv_new_raid5_bit(); 430 rbp->consumer = s->consumer; 431 rbp->bio = g_new_bio(); 432 if (rbp->bio == NULL) 433 return (ENOMEM); 434 rbp->buf = g_malloc(wp->length, 435 M_WAITOK | M_ZERO); 436 rbp->malloc = 1; 437 rbp->bio->bio_cmd = BIO_READ; 438 rbp->bio->bio_data = rbp->buf; 439 rbp->bio->bio_offset = wp->offset; 440 rbp->bio->bio_length = wp->length; 441 rbp->bio->bio_done = gv_raid5_done; 442 rbp->bio->bio_caller1 = wp; 443 rbp->bio->bio_caller2 = rbp; 444 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 445 wp->active++; 446 wp->rqcount++; 447 } 448 449 /* 450 * When we don't have the parity stripe we just write out the 451 * data. 452 */ 453 } else if (wp->type == NOPARITY) { 454 rbp = gv_new_raid5_bit(); 455 rbp->consumer = wp->original; 456 rbp->bio = g_new_bio(); 457 if (rbp->bio == NULL) 458 return (ENOMEM); 459 rbp->bio->bio_cmd = BIO_WRITE; 460 rbp->bio->bio_offset = wp->offset; 461 rbp->bio->bio_length = wp->length; 462 rbp->bio->bio_data = addr; 463 rbp->bio->bio_done = gv_raid5_done; 464 rbp->bio->bio_caller1 = wp; 465 rbp->bio->bio_caller2 = rbp; 466 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 467 wp->active++; 468 wp->rqcount++; 469 470 /* 471 * A combined write means that our data subdisk and the parity 472 * subdisks are both up, but another subdisk isn't. We need to 473 * read all valid stripes including the parity to recalculate 474 * the data of the stripe that is missing. Then we write our 475 * original data, and together with the other data stripes 476 * recalculate the parity again. 477 */ 478 } else if (wp->type == COMBINED) { 479 wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 480 wp->bufmalloc = 1; 481 482 /* Get the data from all subdisks. */ 483 LIST_FOREACH(s, &p->subdisks, in_plex) { 484 /* Skip the broken subdisk. 
*/ 485 if (s == broken) 486 continue; 487 488 rbp = gv_new_raid5_bit(); 489 rbp->consumer = s->consumer; 490 rbp->bio = g_new_bio(); 491 if (rbp->bio == NULL) 492 return (ENOMEM); 493 rbp->bio->bio_cmd = BIO_READ; 494 rbp->buf = g_malloc(wp->length, 495 M_WAITOK | M_ZERO); 496 rbp->malloc = 1; 497 rbp->bio->bio_data = rbp->buf; 498 rbp->bio->bio_offset = wp->offset; 499 rbp->bio->bio_length = wp->length; 500 rbp->bio->bio_done = gv_raid5_done; 501 rbp->bio->bio_caller1 = wp; 502 rbp->bio->bio_caller2 = rbp; 503 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 504 wp->active++; 505 wp->rqcount++; 506 } 507 508 /* Write the original data. */ 509 rbp = gv_new_raid5_bit(); 510 rbp->consumer = wp->original; 511 rbp->buf = addr; 512 rbp->bio = g_new_bio(); 513 if (rbp->bio == NULL) 514 return (ENOMEM); 515 rbp->bio->bio_cmd = BIO_WRITE; 516 rbp->bio->bio_data = rbp->buf; 517 rbp->bio->bio_offset = wp->offset; 518 rbp->bio->bio_length = wp->length; 519 rbp->bio->bio_done = gv_raid5_done; 520 rbp->bio->bio_caller1 = wp; 521 rbp->bio->bio_caller2 = rbp; 522 /* 523 * Insert at the tail, because we want to read the old 524 * data first. 525 */ 526 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 527 wp->active++; 528 wp->rqcount++; 529 530 /* Get the rest of the data again. */ 531 LIST_FOREACH(s, &p->subdisks, in_plex) { 532 /* 533 * Skip the broken subdisk, the parity, and the 534 * one we just wrote. 
535 */ 536 if ((s == broken) || 537 (s->consumer == wp->parity) || 538 (s->consumer == wp->original)) 539 continue; 540 rbp = gv_new_raid5_bit(); 541 rbp->consumer = s->consumer; 542 rbp->bio = g_new_bio(); 543 if (rbp->bio == NULL) 544 return (ENOMEM); 545 rbp->bio->bio_cmd = BIO_READ; 546 rbp->buf = g_malloc(wp->length, 547 M_WAITOK | M_ZERO); 548 rbp->malloc = 1; 549 rbp->bio->bio_data = rbp->buf; 550 rbp->bio->bio_offset = wp->offset; 551 rbp->bio->bio_length = wp->length; 552 rbp->bio->bio_done = gv_raid5_done; 553 rbp->bio->bio_caller1 = wp; 554 rbp->bio->bio_caller2 = rbp; 555 /* 556 * Again, insert at the tail to keep correct 557 * order. 558 */ 559 TAILQ_INSERT_TAIL(&wp->bits, rbp, list); 560 wp->active++; 561 wp->rqcount++; 562 } 563 564 565 /* 566 * A normal write request goes to the original subdisk, then we 567 * read in all other stripes, recalculate the parity and write 568 * out the parity again. 569 */ 570 } else { 571 wp->buf = g_malloc(wp->length, M_WAITOK | M_ZERO); 572 wp->bufmalloc = 1; 573 LIST_FOREACH(s, &p->subdisks, in_plex) { 574 /* Skip the parity stripe. */ 575 if (s->consumer == wp->parity) 576 continue; 577 578 rbp = gv_new_raid5_bit(); 579 rbp->consumer = s->consumer; 580 rbp->bio = g_new_bio(); 581 if (rbp->bio == NULL) 582 return (ENOMEM); 583 /* 584 * The data for the original stripe is written, 585 * the others need to be read in for the parity 586 * calculation. 
587 */ 588 if (s->consumer == wp->original) { 589 rbp->bio->bio_cmd = BIO_WRITE; 590 rbp->buf = addr; 591 } else { 592 rbp->bio->bio_cmd = BIO_READ; 593 rbp->buf = g_malloc(wp->length, 594 M_WAITOK | M_ZERO); 595 rbp->malloc = 1; 596 } 597 rbp->bio->bio_data = rbp->buf; 598 rbp->bio->bio_offset = wp->offset; 599 rbp->bio->bio_length = wp->length; 600 rbp->bio->bio_done = gv_raid5_done; 601 rbp->bio->bio_caller1 = wp; 602 rbp->bio->bio_caller2 = rbp; 603 TAILQ_INSERT_HEAD(&wp->bits, rbp, list); 604 wp->active++; 605 wp->rqcount++; 606 } 607 } 608 break; 609 default: 610 return (EINVAL); 611 } 612 613 wp->state = VALID; 614 return (0); 615 } 616