1 /*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/systm.h>

#include <geom/geom.h>
#include <geom/vinum/geom_vinum_var.h>
#include <geom/vinum/geom_vinum_raid5.h>
#include <geom/vinum/geom_vinum.h>

int	gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
	    int *, int *);

/*
 * Check if the stripe that the work packet wants is already being used by
 * some other work packet.  Returns non-zero when wp's locked range
 * [lockbase, lockbase + length] overlaps the range of a packet queued
 * before it on the plex's packet list.  A lockbase of -1 marks a packet
 * that holds no stripe lock and thus never conflicts.
 */
int
gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
	struct gv_raid5_packet *wp, *owp;
	int overlap;

	wp = bp->bio_driver1;
	if (wp->lockbase == -1)
		return (0);

	overlap = 0;
	TAILQ_FOREACH(owp, &p->packets, list) {
		/* Only packets queued ahead of wp can block it. */
		if (owp == wp)
			break;
		/* wp starts inside owp's locked range. */
		if ((wp->lockbase >= owp->lockbase) &&
		    (wp->lockbase <= owp->lockbase + owp->length)) {
			overlap++;
			break;
		}
		/* wp starts before owp's range but extends into it. */
		if ((wp->lockbase <= owp->lockbase) &&
		    (wp->lockbase + wp->length >= owp->lockbase)) {
			overlap++;
			break;
		}
	}

	return (overlap);
}

/*
 * Set up a work packet that rebuilds one stripe of a degraded subdisk:
 * read the matching chunk from every healthy subdisk and queue a write
 * of the recalculated data to the broken one.  The cloned read BIOs are
 * appended to wp->bits; the write BIO is stashed in wp->parity to be
 * issued after the reads complete.  Returns 0 on success or an errno.
 *
 * p      - the RAID5 plex being rebuilt
 * wp     - the work packet describing this stripe
 * bp     - the original BIO; all low-level BIOs are cloned from it
 * addr   - the data buffer of the original request
 * boff   - offset of this request relative to the start of the plex
 * bcount - byte count of this request
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t real_len, real_off;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Per-subdisk offset/length of this request; stripe numbers unused. */
	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);

	/*
	 * Find the subdisk to rebuild; if several subdisks are down, the
	 * last one on the list wins.
	 */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* No broken subdisk found; there is nothing to rebuild. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Shouldn't happen; broken is only set for non-UP subdisks. */
		return (EINVAL);

	case GV_SD_STALE:
		/* A stale subdisk is only revived by an explicit rebuild. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		/* Mark the buffer so completion knows to g_free() it. */
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/*
	 * Write the rebuilt data.  This BIO is not enqueued here; it is
	 * held in wp->parity until the reads above have finished.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	/* Remember how far the rebuild has progressed. */
	p->synced = boff;

	return (0);
}

/*
 * Build a request group to perform (part of) a RAID5 request.
*/ 179 int 180 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, 181 struct bio *bp, caddr_t addr, off_t boff, off_t bcount) 182 { 183 struct g_geom *gp; 184 struct gv_sd *broken, *original, *parity, *s; 185 struct gv_bioq *bq; 186 struct bio *cbp, *pbp; 187 int i, psdno, sdno, type; 188 off_t real_len, real_off; 189 190 gp = bp->bio_to->geom; 191 192 if (p == NULL || LIST_EMPTY(&p->subdisks)) 193 return (ENXIO); 194 195 /* We are optimistic and assume that this request will be OK. */ 196 #define REQ_TYPE_NORMAL 0 197 #define REQ_TYPE_DEGRADED 1 198 #define REQ_TYPE_NOPARITY 2 199 200 type = REQ_TYPE_NORMAL; 201 original = parity = broken = NULL; 202 203 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); 204 205 /* Find the right subdisks. */ 206 i = 0; 207 LIST_FOREACH(s, &p->subdisks, in_plex) { 208 if (i == sdno) 209 original = s; 210 if (i == psdno) 211 parity = s; 212 if (s->state != GV_SD_UP) 213 broken = s; 214 i++; 215 } 216 217 if ((original == NULL) || (parity == NULL)) 218 return (ENXIO); 219 220 /* Our data stripe is missing. */ 221 if (original->state != GV_SD_UP) 222 type = REQ_TYPE_DEGRADED; 223 /* Our parity stripe is missing. */ 224 if (parity->state != GV_SD_UP) { 225 /* We cannot take another failure if we're already degraded. */ 226 if (type != REQ_TYPE_NORMAL) 227 return (ENXIO); 228 else 229 type = REQ_TYPE_NOPARITY; 230 } 231 232 wp->length = real_len; 233 wp->data = addr; 234 wp->lockbase = real_off; 235 236 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 237 238 if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced)) 239 type = REQ_TYPE_NORMAL; 240 241 switch (bp->bio_cmd) { 242 case BIO_READ: 243 /* 244 * For a degraded read we need to read in all stripes except 245 * the broken one plus the parity stripe and then recalculate 246 * the desired data. 
247 */ 248 if (type == REQ_TYPE_DEGRADED) { 249 bzero(wp->data, wp->length); 250 LIST_FOREACH(s, &p->subdisks, in_plex) { 251 /* Skip the broken subdisk. */ 252 if (s == broken) 253 continue; 254 cbp = g_clone_bio(bp); 255 if (cbp == NULL) 256 return (ENOMEM); 257 cbp->bio_data = g_malloc(real_len, M_WAITOK); 258 cbp->bio_cflags |= GV_BIO_MALLOC; 259 cbp->bio_offset = real_off; 260 cbp->bio_length = real_len; 261 cbp->bio_done = gv_plex_done; 262 cbp->bio_caller2 = s->consumer; 263 cbp->bio_driver1 = wp; 264 265 GV_ENQUEUE(bp, cbp, pbp); 266 267 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 268 bq->bp = cbp; 269 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 270 } 271 272 /* A normal read can be fulfilled with the original subdisk. */ 273 } else { 274 cbp = g_clone_bio(bp); 275 if (cbp == NULL) 276 return (ENOMEM); 277 cbp->bio_offset = real_off; 278 cbp->bio_length = real_len; 279 cbp->bio_data = addr; 280 cbp->bio_done = g_std_done; 281 cbp->bio_caller2 = original->consumer; 282 283 GV_ENQUEUE(bp, cbp, pbp); 284 } 285 wp->lockbase = -1; 286 287 break; 288 289 case BIO_WRITE: 290 /* 291 * A degraded write means we cannot write to the original data 292 * subdisk. Thus we need to read in all valid stripes, 293 * recalculate the parity from the original data, and then 294 * write the parity stripe back out. 295 */ 296 if (type == REQ_TYPE_DEGRADED) { 297 /* Read all subdisks. */ 298 LIST_FOREACH(s, &p->subdisks, in_plex) { 299 /* Skip the broken and the parity subdisk. 
*/ 300 if ((s == broken) || (s == parity)) 301 continue; 302 303 cbp = g_clone_bio(bp); 304 if (cbp == NULL) 305 return (ENOMEM); 306 cbp->bio_cmd = BIO_READ; 307 cbp->bio_data = g_malloc(real_len, M_WAITOK); 308 cbp->bio_cflags |= GV_BIO_MALLOC; 309 cbp->bio_offset = real_off; 310 cbp->bio_length = real_len; 311 cbp->bio_done = gv_plex_done; 312 cbp->bio_caller2 = s->consumer; 313 cbp->bio_driver1 = wp; 314 315 GV_ENQUEUE(bp, cbp, pbp); 316 317 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 318 bq->bp = cbp; 319 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 320 } 321 322 /* Write the parity data. */ 323 cbp = g_clone_bio(bp); 324 if (cbp == NULL) 325 return (ENOMEM); 326 cbp->bio_data = g_malloc(real_len, M_WAITOK); 327 cbp->bio_cflags |= GV_BIO_MALLOC; 328 bcopy(addr, cbp->bio_data, real_len); 329 cbp->bio_offset = real_off; 330 cbp->bio_length = real_len; 331 cbp->bio_done = gv_plex_done; 332 cbp->bio_caller2 = parity->consumer; 333 cbp->bio_driver1 = wp; 334 wp->parity = cbp; 335 336 /* 337 * When the parity stripe is missing we just write out the data. 338 */ 339 } else if (type == REQ_TYPE_NOPARITY) { 340 cbp = g_clone_bio(bp); 341 if (cbp == NULL) 342 return (ENOMEM); 343 cbp->bio_offset = real_off; 344 cbp->bio_length = real_len; 345 cbp->bio_data = addr; 346 cbp->bio_done = gv_plex_done; 347 cbp->bio_caller2 = original->consumer; 348 cbp->bio_driver1 = wp; 349 350 GV_ENQUEUE(bp, cbp, pbp); 351 352 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 353 bq->bp = cbp; 354 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 355 356 /* 357 * A normal write request goes to the original subdisk, then we 358 * read in all other stripes, recalculate the parity and write 359 * out the parity again. 360 */ 361 } else { 362 /* Read old parity. 
*/ 363 cbp = g_clone_bio(bp); 364 if (cbp == NULL) 365 return (ENOMEM); 366 cbp->bio_cmd = BIO_READ; 367 cbp->bio_data = g_malloc(real_len, M_WAITOK); 368 cbp->bio_cflags |= GV_BIO_MALLOC; 369 cbp->bio_offset = real_off; 370 cbp->bio_length = real_len; 371 cbp->bio_done = gv_plex_done; 372 cbp->bio_caller2 = parity->consumer; 373 cbp->bio_driver1 = wp; 374 375 GV_ENQUEUE(bp, cbp, pbp); 376 377 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 378 bq->bp = cbp; 379 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 380 381 /* Read old data. */ 382 cbp = g_clone_bio(bp); 383 if (cbp == NULL) 384 return (ENOMEM); 385 cbp->bio_cmd = BIO_READ; 386 cbp->bio_data = g_malloc(real_len, M_WAITOK); 387 cbp->bio_cflags |= GV_BIO_MALLOC; 388 cbp->bio_offset = real_off; 389 cbp->bio_length = real_len; 390 cbp->bio_done = gv_plex_done; 391 cbp->bio_caller2 = original->consumer; 392 cbp->bio_driver1 = wp; 393 394 GV_ENQUEUE(bp, cbp, pbp); 395 396 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 397 bq->bp = cbp; 398 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 399 400 /* Write new data. */ 401 cbp = g_clone_bio(bp); 402 if (cbp == NULL) 403 return (ENOMEM); 404 cbp->bio_data = addr; 405 cbp->bio_offset = real_off; 406 cbp->bio_length = real_len; 407 cbp->bio_done = gv_plex_done; 408 cbp->bio_caller2 = original->consumer; 409 410 cbp->bio_driver1 = wp; 411 412 /* 413 * We must not write the new data until the old data 414 * was read, so hold this BIO back until we're ready 415 * for it. 416 */ 417 wp->waiting = cbp; 418 419 /* The final bio for the parity. */ 420 cbp = g_clone_bio(bp); 421 if (cbp == NULL) 422 return (ENOMEM); 423 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 424 cbp->bio_cflags |= GV_BIO_MALLOC; 425 cbp->bio_offset = real_off; 426 cbp->bio_length = real_len; 427 cbp->bio_done = gv_plex_done; 428 cbp->bio_caller2 = parity->consumer; 429 cbp->bio_driver1 = wp; 430 431 /* Remember that this is the BIO for the parity data. 
*/ 432 wp->parity = cbp; 433 } 434 break; 435 436 default: 437 return (EINVAL); 438 } 439 440 return (0); 441 } 442 443 /* Calculate the offsets in the various subdisks for a RAID5 request. */ 444 int 445 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 446 off_t *real_len, int *sdno, int *psdno) 447 { 448 int sd, psd; 449 off_t len_left, stripeend, stripeoff, stripestart; 450 451 /* The number of the subdisk containing the parity stripe. */ 452 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 453 p->sdcount; 454 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 455 456 /* Offset of the start address from the start of the stripe. */ 457 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 458 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 459 460 /* The number of the subdisk where the stripe resides. */ 461 sd = stripeoff / p->stripesize; 462 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 463 464 /* At or past parity subdisk. */ 465 if (sd >= psd) 466 sd++; 467 468 /* The offset of the stripe on this subdisk. */ 469 stripestart = (boff - stripeoff) / (p->sdcount - 1); 470 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 471 472 stripeoff %= p->stripesize; 473 474 /* The offset of the request on this subdisk. */ 475 *real_off = stripestart + stripeoff; 476 477 stripeend = stripestart + p->stripesize; 478 len_left = stripeend - *real_off; 479 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 480 481 *real_len = (bcount <= len_left) ? bcount : len_left; 482 483 if (sdno != NULL) 484 *sdno = sd; 485 if (psdno != NULL) 486 *psdno = psd; 487 488 return (0); 489 } 490