1 /*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/conf.h> 33 #include <sys/errno.h> 34 #include <sys/kernel.h> 35 #include <sys/kthread.h> 36 #include <sys/libkern.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mutex.h> 40 #include <sys/systm.h> 41 42 #include <geom/geom.h> 43 #include <geom/vinum/geom_vinum_var.h> 44 #include <geom/vinum/geom_vinum_raid5.h> 45 #include <geom/vinum/geom_vinum.h> 46 47 /* 48 * Check if the stripe that the work packet wants is already being used by 49 * some other work packet. 
 */
int
gv_stripe_active(struct gv_plex *p, struct bio *bp)
{
	struct gv_raid5_packet *wp, *owp;
	int overlap;

	wp = bp->bio_driver1;

	/* A lockbase of -1 means this packet locks no stripe range. */
	if (wp->lockbase == -1)
		return (0);

	overlap = 0;
	TAILQ_FOREACH(owp, &p->packets, list) {
		/*
		 * Only packets queued ahead of ours can conflict; stop the
		 * scan once we reach our own packet.
		 */
		if (owp == wp)
			break;

		/* Our range begins inside the other packet's locked range. */
		if ((wp->lockbase >= owp->lockbase) &&
		    (wp->lockbase <= owp->lockbase + owp->length)) {
			overlap++;
			break;
		}

		/* Our range begins before, but reaches into, the other's. */
		if ((wp->lockbase <= owp->lockbase) &&
		    (wp->lockbase + wp->length >= owp->lockbase)) {
			overlap++;
			break;
		}
	}

	return (overlap);
}

/*
 * Prepare the work packet 'wp' to rebuild (part of) the broken subdisk of
 * RAID5 plex 'p': clone a read of the range starting at 'boff' from every
 * healthy subdisk and set up a write of the reconstructed data to the
 * broken one.  The request is clipped to the remainder of the current
 * stripe; 'bcount' may cover more than one call's worth.
 *
 * Returns 0 on success or an errno (ENXIO, EINVAL, ENOMEM) on failure.
 */
int
gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
    caddr_t addr, off_t boff, off_t bcount)
{
	struct gv_sd *broken, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within this single stripe. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/*
	 * Find the subdisk to rebuild; the scan leaves 'broken' pointing at
	 * the last subdisk that is not up.
	 */
	broken = NULL;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (s->state != GV_SD_UP)
			broken = s;
	}

	/* No broken subdisk was found, so there is nothing to rebuild. */
	if (broken == NULL)
		return (ENXIO);

	switch (broken->state) {
	case GV_SD_UP:
		/* Shouldn't happen: 'broken' was chosen for not being up. */
		return (EINVAL);

	case GV_SD_STALE:
		/* A stale subdisk is only revived on an explicit request. */
		if (!(bp->bio_cflags & GV_BIO_REBUILD))
			return (ENXIO);

		printf("GEOM_VINUM: sd %s is reviving\n", broken->name);
		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
		break;

	case GV_SD_REVIVING:
		break;

	default:
		/* All other subdisk states mean it's not accessible. */
		return (ENXIO);
	}

	/* Clip the request to what is left of this stripe. */
	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/* Read all subdisks. */
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		/* Skip the broken subdisk. */
		if (s == broken)
			continue;

		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			return (ENOMEM);
		cbp->bio_cmd = BIO_READ;
		cbp->bio_data = g_malloc(real_len, M_WAITOK);
		/* Mark the buffer so completion knows to free it. */
		cbp->bio_cflags |= GV_BIO_MALLOC;
		cbp->bio_offset = real_off;
		cbp->bio_length = real_len;
		cbp->bio_done = gv_plex_done;
		cbp->bio_caller2 = s->consumer;
		cbp->bio_driver1 = wp;

		GV_ENQUEUE(bp, cbp, pbp);

		/* Track this bio in the packet's outstanding-read list. */
		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
		bq->bp = cbp;
		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
	}

	/*
	 * Set up the write of the rebuilt data to the broken subdisk.  The
	 * buffer starts zeroed; presumably the completion path accumulates
	 * the read data into it (XOR reconstruction) before it is issued —
	 * confirm against gv_plex_done()/the raid5 completion code.
	 */
	cbp = g_clone_bio(bp);
	if (cbp == NULL)
		return (ENOMEM);
	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
	cbp->bio_cflags |= GV_BIO_MALLOC;
	cbp->bio_offset = real_off;
	cbp->bio_length = real_len;
	cbp->bio_done = gv_plex_done;
	cbp->bio_caller2 = broken->consumer;
	cbp->bio_driver1 = wp;
	cbp->bio_cflags |= GV_BIO_REBUILD;
	wp->parity = cbp;

	/* Record how far the rebuild has progressed. */
	p->synced = boff;

	return (0);
}

/*
 * Build a request group to perform (part of) a RAID5 request.
 */
int
gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
    struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
{
	struct g_geom *gp;
	struct gv_sd *broken, *original, *parity, *s;
	struct gv_bioq *bq;
	struct bio *cbp, *pbp;
	int i, psdno, sdno, type;
	off_t len_left, real_len, real_off, stripeend, stripeoff, stripestart;

	/* NOTE(review): gp is assigned but never used below. */
	gp = bp->bio_to->geom;

	if (p == NULL || LIST_EMPTY(&p->subdisks))
		return (ENXIO);

	/* We are optimistic and assume that this request will be OK. */
#define	REQ_TYPE_NORMAL		0
#define	REQ_TYPE_DEGRADED	1
#define	REQ_TYPE_NOPARITY	2

	type = REQ_TYPE_NORMAL;
	original = parity = broken = NULL;

	/* The number of the subdisk containing the parity stripe. */
	psdno = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
	    p->sdcount;
	KASSERT(psdno >= 0, ("gv_build_raid5_request: psdno < 0"));

	/* Offset of the start address from the start of the stripe. */
	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
	KASSERT(stripeoff >= 0, ("gv_build_raid5_request: stripeoff < 0"));

	/* The number of the subdisk where the stripe resides. */
	sdno = stripeoff / p->stripesize;
	KASSERT(sdno >= 0, ("gv_build_raid5_request: sdno < 0"));

	/* At or past parity subdisk: skip over the parity column. */
	if (sdno >= psdno)
		sdno++;

	/* The offset of the stripe on this subdisk. */
	stripestart = (boff - stripeoff) / (p->sdcount - 1);
	KASSERT(stripestart >= 0, ("gv_build_raid5_request: stripestart < 0"));

	/* Reduce to the offset within this single stripe. */
	stripeoff %= p->stripesize;

	/* The offset of the request on this subdisk. */
	real_off = stripestart + stripeoff;

	stripeend = stripestart + p->stripesize;
	len_left = stripeend - real_off;
	KASSERT(len_left >= 0, ("gv_build_raid5_request: len_left < 0"));

	/*
	 * Find the right subdisks: the one holding our data ('original'),
	 * the one holding this stripe's parity ('parity'), and — if any —
	 * the last subdisk that is not up ('broken').
	 */
	i = 0;
	LIST_FOREACH(s, &p->subdisks, in_plex) {
		if (i == sdno)
			original = s;
		if (i == psdno)
			parity = s;
		if (s->state != GV_SD_UP)
			broken = s;
		i++;
	}

	if ((original == NULL) || (parity == NULL))
		return (ENXIO);

	/* Our data stripe is missing. */
	if (original->state != GV_SD_UP)
		type = REQ_TYPE_DEGRADED;
	/* Our parity stripe is missing. */
	if (parity->state != GV_SD_UP) {
		/* We cannot take another failure if we're already degraded. */
		if (type != REQ_TYPE_NORMAL)
			return (ENXIO);
		else
			type = REQ_TYPE_NOPARITY;
	}

	/* Clip the request to what is left of this stripe. */
	real_len = (bcount <= len_left) ? bcount : len_left;
	wp->length = real_len;
	wp->data = addr;
	wp->lockbase = real_off;

	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));

	/*
	 * While the plex is syncing, ranges below the sync watermark have
	 * already been rebuilt and can be treated as fully redundant.
	 */
	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
		type = REQ_TYPE_NORMAL;

	switch (bp->bio_cmd) {
	case BIO_READ:
		/*
		 * For a degraded read we need to read in all stripes except
		 * the broken one plus the parity stripe and then recalculate
		 * the desired data.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/*
			 * Zero the destination so the missing data can be
			 * reconstructed into it by the completion path.
			 */
			bzero(wp->data, wp->length);
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken subdisk. */
				if (s == broken)
					continue;
				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

		/* A normal read can be fulfilled with the original subdisk. */
		} else {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			/* Read straight into the caller's buffer. */
			cbp->bio_data = addr;
			cbp->bio_done = g_std_done;
			cbp->bio_caller2 = original->consumer;

			GV_ENQUEUE(bp, cbp, pbp);
		}
		/* Reads don't need to lock the stripe against writers. */
		wp->lockbase = -1;

		break;

	case BIO_WRITE:
		/*
		 * A degraded write means we cannot write to the original data
		 * subdisk.  Thus we need to read in all valid stripes,
		 * recalculate the parity from the original data, and then
		 * write the parity stripe back out.
		 */
		if (type == REQ_TYPE_DEGRADED) {
			/* Read all subdisks. */
			LIST_FOREACH(s, &p->subdisks, in_plex) {
				/* Skip the broken and the parity subdisk. */
				if ((s == broken) || (s == parity))
					continue;

				cbp = g_clone_bio(bp);
				if (cbp == NULL)
					return (ENOMEM);
				cbp->bio_cmd = BIO_READ;
				cbp->bio_data = g_malloc(real_len, M_WAITOK);
				cbp->bio_cflags |= GV_BIO_MALLOC;
				cbp->bio_offset = real_off;
				cbp->bio_length = real_len;
				cbp->bio_done = gv_plex_done;
				cbp->bio_caller2 = s->consumer;
				cbp->bio_driver1 = wp;

				GV_ENQUEUE(bp, cbp, pbp);

				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
				bq->bp = cbp;
				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
			}

			/*
			 * Write the parity data.  Seeded with a copy of the
			 * new data; presumably the completion path folds the
			 * other stripes' reads into it — confirm in
			 * gv_plex_done().
			 */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			bcopy(addr, cbp->bio_data, real_len);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;
			wp->parity = cbp;

		/*
		 * When the parity stripe is missing we just write out the
		 * data.
		 */
		} else if (type == REQ_TYPE_NOPARITY) {
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_data = addr;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

		/*
		 * A normal write request goes to the original subdisk, then we
		 * read in all other stripes, recalculate the parity and write
		 * out the parity again.
		 */
		} else {
			/* Read old parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Read old data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_cmd = BIO_READ;
			cbp->bio_data = g_malloc(real_len, M_WAITOK);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;
			cbp->bio_driver1 = wp;

			GV_ENQUEUE(bp, cbp, pbp);

			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
			bq->bp = cbp;
			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);

			/* Write new data. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = addr;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = original->consumer;

			cbp->bio_driver1 = wp;

			/*
			 * We must not write the new data until the old data
			 * was read, so hold this BIO back until we're ready
			 * for it.
			 */
			wp->waiting = cbp;

			/* The final bio for the parity. */
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				return (ENOMEM);
			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
			cbp->bio_cflags |= GV_BIO_MALLOC;
			cbp->bio_offset = real_off;
			cbp->bio_length = real_len;
			cbp->bio_done = gv_plex_done;
			cbp->bio_caller2 = parity->consumer;
			cbp->bio_driver1 = wp;

			/* Remember that this is the BIO for the parity data. */
			wp->parity = cbp;
		}
		break;

	default:
		/* Only reads and writes are handled here. */
		return (EINVAL);
	}

	return (0);
}