1 /*- 2 * Copyright (c) 1997, 1998, 1999 3 * Nan Yang Computer Services Limited. All rights reserved. 4 * 5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 6 * 7 * Written by Greg Lehey 8 * 9 * This software is distributed under the so-called ``Berkeley 10 * License'': 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by Nan Yang Computer 23 * Services Limited. 24 * 4. Neither the name of the Company nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * This software is provided ``as is'', and any express or implied 29 * warranties, including, but not limited to, the implied warranties of 30 * merchantability and fitness for a particular purpose are disclaimed. 31 * In no event shall the company or contributors be liable for any 32 * direct, indirect, incidental, special, exemplary, or consequential 33 * damages (including, but not limited to, procurement of substitute 34 * goods or services; loss of use, data, or profits; or business 35 * interruption) however caused and on any theory of liability, whether 36 * in contract, strict liability, or tort (including negligence or 37 * otherwise) arising in any way out of the use of this software, even if 38 * advised of the possibility of such damage. 39 * 40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $ 41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $ 42 */ 43 44 #include "vinumhdr.h" 45 #include "request.h" 46 47 /* 48 * Revive a block of a subdisk. Return an error 49 * indication. EAGAIN means successful copy, but 50 * that more blocks remain to be copied. EINVAL 51 * means that the subdisk isn't associated with a 52 * plex (which means a programming error if we get 53 * here at all; FIXME). 54 */ 55 56 int 57 revive_block(int sdno) 58 { 59 struct sd *sd; 60 struct plex *plex; 61 struct volume *vol; 62 struct buf *bp; 63 cdev_t dev; 64 int error = EAGAIN; 65 int size; /* size of revive block, bytes */ 66 vinum_off_t plexblkno; /* lblkno in plex */ 67 int psd; /* parity subdisk number */ 68 u_int64_t stripe; /* stripe number */ 69 int paritysd = 0; /* set if this is the parity stripe */ 70 struct rangelock *lock; /* for locking */ 71 vinum_off_t stripeoffset; /* offset in stripe */ 72 73 plexblkno = 0; /* to keep the compiler happy */ 74 sd = &SD[sdno]; 75 lock = NULL; 76 if (sd->plexno < 0) /* no plex? */ 77 return EINVAL; 78 plex = &PLEX[sd->plexno]; /* point to plex */ 79 if (plex->volno >= 0) 80 vol = &VOL[plex->volno]; 81 else 82 vol = NULL; 83 84 if ((sd->revive_blocksize == 0) /* no block size */ 85 ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */ 86 sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE; 87 else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE) 88 sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE; 89 size = u64min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT; 90 sd->reviver = curproc->p_pid; /* note who last had a bash at it */ 91 92 /* Now decide where to read from */ 93 switch (plex->organization) { 94 case plex_concat: 95 plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */ 96 break; 97 98 case plex_striped: 99 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ 100 if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize) 101 size = (plex->stripesize - stripeoffset) << DEV_BSHIFT; 102 plexblkno = sd->plexoffset /* base */ 103 + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */ 104 + stripeoffset; /* offset from beginning of stripe */ 105 break; 106 107 case plex_raid4: 108 case plex_raid5: 109 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ 110 plexblkno = sd->plexoffset /* base */ 111 + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */ 112 +stripeoffset; /* offset from beginning of stripe */ 113 stripe = (sd->revived / plex->stripesize); /* stripe number */ 114 115 /* Make sure we don't go beyond the end of the band. */ 116 size = u64min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT); 117 if (plex->organization == plex_raid4) 118 psd = plex->subdisks - 1; /* parity subdisk for this stripe */ 119 else 120 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ 121 paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */ 122 123 /* 124 * Now adjust for the strangenesses 125 * in RAID-4 and RAID-5 striping. 126 */ 127 if (sd->plexsdno > psd) /* beyond the parity stripe, */ 128 plexblkno -= plex->stripesize; /* one stripe less */ 129 else if (paritysd) 130 plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */ 131 break; 132 133 case plex_disorg: /* to keep the compiler happy */ 134 break; 135 } 136 137 if (paritysd) { /* we're reviving a parity block, */ 138 bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */ 139 if (bp == NULL) /* no buffer space */ 140 return ENOMEM; /* chicken out */ 141 } else { /* data block */ 142 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 143 bp->b_data = Malloc(size); 144 145 /* 146 * Amount to transfer: block size, unless it 147 * would overlap the end. 148 */ 149 bp->b_bcount = size; 150 bp->b_resid = bp->b_bcount; 151 bp->b_bio1.bio_offset = (off_t)plexblkno << DEV_BSHIFT; /* start here */ 152 bp->b_bio1.bio_done = biodone_sync; 153 bp->b_bio1.bio_flags |= BIO_SYNC; 154 if (isstriped(plex)) /* we need to lock striped plexes */ 155 lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */ 156 if (vol != NULL) /* it's part of a volume, */ 157 /* 158 * First, read the data from the volume. We 159 * don't care which plex, that's bre's job. 160 */ 161 dev = vol->vol_dev; 162 else /* it's an unattached plex */ 163 dev = PLEX[sd->plexno].plex_dev; 164 165 bp->b_cmd = BUF_CMD_READ; 166 vinumstart(dev, &bp->b_bio1, 1); 167 biowait(&bp->b_bio1, "drvrd"); 168 } 169 170 if (bp->b_flags & B_ERROR) 171 error = bp->b_error; 172 else 173 /* Now write to the subdisk */ 174 { 175 dev = SD[sdno].sd_dev; 176 KKASSERT(dev != NULL); 177 bp->b_cmd = BUF_CMD_WRITE; 178 bp->b_resid = bp->b_bcount; 179 bp->b_bio1.bio_offset = (off_t)sd->revived << DEV_BSHIFT; /* write it to here */ 180 bp->b_bio1.bio_driver_info = dev; 181 bp->b_bio1.bio_done = biodone_sync; 182 sdio(&bp->b_bio1); /* perform the I/O */ 183 biowait(&bp->b_bio1, "drvwr"); 184 if (bp->b_flags & B_ERROR) 185 error = bp->b_error; 186 else { 187 sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */ 188 if (sd->revived >= sd->sectors) { /* finished */ 189 sd->revived = 0; 190 set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */ 191 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); 192 save_config(); /* and save the updated configuration */ 193 error = 0; /* we're done */ 194 } 195 } 196 if (lock) /* we took a lock, */ 197 unlockrange(sd->plexno, lock); /* give it back */ 198 while (sd->waitlist) { /* we have waiting requests */ 199 #if VINUMDEBUG 200 struct request *rq = sd->waitlist; 201 cdev_t dev; 202 203 if (debug & DEBUG_REVIVECONFLICT) { 204 dev = rq->bio->bio_driver_info; 205 log(LOG_DEBUG, 206 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%jx, length %d\n", 207 rq->sdno, 208 rq, 209 (rq->bio->bio_buf->b_cmd == BUF_CMD_READ) ? "Read" : "Write", 210 major(dev), 211 minor(dev), 212 (uintmax_t)rq->bio->bio_offset, 213 rq->bio->bio_buf->b_bcount); 214 } 215 #endif 216 launch_requests(sd->waitlist, 1); /* do them now */ 217 sd->waitlist = sd->waitlist->next; /* and move on to the next */ 218 } 219 } 220 Free(bp->b_data); 221 relpbuf(bp, &vinum_conf.physbufs); 222 return error; 223 } 224 225 /* 226 * Check or rebuild the parity blocks of a RAID-4 227 * or RAID-5 plex. 228 * 229 * The variables plex->checkblock and 230 * plex->rebuildblock represent the 231 * subdisk-relative address of the stripe we're 232 * looking at, not the plex-relative address. We 233 * store it in the plex and not as a local 234 * variable because this function could be 235 * stopped, and we don't want to repeat the part 236 * we've already done. This is also the reason 237 * why we don't initialize it here except at the 238 * end. It gets initialized with the plex on 239 * creation. 240 * 241 * Each call to this function processes at most 242 * one stripe. We can't loop in this function, 243 * because we're unstoppable, so we have to be 244 * called repeatedly from userland. 245 */ 246 void 247 parityops(struct vinum_ioctl_msg *data) 248 { 249 int plexno; 250 struct plex *plex; 251 int size; /* I/O transfer size, bytes */ 252 struct rangelock *lock; /* lock on stripe */ 253 struct _ioctl_reply *reply; 254 off_t pstripe; /* pointer to our stripe counter */ 255 struct buf *pbp; 256 off_t errorloc; /* offset of parity error */ 257 enum parityop op; /* operation to perform */ 258 259 plexno = data->index; 260 op = data->op; 261 pbp = NULL; 262 reply = (struct _ioctl_reply *) data; 263 reply->error = EAGAIN; /* expect to repeat this call */ 264 plex = &PLEX[plexno]; 265 if (!isparity(plex)) { /* not RAID-4 or RAID-5 */ 266 reply->error = EINVAL; 267 return; 268 } else if (plex->state < plex_flaky) { 269 reply->error = EIO; 270 strcpy(reply->msg, "Plex is not completely accessible\n"); 271 return; 272 } 273 pstripe = data->offset; 274 size = imin(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ 275 plex->stripesize << DEV_BSHIFT); 276 277 errorloc = 0; /* avoid gcc warnings */ 278 pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */ 279 if (pbp == NULL) { /* no buffer space */ 280 reply->error = ENOMEM; 281 return; /* chicken out */ 282 } 283 /* 284 * Now we have a result in the data buffer of 285 * the parity buffer header, which we have kept. 286 * Decide what to do with it. 287 */ 288 reply->msg[0] = '\0'; /* until shown otherwise */ 289 if ((pbp->b_flags & B_ERROR) == 0) { /* no error */ 290 if ((op == rebuildparity) 291 || (op == rebuildandcheckparity)) { 292 pbp->b_cmd = BUF_CMD_WRITE; 293 pbp->b_resid = pbp->b_bcount; 294 pbp->b_bio1.bio_done = biodone_sync; 295 sdio(&pbp->b_bio1); /* write the parity block */ 296 biowait(&pbp->b_bio1, "drvwr"); 297 } 298 if (((op == checkparity) 299 || (op == rebuildandcheckparity)) 300 && (errorloc != -1)) { 301 if (op == checkparity) 302 reply->error = EIO; 303 ksprintf(reply->msg, 304 "Parity incorrect at offset 0x%llx\n", 305 (long long)errorloc); 306 } 307 if (reply->error == EAGAIN) { /* still OK, */ 308 plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */ 309 if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */ 310 plex->checkblock = 0; 311 reply->error = 0; 312 } 313 } 314 } 315 if (pbp->b_flags & B_ERROR) 316 reply->error = pbp->b_error; 317 Free(pbp->b_data); 318 relpbuf(pbp, &vinum_conf.physbufs); 319 unlockrange(plexno, lock); 320 } 321 322 /* 323 * Rebuild a parity stripe. Return pointer to 324 * parity bp. On return, 325 * 326 * 1. The band is locked. The caller must unlock 327 * the band and release the buffer header. 328 * 329 * 2. All buffer headers except php have been 330 * released. The caller must release pbp. 331 * 332 * 3. For checkparity and rebuildandcheckparity, 333 * the parity is compared with the current 334 * parity block. If it's different, the 335 * offset of the error is returned to 336 * errorloc. The caller can set the value of 337 * the pointer to NULL if this is called for 338 * rebuilding parity. 339 * 340 * pstripe is the subdisk-relative base address of 341 * the data to be reconstructed, size is the size 342 * of the transfer in bytes. 343 */ 344 struct buf * 345 parityrebuild(struct plex *plex, 346 vinum_off_t pstripe, 347 int size, 348 enum parityop op, 349 struct rangelock **lockp, 350 off_t * errorloc) 351 { 352 int error; 353 int sdno; 354 u_int64_t stripe; /* stripe number */ 355 int *parity_buf; /* buffer address for current parity block */ 356 int *newparity_buf; /* and for new parity block */ 357 int mysize; /* I/O transfer size for this transfer */ 358 int isize; /* mysize in ints */ 359 int i; 360 int psd; /* parity subdisk number */ 361 int newpsd; /* and "subdisk number" of new parity */ 362 struct buf **bpp; /* pointers to our bps */ 363 struct buf *pbp; /* buffer header for parity stripe */ 364 int *sbuf; 365 int bufcount; /* number of buffers we need */ 366 367 stripe = pstripe / plex->stripesize; /* stripe number */ 368 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ 369 parity_buf = NULL; /* to keep the compiler happy */ 370 error = 0; 371 372 /* 373 * It's possible that the default transfer size 374 * we chose is not a factor of the stripe size. 375 * We *must* limit this operation to a single 376 * stripe, at least for RAID-5 rebuild, since 377 * the parity subdisk changes between stripes, 378 * so in this case we need to perform a short 379 * transfer. Set variable mysize to reflect 380 * this. 381 */ 382 mysize = u64min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT); 383 isize = mysize / (sizeof(int)); /* number of ints in the buffer */ 384 bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */ 385 newpsd = plex->subdisks; 386 bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */ 387 388 /* First, build requests for all subdisks */ 389 for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */ 390 if ((sdno != psd) || (op != rebuildparity)) { 391 /* Get a buffer header and initialize it. */ 392 bpp[sdno] = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 393 bpp[sdno]->b_data = Malloc(mysize); 394 if (sdno == psd) 395 parity_buf = (int *) bpp[sdno]->b_data; 396 if (sdno == newpsd) /* the new one? */ 397 bpp[sdno]->b_bio1.bio_driver_info = SD[plex->sdnos[psd]].sd_dev; /* write back to the parity SD */ 398 else 399 bpp[sdno]->b_bio1.bio_driver_info = SD[plex->sdnos[sdno]].sd_dev; /* device number */ 400 KKASSERT(bpp[sdno]->b_bio1.bio_driver_info); 401 bpp[sdno]->b_cmd = BUF_CMD_READ; /* either way, read it */ 402 bpp[sdno]->b_bcount = mysize; 403 bpp[sdno]->b_resid = bpp[sdno]->b_bcount; 404 bpp[sdno]->b_bio1.bio_offset = (off_t)pstripe << DEV_BSHIFT; /* transfer from here */ 405 bpp[sdno]->b_bio1.bio_done = biodone_sync; 406 } 407 } 408 409 /* Initialize result buffer */ 410 pbp = bpp[newpsd]; 411 newparity_buf = (int *) bpp[newpsd]->b_data; 412 bzero(newparity_buf, mysize); 413 414 /* 415 * Now lock the stripe with the first non-parity 416 * bp as locking bp. 417 */ 418 *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1), 419 bpp[psd ? 0 : 1], 420 plex); 421 422 /* 423 * Then issue requests for all subdisks in 424 * parallel. Don't transfer the parity stripe 425 * if we're rebuilding parity, unless we also 426 * want to check it. 427 */ 428 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */ 429 if ((sdno != psd) || (op != rebuildparity)) { 430 sdio(&bpp[sdno]->b_bio1); 431 } 432 } 433 434 /* 435 * Next, wait for the requests to complete. 436 * We wait in the order in which they were 437 * issued, which isn't necessarily the order in 438 * which they complete, but we don't have a 439 * convenient way of doing the latter, and the 440 * delay is minimal. 441 */ 442 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */ 443 if ((sdno != psd) || (op != rebuildparity)) { 444 biowait(&bpp[sdno]->b_bio1, "drvio"); 445 if (bpp[sdno]->b_flags & B_ERROR) /* can't read, */ 446 error = bpp[sdno]->b_error; 447 else if (sdno != psd) { /* update parity */ 448 sbuf = (int *) bpp[sdno]->b_data; 449 for (i = 0; i < isize; i++) 450 newparity_buf[i] ^= sbuf[i]; /* xor in the buffer */ 451 } 452 } 453 if (sdno != psd) { /* release all bps except parity */ 454 Free(bpp[sdno]->b_data); 455 relpbuf(bpp[sdno], &vinum_conf.physbufs); /* give back our resources */ 456 } 457 } 458 459 /* 460 * If we're checking, compare the calculated 461 * and the read parity block. If they're 462 * different, return the plex-relative offset; 463 * otherwise return -1. 464 */ 465 if ((op == checkparity) 466 || (op == rebuildandcheckparity)) { 467 *errorloc = -1; /* no error yet */ 468 for (i = 0; i < isize; i++) { 469 if (parity_buf[i] != newparity_buf[i]) { 470 *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1) 471 + i * sizeof(int); 472 break; 473 } 474 } 475 Free(bpp[psd]->b_data); 476 relpbuf(bpp[psd], &vinum_conf.physbufs); /* give back our resources */ 477 } 478 /* release our resources */ 479 Free(bpp); 480 if (error) { 481 pbp->b_flags |= B_ERROR; 482 pbp->b_error = error; 483 } 484 return pbp; 485 } 486 487 /* 488 * Initialize a subdisk by writing zeroes to the 489 * complete address space. If verify is set, 490 * check each transfer for correctness. 491 * 492 * Each call to this function writes (and maybe 493 * checks) a single block. 494 */ 495 int 496 initsd(int sdno, int verify) 497 { 498 struct sd *sd; 499 struct plex *plex; 500 struct buf *bp; 501 int error; 502 int size; /* size of init block, bytes */ 503 int verified; /* set when we're happy with what we wrote */ 504 505 error = 0; 506 sd = &SD[sdno]; 507 if (sd->plexno < 0) /* no plex? */ 508 return EINVAL; 509 plex = &PLEX[sd->plexno]; /* point to plex */ 510 511 if (sd->init_blocksize == 0) { 512 if (plex->stripesize != 0) /* we're striped, don't init more than */ 513 sd->init_blocksize = u64min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ 514 plex->stripesize << DEV_BSHIFT); 515 else 516 sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE; 517 } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE) 518 sd->init_blocksize = MAX_REVIVE_BLOCKSIZE; 519 520 size = u64min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT; 521 522 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 523 bp->b_data = Malloc(size); 524 525 verified = 0; 526 while (!verified) { /* until we're happy with it, */ 527 bp->b_bcount = size; 528 bp->b_resid = bp->b_bcount; 529 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* write it to here */ 530 bp->b_bio1.bio_driver_info = SD[sdno].sd_dev; 531 bp->b_bio1.bio_done = biodone_sync; 532 KKASSERT(bp->b_bio1.bio_driver_info); 533 bzero(bp->b_data, bp->b_bcount); 534 bp->b_cmd = BUF_CMD_WRITE; 535 sdio(&bp->b_bio1); /* perform the I/O */ 536 biowait(&bp->b_bio1, "drvwr"); 537 if (bp->b_flags & B_ERROR) 538 error = bp->b_error; 539 if ((error == 0) && verify) { /* check that it got there */ 540 bp->b_bcount = size; 541 bp->b_resid = bp->b_bcount; 542 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* read from here */ 543 bp->b_bio1.bio_driver_info = SD[sdno].sd_dev; 544 bp->b_bio1.bio_done = biodone_sync; 545 KKASSERT(bp->b_bio1.bio_driver_info); 546 bp->b_cmd = BUF_CMD_READ; /* read it back */ 547 sdio(&bp->b_bio1); 548 biowait(&bp->b_bio1, "drvrd"); 549 /* 550 * XXX Bug fix code. This is hopefully no 551 * longer needed (21 February 2000). 552 */ 553 if (bp->b_flags & B_ERROR) 554 error = bp->b_error; 555 else if ((*bp->b_data != 0) /* first word spammed */ 556 ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */ 557 kprintf("vinum: init error on %s, offset 0x%llx sectors\n", 558 sd->name, 559 (long long) sd->initialized); 560 verified = 0; 561 } else 562 verified = 1; 563 } else 564 verified = 1; 565 } 566 Free(bp->b_data); 567 relpbuf(bp, &vinum_conf.physbufs); 568 if (error == 0) { /* did it, */ 569 sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */ 570 if (sd->initialized >= sd->sectors) { /* finished */ 571 sd->initialized = 0; 572 set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */ 573 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); 574 save_config(); /* and save the updated configuration */ 575 } else /* more to go, */ 576 error = EAGAIN; /* ya'll come back, see? */ 577 } 578 return error; 579 } 580