1 /*- 2 * Copyright (c) 1997, 1998, 1999 3 * Nan Yang Computer Services Limited. All rights reserved. 4 * 5 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 6 * 7 * Written by Greg Lehey 8 * 9 * This software is distributed under the so-called ``Berkeley 10 * License'': 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. All advertising materials mentioning features or use of this software 21 * must display the following acknowledgement: 22 * This product includes software developed by Nan Yang Computer 23 * Services Limited. 24 * 4. Neither the name of the Company nor the names of its contributors 25 * may be used to endorse or promote products derived from this software 26 * without specific prior written permission. 27 * 28 * This software is provided ``as is'', and any express or implied 29 * warranties, including, but not limited to, the implied warranties of 30 * merchantability and fitness for a particular purpose are disclaimed. 31 * In no event shall the company or contributors be liable for any 32 * direct, indirect, incidental, special, exemplary, or consequential 33 * damages (including, but not limited to, procurement of substitute 34 * goods or services; loss of use, data, or profits; or business 35 * interruption) however caused and on any theory of liability, whether 36 * in contract, strict liability, or tort (including negligence or 37 * otherwise) arising in any way out of the use of this software, even if 38 * advised of the possibility of such damage. 39 * 40 * $Id: vinumrevive.c,v 1.14 2000/12/21 01:55:11 grog Exp grog $ 41 * $FreeBSD: src/sys/dev/vinum/vinumrevive.c,v 1.22.2.5 2001/03/13 02:59:43 grog Exp $ 42 * $DragonFly: src/sys/dev/raid/vinum/vinumrevive.c,v 1.15 2006/12/22 23:26:24 swildner Exp $ 43 */ 44 45 #include "vinumhdr.h" 46 #include "request.h" 47 48 /* 49 * Revive a block of a subdisk. Return an error 50 * indication. EAGAIN means successful copy, but 51 * that more blocks remain to be copied. EINVAL 52 * means that the subdisk isn't associated with a 53 * plex (which means a programming error if we get 54 * here at all; FIXME). 55 */ 56 57 int 58 revive_block(int sdno) 59 { 60 struct sd *sd; 61 struct plex *plex; 62 struct volume *vol; 63 struct buf *bp; 64 cdev_t dev; 65 int error = EAGAIN; 66 int size; /* size of revive block, bytes */ 67 daddr_t plexblkno; /* lblkno in plex */ 68 int psd; /* parity subdisk number */ 69 u_int64_t stripe; /* stripe number */ 70 int paritysd = 0; /* set if this is the parity stripe */ 71 struct rangelock *lock; /* for locking */ 72 daddr_t stripeoffset; /* offset in stripe */ 73 74 plexblkno = 0; /* to keep the compiler happy */ 75 sd = &SD[sdno]; 76 lock = NULL; 77 if (sd->plexno < 0) /* no plex? */ 78 return EINVAL; 79 plex = &PLEX[sd->plexno]; /* point to plex */ 80 if (plex->volno >= 0) 81 vol = &VOL[plex->volno]; 82 else 83 vol = NULL; 84 85 if ((sd->revive_blocksize == 0) /* no block size */ 86 ||(sd->revive_blocksize & ((1 << DEV_BSHIFT) - 1))) /* or invalid block size */ 87 sd->revive_blocksize = DEFAULT_REVIVE_BLOCKSIZE; 88 else if (sd->revive_blocksize > MAX_REVIVE_BLOCKSIZE) 89 sd->revive_blocksize = MAX_REVIVE_BLOCKSIZE; 90 size = min(sd->revive_blocksize >> DEV_BSHIFT, sd->sectors - sd->revived) << DEV_BSHIFT; 91 sd->reviver = curproc->p_pid; /* note who last had a bash at it */ 92 93 /* Now decide where to read from */ 94 switch (plex->organization) { 95 case plex_concat: 96 plexblkno = sd->revived + sd->plexoffset; /* corresponding address in plex */ 97 break; 98 99 case plex_striped: 100 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ 101 if (stripeoffset + (size >> DEV_BSHIFT) > plex->stripesize) 102 size = (plex->stripesize - stripeoffset) << DEV_BSHIFT; 103 plexblkno = sd->plexoffset /* base */ 104 + (sd->revived - stripeoffset) * plex->subdisks /* offset to beginning of stripe */ 105 + stripeoffset; /* offset from beginning of stripe */ 106 break; 107 108 case plex_raid4: 109 case plex_raid5: 110 stripeoffset = sd->revived % plex->stripesize; /* offset from beginning of stripe */ 111 plexblkno = sd->plexoffset /* base */ 112 + (sd->revived - stripeoffset) * (plex->subdisks - 1) /* offset to beginning of stripe */ 113 +stripeoffset; /* offset from beginning of stripe */ 114 stripe = (sd->revived / plex->stripesize); /* stripe number */ 115 116 /* Make sure we don't go beyond the end of the band. */ 117 size = min(size, (plex->stripesize - stripeoffset) << DEV_BSHIFT); 118 if (plex->organization == plex_raid4) 119 psd = plex->subdisks - 1; /* parity subdisk for this stripe */ 120 else 121 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ 122 paritysd = plex->sdnos[psd] == sdno; /* note if it's the parity subdisk */ 123 124 /* 125 * Now adjust for the strangenesses 126 * in RAID-4 and RAID-5 striping. 127 */ 128 if (sd->plexsdno > psd) /* beyond the parity stripe, */ 129 plexblkno -= plex->stripesize; /* one stripe less */ 130 else if (paritysd) 131 plexblkno -= plex->stripesize * sd->plexsdno; /* go back to the beginning of the band */ 132 break; 133 134 case plex_disorg: /* to keep the compiler happy */ 135 break; 136 } 137 138 if (paritysd) { /* we're reviving a parity block, */ 139 bp = parityrebuild(plex, sd->revived, size, rebuildparity, &lock, NULL); /* do the grunt work */ 140 if (bp == NULL) /* no buffer space */ 141 return ENOMEM; /* chicken out */ 142 } else { /* data block */ 143 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 144 bp->b_data = Malloc(size); 145 146 /* 147 * Amount to transfer: block size, unless it 148 * would overlap the end. 149 */ 150 bp->b_bcount = size; 151 bp->b_resid = bp->b_bcount; 152 bp->b_bio1.bio_offset = (off_t)plexblkno << DEV_BSHIFT; /* start here */ 153 if (isstriped(plex)) /* we need to lock striped plexes */ 154 lock = lockrange(plexblkno << DEV_BSHIFT, bp, plex); /* lock it */ 155 if (vol != NULL) /* it's part of a volume, */ 156 /* 157 * First, read the data from the volume. We 158 * don't care which plex, that's bre's job. 159 */ 160 dev = VINUMDEV(plex->volno, 0, 0, VINUM_VOLUME_TYPE); /* create the device number */ 161 else /* it's an unattached plex */ 162 dev = VINUM_PLEX(sd->plexno); /* create the device number */ 163 164 bp->b_cmd = BUF_CMD_READ; 165 vinumstart(dev, &bp->b_bio1, 1); 166 biowait(bp); 167 } 168 169 if (bp->b_flags & B_ERROR) 170 error = bp->b_error; 171 else 172 /* Now write to the subdisk */ 173 { 174 dev = VINUM_SD(sdno); /* create the device number */ 175 bp->b_flags |= B_ORDERED; /* and make this an ordered write */ 176 bp->b_cmd = BUF_CMD_WRITE; 177 bp->b_resid = bp->b_bcount; 178 bp->b_bio1.bio_offset = (off_t)sd->revived << DEV_BSHIFT; /* write it to here */ 179 bp->b_bio1.bio_driver_info = dev; 180 sdio(&bp->b_bio1); /* perform the I/O */ 181 biowait(bp); 182 if (bp->b_flags & B_ERROR) 183 error = bp->b_error; 184 else { 185 sd->revived += bp->b_bcount >> DEV_BSHIFT; /* moved this much further down */ 186 if (sd->revived >= sd->sectors) { /* finished */ 187 sd->revived = 0; 188 set_sd_state(sdno, sd_up, setstate_force); /* bring the sd up */ 189 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); 190 save_config(); /* and save the updated configuration */ 191 error = 0; /* we're done */ 192 } 193 } 194 if (lock) /* we took a lock, */ 195 unlockrange(sd->plexno, lock); /* give it back */ 196 while (sd->waitlist) { /* we have waiting requests */ 197 #if VINUMDEBUG 198 struct request *rq = sd->waitlist; 199 cdev_t dev; 200 201 if (debug & DEBUG_REVIVECONFLICT) { 202 dev = rq->bio->bio_driver_info; 203 log(LOG_DEBUG, 204 "Relaunch revive conflict sd %d: %p\n%s dev %d.%d, offset 0x%llx, length %d\n", 205 rq->sdno, 206 rq, 207 (rq->bio->bio_buf->b_cmd == BUF_CMD_READ) ? "Read" : "Write", 208 major(dev), 209 minor(dev), 210 rq->bio->bio_offset, 211 rq->bio->bio_buf->b_bcount); 212 } 213 #endif 214 launch_requests(sd->waitlist, 1); /* do them now */ 215 sd->waitlist = sd->waitlist->next; /* and move on to the next */ 216 } 217 } 218 Free(bp->b_data); 219 relpbuf(bp, &vinum_conf.physbufs); 220 return error; 221 } 222 223 /* 224 * Check or rebuild the parity blocks of a RAID-4 225 * or RAID-5 plex. 226 * 227 * The variables plex->checkblock and 228 * plex->rebuildblock represent the 229 * subdisk-relative address of the stripe we're 230 * looking at, not the plex-relative address. We 231 * store it in the plex and not as a local 232 * variable because this function could be 233 * stopped, and we don't want to repeat the part 234 * we've already done. This is also the reason 235 * why we don't initialize it here except at the 236 * end. It gets initialized with the plex on 237 * creation. 238 * 239 * Each call to this function processes at most 240 * one stripe. We can't loop in this function, 241 * because we're unstoppable, so we have to be 242 * called repeatedly from userland. 243 */ 244 void 245 parityops(struct vinum_ioctl_msg *data) 246 { 247 int plexno; 248 struct plex *plex; 249 int size; /* I/O transfer size, bytes */ 250 int stripe; /* stripe number in plex */ 251 int psd; /* parity subdisk number */ 252 struct rangelock *lock; /* lock on stripe */ 253 struct _ioctl_reply *reply; 254 off_t pstripe; /* pointer to our stripe counter */ 255 struct buf *pbp; 256 off_t errorloc; /* offset of parity error */ 257 enum parityop op; /* operation to perform */ 258 259 plexno = data->index; 260 op = data->op; 261 pbp = NULL; 262 reply = (struct _ioctl_reply *) data; 263 reply->error = EAGAIN; /* expect to repeat this call */ 264 plex = &PLEX[plexno]; 265 if (!isparity(plex)) { /* not RAID-4 or RAID-5 */ 266 reply->error = EINVAL; 267 return; 268 } else if (plex->state < plex_flaky) { 269 reply->error = EIO; 270 strcpy(reply->msg, "Plex is not completely accessible\n"); 271 return; 272 } 273 pstripe = data->offset; 274 stripe = pstripe / plex->stripesize; /* stripe number */ 275 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ 276 size = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ 277 plex->stripesize << DEV_BSHIFT); 278 279 pbp = parityrebuild(plex, pstripe, size, op, &lock, &errorloc); /* do the grunt work */ 280 if (pbp == NULL) { /* no buffer space */ 281 reply->error = ENOMEM; 282 return; /* chicken out */ 283 } 284 /* 285 * Now we have a result in the data buffer of 286 * the parity buffer header, which we have kept. 287 * Decide what to do with it. 288 */ 289 reply->msg[0] = '\0'; /* until shown otherwise */ 290 if ((pbp->b_flags & B_ERROR) == 0) { /* no error */ 291 if ((op == rebuildparity) 292 || (op == rebuildandcheckparity)) { 293 pbp->b_cmd = BUF_CMD_WRITE; 294 pbp->b_resid = pbp->b_bcount; 295 sdio(&pbp->b_bio1); /* write the parity block */ 296 biowait(pbp); 297 } 298 if (((op == checkparity) 299 || (op == rebuildandcheckparity)) 300 && (errorloc != -1)) { 301 if (op == checkparity) 302 reply->error = EIO; 303 ksprintf(reply->msg, 304 "Parity incorrect at offset 0x%llx\n", 305 errorloc); 306 } 307 if (reply->error == EAGAIN) { /* still OK, */ 308 plex->checkblock = pstripe + (pbp->b_bcount >> DEV_BSHIFT); /* moved this much further down */ 309 if (plex->checkblock >= SD[plex->sdnos[0]].sectors) { /* finished */ 310 plex->checkblock = 0; 311 reply->error = 0; 312 } 313 } 314 } 315 if (pbp->b_flags & B_ERROR) 316 reply->error = pbp->b_error; 317 Free(pbp->b_data); 318 relpbuf(pbp, &vinum_conf.physbufs); 319 unlockrange(plexno, lock); 320 } 321 322 /* 323 * Rebuild a parity stripe. Return pointer to 324 * parity bp. On return, 325 * 326 * 1. The band is locked. The caller must unlock 327 * the band and release the buffer header. 328 * 329 * 2. All buffer headers except php have been 330 * released. The caller must release pbp. 331 * 332 * 3. For checkparity and rebuildandcheckparity, 333 * the parity is compared with the current 334 * parity block. If it's different, the 335 * offset of the error is returned to 336 * errorloc. The caller can set the value of 337 * the pointer to NULL if this is called for 338 * rebuilding parity. 339 * 340 * pstripe is the subdisk-relative base address of 341 * the data to be reconstructed, size is the size 342 * of the transfer in bytes. 343 */ 344 struct buf * 345 parityrebuild(struct plex *plex, 346 u_int64_t pstripe, 347 int size, 348 enum parityop op, 349 struct rangelock **lockp, 350 off_t * errorloc) 351 { 352 int error; 353 int sdno; 354 u_int64_t stripe; /* stripe number */ 355 int *parity_buf; /* buffer address for current parity block */ 356 int *newparity_buf; /* and for new parity block */ 357 int mysize; /* I/O transfer size for this transfer */ 358 int isize; /* mysize in ints */ 359 int i; 360 int psd; /* parity subdisk number */ 361 int newpsd; /* and "subdisk number" of new parity */ 362 struct buf **bpp; /* pointers to our bps */ 363 struct buf *pbp; /* buffer header for parity stripe */ 364 int *sbuf; 365 int bufcount; /* number of buffers we need */ 366 367 stripe = pstripe / plex->stripesize; /* stripe number */ 368 psd = plex->subdisks - 1 - stripe % plex->subdisks; /* parity subdisk for this stripe */ 369 parity_buf = NULL; /* to keep the compiler happy */ 370 error = 0; 371 372 /* 373 * It's possible that the default transfer size 374 * we chose is not a factor of the stripe size. 375 * We *must* limit this operation to a single 376 * stripe, at least for RAID-5 rebuild, since 377 * the parity subdisk changes between stripes, 378 * so in this case we need to perform a short 379 * transfer. Set variable mysize to reflect 380 * this. 381 */ 382 mysize = min(size, (plex->stripesize * (stripe + 1) - pstripe) << DEV_BSHIFT); 383 isize = mysize / (sizeof(int)); /* number of ints in the buffer */ 384 bufcount = plex->subdisks + 1; /* sd buffers plus result buffer */ 385 newpsd = plex->subdisks; 386 bpp = (struct buf **) Malloc(bufcount * sizeof(struct buf *)); /* array of pointers to bps */ 387 388 /* First, build requests for all subdisks */ 389 for (sdno = 0; sdno < bufcount; sdno++) { /* for each subdisk */ 390 if ((sdno != psd) || (op != rebuildparity)) { 391 /* Get a buffer header and initialize it. */ 392 bpp[sdno] = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 393 bpp[sdno]->b_data = Malloc(mysize); 394 if (sdno == psd) 395 parity_buf = (int *) bpp[sdno]->b_data; 396 if (sdno == newpsd) /* the new one? */ 397 bpp[sdno]->b_bio1.bio_driver_info = VINUM_SD(plex->sdnos[psd]); /* write back to the parity SD */ 398 else 399 bpp[sdno]->b_bio1.bio_driver_info = VINUM_SD(plex->sdnos[sdno]); /* device number */ 400 bpp[sdno]->b_cmd = BUF_CMD_READ; /* either way, read it */ 401 bpp[sdno]->b_bcount = mysize; 402 bpp[sdno]->b_resid = bpp[sdno]->b_bcount; 403 bpp[sdno]->b_bio1.bio_offset = (off_t)pstripe << DEV_BSHIFT; /* transfer from here */ 404 } 405 } 406 407 /* Initialize result buffer */ 408 pbp = bpp[newpsd]; 409 newparity_buf = (int *) bpp[newpsd]->b_data; 410 bzero(newparity_buf, mysize); 411 412 /* 413 * Now lock the stripe with the first non-parity 414 * bp as locking bp. 415 */ 416 *lockp = lockrange(pstripe * plex->stripesize * (plex->subdisks - 1), 417 bpp[psd ? 0 : 1], 418 plex); 419 420 /* 421 * Then issue requests for all subdisks in 422 * parallel. Don't transfer the parity stripe 423 * if we're rebuilding parity, unless we also 424 * want to check it. 425 */ 426 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each real subdisk */ 427 if ((sdno != psd) || (op != rebuildparity)) { 428 sdio(&bpp[sdno]->b_bio1); 429 } 430 } 431 432 /* 433 * Next, wait for the requests to complete. 434 * We wait in the order in which they were 435 * issued, which isn't necessarily the order in 436 * which they complete, but we don't have a 437 * convenient way of doing the latter, and the 438 * delay is minimal. 439 */ 440 for (sdno = 0; sdno < plex->subdisks; sdno++) { /* for each subdisk */ 441 if ((sdno != psd) || (op != rebuildparity)) { 442 biowait(bpp[sdno]); 443 if (bpp[sdno]->b_flags & B_ERROR) /* can't read, */ 444 error = bpp[sdno]->b_error; 445 else if (sdno != psd) { /* update parity */ 446 sbuf = (int *) bpp[sdno]->b_data; 447 for (i = 0; i < isize; i++) 448 ((int *) newparity_buf)[i] ^= sbuf[i]; /* xor in the buffer */ 449 } 450 } 451 if (sdno != psd) { /* release all bps except parity */ 452 Free(bpp[sdno]->b_data); 453 relpbuf(bpp[sdno], &vinum_conf.physbufs); /* give back our resources */ 454 } 455 } 456 457 /* 458 * If we're checking, compare the calculated 459 * and the read parity block. If they're 460 * different, return the plex-relative offset; 461 * otherwise return -1. 462 */ 463 if ((op == checkparity) 464 || (op == rebuildandcheckparity)) { 465 *errorloc = -1; /* no error yet */ 466 for (i = 0; i < isize; i++) { 467 if (parity_buf[i] != newparity_buf[i]) { 468 *errorloc = (off_t) (pstripe << DEV_BSHIFT) * (plex->subdisks - 1) 469 + i * sizeof(int); 470 break; 471 } 472 } 473 Free(bpp[psd]->b_data); 474 relpbuf(bpp[psd], &vinum_conf.physbufs); /* give back our resources */ 475 } 476 /* release our resources */ 477 Free(bpp); 478 if (error) { 479 pbp->b_flags |= B_ERROR; 480 pbp->b_error = error; 481 } 482 return pbp; 483 } 484 485 /* 486 * Initialize a subdisk by writing zeroes to the 487 * complete address space. If verify is set, 488 * check each transfer for correctness. 489 * 490 * Each call to this function writes (and maybe 491 * checks) a single block. 492 */ 493 int 494 initsd(int sdno, int verify) 495 { 496 struct sd *sd; 497 struct plex *plex; 498 struct volume *vol; 499 struct buf *bp; 500 int error; 501 int size; /* size of init block, bytes */ 502 daddr_t plexblkno; /* lblkno in plex */ 503 int verified; /* set when we're happy with what we wrote */ 504 505 error = 0; 506 plexblkno = 0; /* to keep the compiler happy */ 507 sd = &SD[sdno]; 508 if (sd->plexno < 0) /* no plex? */ 509 return EINVAL; 510 plex = &PLEX[sd->plexno]; /* point to plex */ 511 if (plex->volno >= 0) 512 vol = &VOL[plex->volno]; 513 else 514 vol = NULL; 515 516 if (sd->init_blocksize == 0) { 517 if (plex->stripesize != 0) /* we're striped, don't init more than */ 518 sd->init_blocksize = min(DEFAULT_REVIVE_BLOCKSIZE, /* one block at a time */ 519 plex->stripesize << DEV_BSHIFT); 520 else 521 sd->init_blocksize = DEFAULT_REVIVE_BLOCKSIZE; 522 } else if (sd->init_blocksize > MAX_REVIVE_BLOCKSIZE) 523 sd->init_blocksize = MAX_REVIVE_BLOCKSIZE; 524 525 size = min(sd->init_blocksize >> DEV_BSHIFT, sd->sectors - sd->initialized) << DEV_BSHIFT; 526 527 bp = getpbuf(&vinum_conf.physbufs); /* Get a buffer */ 528 bp->b_data = Malloc(size); 529 530 verified = 0; 531 while (!verified) { /* until we're happy with it, */ 532 bp->b_bcount = size; 533 bp->b_resid = bp->b_bcount; 534 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* write it to here */ 535 bp->b_bio1.bio_driver_info = VINUM_SD(sdno); 536 bzero(bp->b_data, bp->b_bcount); 537 bp->b_cmd = BUF_CMD_WRITE; 538 sdio(&bp->b_bio1); /* perform the I/O */ 539 biowait(bp); 540 if (bp->b_flags & B_ERROR) 541 error = bp->b_error; 542 if ((error == 0) && verify) { /* check that it got there */ 543 bp->b_bcount = size; 544 bp->b_resid = bp->b_bcount; 545 bp->b_bio1.bio_offset = (off_t)sd->initialized << DEV_BSHIFT; /* read from here */ 546 bp->b_bio1.bio_driver_info = VINUM_SD(sdno); 547 bp->b_cmd = BUF_CMD_READ; /* read it back */ 548 sdio(&bp->b_bio1); 549 biowait(bp); 550 /* 551 * XXX Bug fix code. This is hopefully no 552 * longer needed (21 February 2000). 553 */ 554 if (bp->b_flags & B_ERROR) 555 error = bp->b_error; 556 else if ((*bp->b_data != 0) /* first word spammed */ 557 ||(bcmp(bp->b_data, &bp->b_data[1], bp->b_bcount - 1))) { /* or one of the others */ 558 kprintf("vinum: init error on %s, offset 0x%llx sectors\n", 559 sd->name, 560 (long long) sd->initialized); 561 verified = 0; 562 } else 563 verified = 1; 564 } else 565 verified = 1; 566 } 567 Free(bp->b_data); 568 relpbuf(bp, &vinum_conf.physbufs); 569 if (error == 0) { /* did it, */ 570 sd->initialized += size >> DEV_BSHIFT; /* moved this much further down */ 571 if (sd->initialized >= sd->sectors) { /* finished */ 572 sd->initialized = 0; 573 set_sd_state(sdno, sd_initialized, setstate_force); /* bring the sd up */ 574 log(LOG_INFO, "vinum: %s is %s\n", sd->name, sd_state(sd->state)); 575 save_config(); /* and save the updated configuration */ 576 } else /* more to go, */ 577 error = EAGAIN; /* ya'll come back, see? */ 578 } 579 return error; 580 } 581 582 /* Local Variables: */ 583 /* fill-column: 50 */ 584 /* End: */ 585