1 /* vinuminterrupt.c: bottom half of the driver */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 1999 5 * Nan Yang Computer Services Limited. All rights reserved. 6 * 7 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 8 * 9 * Written by Greg Lehey 10 * 11 * This software is distributed under the so-called ``Berkeley 12 * License'': 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by Nan Yang Computer 25 * Services Limited. 26 * 4. Neither the name of the Company nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * This software is provided ``as is'', and any express or implied 31 * warranties, including, but not limited to, the implied warranties of 32 * merchantability and fitness for a particular purpose are disclaimed. 33 * In no event shall the company or contributors be liable for any 34 * direct, indirect, incidental, special, exemplary, or consequential 35 * damages (including, but not limited to, procurement of substitute 36 * goods or services; loss of use, data, or profits; or business 37 * interruption) however caused and on any theory of liability, whether 38 * in contract, strict liability, or tort (including negligence or 39 * otherwise) arising in any way out of the use of this software, even if 40 * advised of the possibility of such damage. 41 * 42 * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $ 43 * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $ 44 * $DragonFly: src/sys/dev/raid/vinum/vinuminterrupt.c,v 1.13 2007/08/01 11:46:46 swildner Exp $ 45 */ 46 47 #include "vinumhdr.h" 48 #include "request.h" 49 #include <sys/resourcevar.h> 50 51 void complete_raid5_write(struct rqelement *); 52 void complete_rqe(struct bio *bio); 53 void sdio_done(struct bio *bio); 54 55 /* 56 * Take a completed buffer, transfer the data back if 57 * it's a read, and complete the high-level request 58 * if this is the last subrequest. 59 * 60 * The bp parameter is in fact a struct rqelement, which 61 * includes a couple of extras at the end. 62 */ 63 void 64 complete_rqe(struct bio *bio) 65 { 66 struct buf *bp = bio->bio_buf; 67 struct rqelement *rqe; 68 struct request *rq; 69 struct rqgroup *rqg; 70 struct bio *ubio; /* user buffer */ 71 struct drive *drive; 72 struct sd *sd; 73 char *gravity; /* for error messages */ 74 75 get_mplock(); 76 77 rqe = (struct rqelement *) bp; /* point to the element that completed */ 78 rqg = rqe->rqg; /* and the request group */ 79 rq = rqg->rq; /* and the complete request */ 80 ubio = rq->bio; /* user buffer */ 81 82 #ifdef VINUMDEBUG 83 if (debug & DEBUG_LASTREQS) 84 logrq(loginfo_iodone, (union rqinfou) rqe, ubio); 85 #endif 86 drive = &DRIVE[rqe->driveno]; 87 drive->active--; /* one less outstanding I/O on this drive */ 88 vinum_conf.active--; /* one less outstanding I/O globally */ 89 if ((drive->active == (DRIVE_MAXACTIVE - 1)) /* we were at the drive limit */ 90 ||(vinum_conf.active == VINUM_MAXACTIVE)) /* or the global limit */ 91 wakeup(&launch_requests); /* let another one at it */ 92 if ((bp->b_flags & B_ERROR) != 0) { /* transfer in error */ 93 gravity = ""; 94 sd = &SD[rqe->sdno]; 95 96 if (bp->b_error != 0) /* did it return a number? */ 97 rq->error = bp->b_error; /* yes, put it in. */ 98 else if (rq->error == 0) /* no: do we have one already? */ 99 rq->error = EIO; /* no: catchall "I/O error" */ 100 sd->lasterror = rq->error; 101 if (bp->b_cmd == BUF_CMD_READ) { 102 if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { 103 gravity = " fatal"; 104 set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */ 105 } 106 log(LOG_ERR, 107 "%s:%s read error, offset %lld for %d bytes\n", 108 gravity, 109 sd->name, 110 (long long)bio->bio_offset, 111 bp->b_bcount); 112 } else { /* write operation */ 113 if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { 114 gravity = "fatal "; 115 set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */ 116 } 117 log(LOG_ERR, 118 "%s:%s write error, offset %lld for %d bytes\n", 119 gravity, 120 sd->name, 121 (long long)bio->bio_offset, 122 bp->b_bcount); 123 } 124 log(LOG_ERR, 125 "%s: user buffer offset %lld for %d bytes\n", 126 sd->name, 127 (long long)ubio->bio_offset, 128 ubio->bio_buf->b_bcount); 129 if (rq->error == ENXIO) { /* the drive's down too */ 130 log(LOG_ERR, 131 "%s: fatal drive I/O error, offset %lld for %d bytes\n", 132 DRIVE[rqe->driveno].label.name, 133 (long long)bio->bio_offset, 134 bp->b_bcount); 135 DRIVE[rqe->driveno].lasterror = rq->error; 136 set_drive_state(rqe->driveno, /* take the drive down */ 137 drive_down, 138 setstate_force); 139 } 140 } 141 /* Now update the statistics */ 142 if (bp->b_cmd == BUF_CMD_READ) { /* read operation */ 143 DRIVE[rqe->driveno].reads++; 144 DRIVE[rqe->driveno].bytes_read += bp->b_bcount; 145 SD[rqe->sdno].reads++; 146 SD[rqe->sdno].bytes_read += bp->b_bcount; 147 PLEX[rqe->rqg->plexno].reads++; 148 PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount; 149 if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ 150 VOL[PLEX[rqe->rqg->plexno].volno].reads++; 151 VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount; 152 } 153 } else { /* write operation */ 154 DRIVE[rqe->driveno].writes++; 155 DRIVE[rqe->driveno].bytes_written += bp->b_bcount; 156 SD[rqe->sdno].writes++; 157 SD[rqe->sdno].bytes_written += bp->b_bcount; 158 PLEX[rqe->rqg->plexno].writes++; 159 PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; 160 if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ 161 VOL[PLEX[rqe->rqg->plexno].volno].writes++; 162 VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount; 163 } 164 } 165 if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ 166 int *sdata; /* source */ 167 int *data; /* and group data */ 168 int length; /* and count involved */ 169 int count; /* loop counter */ 170 struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ 171 172 /* XOR destination is the user data */ 173 sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ 174 data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ 175 length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ 176 177 for (count = 0; count < length; count++) 178 data[count] ^= sdata[count]; 179 180 /* 181 * In a normal read, we will normally read directly 182 * into the user buffer. This doesn't work if 183 * we're also doing a recovery, so we have to 184 * copy it 185 */ 186 if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ 187 char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ 188 char *dst; 189 190 dst = (char *) ubio->bio_buf->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ 191 length = rqe->datalen << DEV_BSHIFT; /* and count involved */ 192 bcopy(src, dst, length); /* move it */ 193 } 194 } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */ 195 &&(rqg->active == 1)) /* and this is the last active request */ 196 complete_raid5_write(rqe); 197 /* 198 * This is the earliest place where we can be 199 * sure that the request has really finished, 200 * since complete_raid5_write can issue new 201 * requests. 202 */ 203 rqg->active--; /* this request now finished */ 204 if (rqg->active == 0) { /* request group finished, */ 205 rq->active--; /* one less */ 206 if (rqg->lock) { /* got a lock? */ 207 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ 208 rqg->lock = 0; 209 } 210 } 211 if (rq->active == 0) { /* request finished, */ 212 #ifdef VINUMDEBUG 213 if (debug & DEBUG_RESID) { 214 if (ubio->bio_buf->b_resid != 0) /* still something to transfer? */ 215 Debugger("resid"); 216 } 217 #endif 218 219 if (rq->error) { /* did we have an error? */ 220 if (rq->isplex) { /* plex operation, */ 221 ubio->bio_buf->b_flags |= B_ERROR; /* yes, propagate to user */ 222 ubio->bio_buf->b_error = rq->error; 223 } else /* try to recover */ 224 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */ 225 } else { 226 ubio->bio_buf->b_resid = 0; /* completed our transfer */ 227 if (rq->isplex == 0) /* volume request, */ 228 VOL[rq->volplex.volno].active--; /* another request finished */ 229 biodone(ubio); /* top level buffer completed */ 230 freerq(rq); /* return the request storage */ 231 } 232 } 233 rel_mplock(); 234 } 235 236 /* Free a request block and anything hanging off it */ 237 void 238 freerq(struct request *rq) 239 { 240 struct rqgroup *rqg; 241 struct rqgroup *nrqg; /* next in chain */ 242 int rqno; 243 244 for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */ 245 if (rqg->lock) /* got a lock? */ 246 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ 247 for (rqno = 0; rqno < rqg->count; rqno++) { 248 if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */ 249 &&rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */ 250 Free(rqg->rqe[rqno].b.b_data); /* free it */ 251 if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) { /* locked this buffer, */ 252 BUF_UNLOCK(&rqg->rqe[rqno].b); /* unlock it again */ 253 uninitbufbio(&rqg->rqe[rqno].b); 254 } 255 } 256 nrqg = rqg->next; /* note the next one */ 257 Free(rqg); /* and free this one */ 258 } 259 Free(rq); /* free the request itself */ 260 } 261 262 /* I/O on subdisk completed */ 263 void 264 sdio_done(struct bio *bio) 265 { 266 struct sdbuf *sbp; 267 268 get_mplock(); 269 270 sbp = (struct sdbuf *) bio->bio_buf; 271 if (sbp->b.b_flags & B_ERROR) { /* had an error */ 272 sbp->bio->bio_buf->b_flags |= B_ERROR; /* propagate upwards */ 273 sbp->bio->bio_buf->b_error = sbp->b.b_error; 274 } 275 #ifdef VINUMDEBUG 276 if (debug & DEBUG_LASTREQS) 277 logrq(loginfo_sdiodone, (union rqinfou)bio, bio); 278 #endif 279 sbp->bio->bio_buf->b_resid = sbp->b.b_resid; /* copy the resid field */ 280 /* Now update the statistics */ 281 if (sbp->b.b_cmd == BUF_CMD_READ) { /* read operation */ 282 DRIVE[sbp->driveno].reads++; 283 DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount; 284 SD[sbp->sdno].reads++; 285 SD[sbp->sdno].bytes_read += sbp->b.b_bcount; 286 } else { /* write operation */ 287 DRIVE[sbp->driveno].writes++; 288 DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount; 289 SD[sbp->sdno].writes++; 290 SD[sbp->sdno].bytes_written += sbp->b.b_bcount; 291 } 292 biodone_sync(bio); 293 biodone(sbp->bio); /* complete the caller's I/O */ 294 BUF_UNLOCK(&sbp->b); 295 uninitbufbio(&sbp->b); 296 Free(sbp); 297 rel_mplock(); 298 } 299 300 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */ 301 void 302 complete_raid5_write(struct rqelement *rqe) 303 { 304 int *sdata; /* source */ 305 int *pdata; /* and parity block data */ 306 int length; /* and count involved */ 307 int count; /* loop counter */ 308 int rqno; /* request index */ 309 int rqoffset; /* offset of request data from parity data */ 310 struct bio *ubio; /* user buffer header */ 311 struct request *rq; /* pointer to our request */ 312 struct rqgroup *rqg; /* and to the request group */ 313 struct rqelement *prqe; /* point to the parity block */ 314 struct drive *drive; /* drive to access */ 315 rqg = rqe->rqg; /* and to our request group */ 316 rq = rqg->rq; /* point to our request */ 317 ubio = rq->bio; /* user's buffer header */ 318 prqe = &rqg->rqe[0]; /* point to the parity block */ 319 320 /* 321 * If we get to this function, we have normal or 322 * degraded writes, or a combination of both. We do 323 * the same thing in each case: we perform an 324 * exclusive or to the parity block. The only 325 * difference is the origin of the data and the 326 * address range. 327 */ 328 if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ 329 pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ 330 bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ 331 332 /* Now get what data we need from each block */ 333 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ 334 rqe = &rqg->rqe[rqno]; /* this request */ 335 sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ 336 length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ 337 338 /* 339 * Add the data block to the parity block. Before 340 * we started the request, we zeroed the parity 341 * block, so the result of adding all the other 342 * blocks and the block we want to write will be 343 * the correct parity block. 344 */ 345 for (count = 0; count < length; count++) 346 pdata[count] ^= sdata[count]; 347 if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ 348 &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ 349 Free(rqe->b.b_data); /* free it now */ 350 rqe->flags &= ~XFR_MALLOCED; 351 } 352 } 353 } 354 if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ 355 /* Get what data we need from each block */ 356 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ 357 rqe = &rqg->rqe[rqno]; /* this request */ 358 if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) 359 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ 360 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ 361 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ 362 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ 363 length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ 364 365 /* 366 * "remove" the old data block 367 * from the parity block 368 */ 369 if ((pdata < ((int *) prqe->b.b_data)) 370 || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) 371 || (sdata < ((int *) rqe->b.b_data)) 372 || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) 373 panic("complete_raid5_write: bounds overflow"); 374 for (count = 0; count < length; count++) 375 pdata[count] ^= sdata[count]; 376 377 /* "add" the new data block */ 378 sdata = (int *) (&ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ 379 if ((sdata < ((int *) ubio->bio_buf->b_data)) 380 || (&sdata[length] > ((int *) (ubio->bio_buf->b_data + ubio->bio_buf->b_bcount)))) 381 panic("complete_raid5_write: bounds overflow"); 382 for (count = 0; count < length; count++) 383 pdata[count] ^= sdata[count]; 384 385 /* Free the malloced buffer */ 386 if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ 387 Free(rqe->b.b_data); /* free it */ 388 rqe->flags &= ~XFR_MALLOCED; 389 } else 390 panic("complete_raid5_write: malloc conflict"); 391 392 if ((rqe->b.b_cmd == BUF_CMD_READ) /* this was a read */ 393 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ 394 rqe->b.b_cmd = BUF_CMD_WRITE; /* we're writing now */ 395 rqe->b.b_bio1.bio_done = complete_rqe; /* by calling us here */ 396 rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ 397 rqe->b.b_data = &ubio->bio_buf->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ 398 rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ 399 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ 400 rqe->b.b_bio1.bio_offset += (off_t)rqe->dataoffset << DEV_BSHIFT; /* point to the correct block */ 401 drive = &DRIVE[rqe->driveno]; /* drive to access */ 402 rqe->b.b_bio1.bio_driver_info = drive->dev; 403 rqg->active++; /* another active request */ 404 405 /* We can't sleep here, so we just increment the counters. */ 406 drive->active++; 407 if (drive->active >= drive->maxactive) 408 drive->maxactive = drive->active; 409 vinum_conf.active++; 410 if (vinum_conf.active >= vinum_conf.maxactive) 411 vinum_conf.maxactive = vinum_conf.active; 412 #if VINUMDEBUG 413 if (debug & DEBUG_ADDRESSES) 414 log(LOG_DEBUG, 415 " %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n", 416 (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write", 417 drive->devicename, 418 rqe->sdno, 419 rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT), 420 rqe->b.b_bio1.bio_offset, 421 rqe->b.b_bcount); 422 if (debug & DEBUG_LASTREQS) 423 logrq(loginfo_raid5_data, (union rqinfou) rqe, ubio); 424 #endif 425 vn_strategy(drive->vp, &rqe->b.b_bio1); 426 } 427 } 428 } 429 } 430 /* Finally, write the parity block */ 431 rqe = &rqg->rqe[0]; 432 rqe->b.b_cmd = BUF_CMD_WRITE; /* we're writing now */ 433 rqe->b.b_bio1.bio_done = complete_rqe; /* by calling us here */ 434 rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ 435 rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ 436 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ 437 drive = &DRIVE[rqe->driveno]; /* drive to access */ 438 rqe->b.b_bio1.bio_driver_info = drive->dev; 439 rqg->active++; /* another active request */ 440 441 /* We can't sleep here, so we just increment the counters. */ 442 drive->active++; 443 if (drive->active >= drive->maxactive) 444 drive->maxactive = drive->active; 445 vinum_conf.active++; 446 if (vinum_conf.active >= vinum_conf.maxactive) 447 vinum_conf.maxactive = vinum_conf.active; 448 449 #if VINUMDEBUG 450 if (debug & DEBUG_ADDRESSES) 451 log(LOG_DEBUG, 452 " %s dev %s, sd %d, offset 0x%llx, devoffset 0x%llx, length %d\n", 453 (rqe->b.b_cmd == BUF_CMD_READ) ? "Read" : "Write", 454 drive->devicename, 455 rqe->sdno, 456 rqe->b.b_bio1.bio_offset - ((off_t)SD[rqe->sdno].driveoffset << DEV_BSHIFT), 457 rqe->b.b_bio1.bio_offset, 458 rqe->b.b_bcount); 459 if (debug & DEBUG_LASTREQS) 460 logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubio); 461 #endif 462 vn_strategy(drive->vp, &rqe->b.b_bio1); 463 } 464 465 /* Local Variables: */ 466 /* fill-column: 50 */ 467 /* End: */ 468