1 /* vinuminterrupt.c: bottom half of the driver */ 2 3 /*- 4 * Copyright (c) 1997, 1998, 1999 5 * Nan Yang Computer Services Limited. All rights reserved. 6 * 7 * Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project. 8 * 9 * Written by Greg Lehey 10 * 11 * This software is distributed under the so-called ``Berkeley 12 * License'': 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by Nan Yang Computer 25 * Services Limited. 26 * 4. Neither the name of the Company nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * This software is provided ``as is'', and any express or implied 31 * warranties, including, but not limited to, the implied warranties of 32 * merchantability and fitness for a particular purpose are disclaimed. 33 * In no event shall the company or contributors be liable for any 34 * direct, indirect, incidental, special, exemplary, or consequential 35 * damages (including, but not limited to, procurement of substitute 36 * goods or services; loss of use, data, or profits; or business 37 * interruption) however caused and on any theory of liability, whether 38 * in contract, strict liability, or tort (including negligence or 39 * otherwise) arising in any way out of the use of this software, even if 40 * advised of the possibility of such damage. 41 * 42 * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $ 43 * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $ 44 * $DragonFly: src/sys/dev/raid/vinum/vinuminterrupt.c,v 1.5 2005/09/16 04:33:14 dillon Exp $ 45 */ 46 47 #include "vinumhdr.h" 48 #include "request.h" 49 #include <sys/resourcevar.h> 50 51 void complete_raid5_write(struct rqelement *); 52 void complete_rqe(struct buf *bp); 53 void sdio_done(struct buf *bp); 54 55 /* 56 * Take a completed buffer, transfer the data back if 57 * it's a read, and complete the high-level request 58 * if this is the last subrequest. 59 * 60 * The bp parameter is in fact a struct rqelement, which 61 * includes a couple of extras at the end. 62 */ 63 void 64 complete_rqe(struct buf *bp) 65 { 66 struct rqelement *rqe; 67 struct request *rq; 68 struct rqgroup *rqg; 69 struct buf *ubp; /* user buffer */ 70 struct drive *drive; 71 struct sd *sd; 72 char *gravity; /* for error messages */ 73 74 rqe = (struct rqelement *) bp; /* point to the element that completed */ 75 rqg = rqe->rqg; /* and the request group */ 76 rq = rqg->rq; /* and the complete request */ 77 ubp = rq->bp; /* user buffer */ 78 79 #ifdef VINUMDEBUG 80 if (debug & DEBUG_LASTREQS) 81 logrq(loginfo_iodone, (union rqinfou) rqe, ubp); 82 #endif 83 drive = &DRIVE[rqe->driveno]; 84 drive->active--; /* one less outstanding I/O on this drive */ 85 vinum_conf.active--; /* one less outstanding I/O globally */ 86 if ((drive->active == (DRIVE_MAXACTIVE - 1)) /* we were at the drive limit */ 87 ||(vinum_conf.active == VINUM_MAXACTIVE)) /* or the global limit */ 88 wakeup(&launch_requests); /* let another one at it */ 89 if ((bp->b_flags & B_ERROR) != 0) { /* transfer in error */ 90 gravity = ""; 91 sd = &SD[rqe->sdno]; 92 93 if (bp->b_error != 0) /* did it return a number? */ 94 rq->error = bp->b_error; /* yes, put it in. */ 95 else if (rq->error == 0) /* no: do we have one already? */ 96 rq->error = EIO; /* no: catchall "I/O error" */ 97 sd->lasterror = rq->error; 98 if (bp->b_flags & B_READ) { 99 if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { 100 gravity = " fatal"; 101 set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */ 102 } 103 log(LOG_ERR, 104 "%s:%s read error, block %d for %ld bytes\n", 105 gravity, 106 sd->name, 107 bp->b_blkno, 108 bp->b_bcount); 109 } else { /* write operation */ 110 if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) { 111 gravity = "fatal "; 112 set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */ 113 } 114 log(LOG_ERR, 115 "%s:%s write error, block %d for %ld bytes\n", 116 gravity, 117 sd->name, 118 bp->b_blkno, 119 bp->b_bcount); 120 } 121 log(LOG_ERR, 122 "%s: user buffer block %d for %ld bytes\n", 123 sd->name, 124 ubp->b_blkno, 125 ubp->b_bcount); 126 if (rq->error == ENXIO) { /* the drive's down too */ 127 log(LOG_ERR, 128 "%s: fatal drive I/O error, block %d for %ld bytes\n", 129 DRIVE[rqe->driveno].label.name, 130 bp->b_blkno, 131 bp->b_bcount); 132 DRIVE[rqe->driveno].lasterror = rq->error; 133 set_drive_state(rqe->driveno, /* take the drive down */ 134 drive_down, 135 setstate_force); 136 } 137 } 138 /* Now update the statistics */ 139 if (bp->b_flags & B_READ) { /* read operation */ 140 DRIVE[rqe->driveno].reads++; 141 DRIVE[rqe->driveno].bytes_read += bp->b_bcount; 142 SD[rqe->sdno].reads++; 143 SD[rqe->sdno].bytes_read += bp->b_bcount; 144 PLEX[rqe->rqg->plexno].reads++; 145 PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount; 146 if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ 147 VOL[PLEX[rqe->rqg->plexno].volno].reads++; 148 VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount; 149 } 150 } else { /* write operation */ 151 DRIVE[rqe->driveno].writes++; 152 DRIVE[rqe->driveno].bytes_written += bp->b_bcount; 153 SD[rqe->sdno].writes++; 154 SD[rqe->sdno].bytes_written += bp->b_bcount; 155 PLEX[rqe->rqg->plexno].writes++; 156 PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount; 157 if (PLEX[rqe->rqg->plexno].volno >= 0) { /* volume I/O, not plex */ 158 VOL[PLEX[rqe->rqg->plexno].volno].writes++; 159 VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount; 160 } 161 } 162 if (rqg->flags & XFR_RECOVERY_READ) { /* recovery read, */ 163 int *sdata; /* source */ 164 int *data; /* and group data */ 165 int length; /* and count involved */ 166 int count; /* loop counter */ 167 struct rqelement *urqe = &rqg->rqe[rqg->badsdno]; /* rqe of the bad subdisk */ 168 169 /* XOR destination is the user data */ 170 sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]; /* old data contents */ 171 data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */ 172 length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ 173 174 for (count = 0; count < length; count++) 175 data[count] ^= sdata[count]; 176 177 /* 178 * In a normal read, we will normally read directly 179 * into the user buffer. This doesn't work if 180 * we're also doing a recovery, so we have to 181 * copy it 182 */ 183 if (rqe->flags & XFR_NORMAL_READ) { /* normal read as well, */ 184 char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */ 185 char *dst; 186 187 dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */ 188 length = rqe->datalen << DEV_BSHIFT; /* and count involved */ 189 bcopy(src, dst, length); /* move it */ 190 } 191 } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation */ 192 &&(rqg->active == 1)) /* and this is the last active request */ 193 complete_raid5_write(rqe); 194 /* 195 * This is the earliest place where we can be 196 * sure that the request has really finished, 197 * since complete_raid5_write can issue new 198 * requests. 199 */ 200 rqg->active--; /* this request now finished */ 201 if (rqg->active == 0) { /* request group finished, */ 202 rq->active--; /* one less */ 203 if (rqg->lock) { /* got a lock? */ 204 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ 205 rqg->lock = 0; 206 } 207 } 208 if (rq->active == 0) { /* request finished, */ 209 #ifdef VINUMDEBUG 210 if (debug & DEBUG_RESID) { 211 if (ubp->b_resid != 0) /* still something to transfer? */ 212 Debugger("resid"); 213 } 214 #endif 215 216 if (rq->error) { /* did we have an error? */ 217 if (rq->isplex) { /* plex operation, */ 218 ubp->b_flags |= B_ERROR; /* yes, propagate to user */ 219 ubp->b_error = rq->error; 220 } else /* try to recover */ 221 queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */ 222 } else { 223 ubp->b_resid = 0; /* completed our transfer */ 224 if (rq->isplex == 0) /* volume request, */ 225 VOL[rq->volplex.volno].active--; /* another request finished */ 226 biodone(ubp); /* top level buffer completed */ 227 freerq(rq); /* return the request storage */ 228 } 229 } 230 } 231 232 /* Free a request block and anything hanging off it */ 233 void 234 freerq(struct request *rq) 235 { 236 struct rqgroup *rqg; 237 struct rqgroup *nrqg; /* next in chain */ 238 int rqno; 239 240 for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) { /* through the whole request chain */ 241 if (rqg->lock) /* got a lock? */ 242 unlockrange(rqg->plexno, rqg->lock); /* yes, free it */ 243 for (rqno = 0; rqno < rqg->count; rqno++) { 244 if ((rqg->rqe[rqno].flags & XFR_MALLOCED) /* data buffer was malloced, */ 245 &&rqg->rqe[rqno].b.b_data) /* and the allocation succeeded */ 246 Free(rqg->rqe[rqno].b.b_data); /* free it */ 247 if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) { /* locked this buffer, */ 248 BUF_UNLOCK(&rqg->rqe[rqno].b); /* unlock it again */ 249 BUF_LOCKFREE(&rqg->rqe[rqno].b); 250 } 251 } 252 nrqg = rqg->next; /* note the next one */ 253 Free(rqg); /* and free this one */ 254 } 255 Free(rq); /* free the request itself */ 256 } 257 258 /* I/O on subdisk completed */ 259 void 260 sdio_done(struct buf *bp) 261 { 262 struct sdbuf *sbp; 263 264 sbp = (struct sdbuf *) bp; 265 if (sbp->b.b_flags & B_ERROR) { /* had an error */ 266 sbp->bp->b_flags |= B_ERROR; /* propagate upwards */ 267 sbp->bp->b_error = sbp->b.b_error; 268 } 269 #ifdef VINUMDEBUG 270 if (debug & DEBUG_LASTREQS) 271 logrq(loginfo_sdiodone, (union rqinfou) bp, bp); 272 #endif 273 sbp->bp->b_resid = sbp->b.b_resid; /* copy the resid field */ 274 /* Now update the statistics */ 275 if (bp->b_flags & B_READ) { /* read operation */ 276 DRIVE[sbp->driveno].reads++; 277 DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount; 278 SD[sbp->sdno].reads++; 279 SD[sbp->sdno].bytes_read += sbp->b.b_bcount; 280 } else { /* write operation */ 281 DRIVE[sbp->driveno].writes++; 282 DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount; 283 SD[sbp->sdno].writes++; 284 SD[sbp->sdno].bytes_written += sbp->b.b_bcount; 285 } 286 biodone(sbp->bp); /* complete the caller's I/O */ 287 BUF_UNLOCK(&sbp->b); 288 BUF_LOCKFREE(&sbp->b); 289 Free(sbp); 290 } 291 292 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */ 293 void 294 complete_raid5_write(struct rqelement *rqe) 295 { 296 int *sdata; /* source */ 297 int *pdata; /* and parity block data */ 298 int length; /* and count involved */ 299 int count; /* loop counter */ 300 int rqno; /* request index */ 301 int rqoffset; /* offset of request data from parity data */ 302 struct buf *ubp; /* user buffer header */ 303 struct request *rq; /* pointer to our request */ 304 struct rqgroup *rqg; /* and to the request group */ 305 struct rqelement *prqe; /* point to the parity block */ 306 struct drive *drive; /* drive to access */ 307 308 rqg = rqe->rqg; /* and to our request group */ 309 rq = rqg->rq; /* point to our request */ 310 ubp = rq->bp; /* user's buffer header */ 311 prqe = &rqg->rqe[0]; /* point to the parity block */ 312 313 /* 314 * If we get to this function, we have normal or 315 * degraded writes, or a combination of both. We do 316 * the same thing in each case: we perform an 317 * exclusive or to the parity block. The only 318 * difference is the origin of the data and the 319 * address range. 320 */ 321 if (rqe->flags & XFR_DEGRADED_WRITE) { /* do the degraded write stuff */ 322 pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */ 323 bzero(pdata, prqe->grouplen << DEV_BSHIFT); /* start with nothing in the parity block */ 324 325 /* Now get what data we need from each block */ 326 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ 327 rqe = &rqg->rqe[rqno]; /* this request */ 328 sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */ 329 length = rqe->grouplen << (DEV_BSHIFT - 2); /* and count involved */ 330 331 /* 332 * Add the data block to the parity block. Before 333 * we started the request, we zeroed the parity 334 * block, so the result of adding all the other 335 * blocks and the block we want to write will be 336 * the correct parity block. 337 */ 338 for (count = 0; count < length; count++) 339 pdata[count] ^= sdata[count]; 340 if ((rqe->flags & XFR_MALLOCED) /* the buffer was malloced, */ 341 &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) { /* and we have no normal write, */ 342 Free(rqe->b.b_data); /* free it now */ 343 rqe->flags &= ~XFR_MALLOCED; 344 } 345 } 346 } 347 if (rqg->flags & XFR_NORMAL_WRITE) { /* do normal write stuff */ 348 /* Get what data we need from each block */ 349 for (rqno = 1; rqno < rqg->count; rqno++) { /* for all the data blocks */ 350 rqe = &rqg->rqe[rqno]; /* this request */ 351 if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE)) 352 == (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) { /* good data block to write */ 353 sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */ 354 rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */ 355 pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */ 356 length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */ 357 358 /* 359 * "remove" the old data block 360 * from the parity block 361 */ 362 if ((pdata < ((int *) prqe->b.b_data)) 363 || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount))) 364 || (sdata < ((int *) rqe->b.b_data)) 365 || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount)))) 366 panic("complete_raid5_write: bounds overflow"); 367 for (count = 0; count < length; count++) 368 pdata[count] ^= sdata[count]; 369 370 /* "add" the new data block */ 371 sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */ 372 if ((sdata < ((int *) ubp->b_data)) 373 || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount)))) 374 panic("complete_raid5_write: bounds overflow"); 375 for (count = 0; count < length; count++) 376 pdata[count] ^= sdata[count]; 377 378 /* Free the malloced buffer */ 379 if (rqe->flags & XFR_MALLOCED) { /* the buffer was malloced, */ 380 Free(rqe->b.b_data); /* free it */ 381 rqe->flags &= ~XFR_MALLOCED; 382 } else 383 panic("complete_raid5_write: malloc conflict"); 384 385 if ((rqe->b.b_flags & B_READ) /* this was a read */ 386 &&((rqe->flags & XFR_BAD_SUBDISK) == 0)) { /* and we can write this block */ 387 rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ 388 rqe->b.b_iodone = complete_rqe; /* by calling us here */ 389 rqe->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ 390 rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */ 391 rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */ 392 rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim more */ 393 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ 394 rqe->b.b_blkno += rqe->dataoffset; /* point to the correct block */ 395 rqe->b.b_dev = DRIVE[rqe->driveno].dev; 396 rqg->active++; /* another active request */ 397 drive = &DRIVE[rqe->driveno]; /* drive to access */ 398 399 /* We can't sleep here, so we just increment the counters. */ 400 drive->active++; 401 if (drive->active >= drive->maxactive) 402 drive->maxactive = drive->active; 403 vinum_conf.active++; 404 if (vinum_conf.active >= vinum_conf.maxactive) 405 vinum_conf.maxactive = vinum_conf.active; 406 #if VINUMDEBUG 407 if (debug & DEBUG_ADDRESSES) 408 log(LOG_DEBUG, 409 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", 410 rqe->b.b_flags & B_READ ? "Read" : "Write", 411 major(rqe->b.b_dev), 412 minor(rqe->b.b_dev), 413 rqe->sdno, 414 (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), 415 rqe->b.b_blkno, 416 rqe->b.b_bcount); 417 if (debug & DEBUG_LASTREQS) 418 logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp); 419 #endif 420 BUF_STRATEGY(&rqe->b, 0); 421 } 422 } 423 } 424 } 425 /* Finally, write the parity block */ 426 rqe = &rqg->rqe[0]; 427 rqe->b.b_flags &= ~(B_READ | B_DONE); /* we're writing now */ 428 rqe->b.b_iodone = complete_rqe; /* by calling us here */ 429 rqg->flags &= ~XFR_PARITYOP; /* reset flags that brought us here */ 430 rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT; /* length to write */ 431 rqe->b.b_bufsize = rqe->b.b_bcount; /* don't claim we have more */ 432 rqe->b.b_resid = rqe->b.b_bcount; /* nothing transferred */ 433 rqe->b.b_dev = DRIVE[rqe->driveno].dev; 434 rqg->active++; /* another active request */ 435 drive = &DRIVE[rqe->driveno]; /* drive to access */ 436 437 /* We can't sleep here, so we just increment the counters. */ 438 drive->active++; 439 if (drive->active >= drive->maxactive) 440 drive->maxactive = drive->active; 441 vinum_conf.active++; 442 if (vinum_conf.active >= vinum_conf.maxactive) 443 vinum_conf.maxactive = vinum_conf.active; 444 445 #if VINUMDEBUG 446 if (debug & DEBUG_ADDRESSES) 447 log(LOG_DEBUG, 448 " %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n", 449 rqe->b.b_flags & B_READ ? "Read" : "Write", 450 major(rqe->b.b_dev), 451 minor(rqe->b.b_dev), 452 rqe->sdno, 453 (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset), 454 rqe->b.b_blkno, 455 rqe->b.b_bcount); 456 if (debug & DEBUG_LASTREQS) 457 logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp); 458 #endif 459 BUF_STRATEGY(&rqe->b, 0); 460 } 461 462 /* Local Variables: */ 463 /* fill-column: 50 */ 464 /* End: */ 465