xref: /dragonfly/sys/dev/raid/vinum/vinuminterrupt.c (revision 38a690d7)
1 /* vinuminterrupt.c: bottom half of the driver */
2 
3 /*-
4  * Copyright (c) 1997, 1998, 1999
5  *	Nan Yang Computer Services Limited.  All rights reserved.
6  *
7  *  Parts copyright (c) 1997, 1998 Cybernet Corporation, NetMAX project.
8  *
9  *  Written by Greg Lehey
10  *
11  *  This software is distributed under the so-called ``Berkeley
12  *  License'':
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by Nan Yang Computer
25  *      Services Limited.
26  * 4. Neither the name of the Company nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * This software is provided ``as is'', and any express or implied
31  * warranties, including, but not limited to, the implied warranties of
32  * merchantability and fitness for a particular purpose are disclaimed.
33  * In no event shall the company or contributors be liable for any
34  * direct, indirect, incidental, special, exemplary, or consequential
35  * damages (including, but not limited to, procurement of substitute
36  * goods or services; loss of use, data, or profits; or business
37  * interruption) however caused and on any theory of liability, whether
38  * in contract, strict liability, or tort (including negligence or
39  * otherwise) arising in any way out of the use of this software, even if
40  * advised of the possibility of such damage.
41  *
42  * $Id: vinuminterrupt.c,v 1.12 2000/11/24 03:41:42 grog Exp grog $
43  * $FreeBSD: src/sys/dev/vinum/vinuminterrupt.c,v 1.25.2.3 2001/05/28 05:56:27 grog Exp $
44  * $DragonFly: src/sys/dev/raid/vinum/vinuminterrupt.c,v 1.3 2003/08/07 21:17:09 dillon Exp $
45  */
46 
47 #include "vinumhdr.h"
48 #include "request.h"
49 #include <sys/resourcevar.h>
50 
/* Forward declarations for the completion routines defined in this file. */
void complete_raid5_write(struct rqelement *rqe);
void complete_rqe(struct buf *bp);
void sdio_done(struct buf *bp);
54 
55 /*
56  * Take a completed buffer, transfer the data back if
57  * it's a read, and complete the high-level request
58  * if this is the last subrequest.
59  *
60  * The bp parameter is in fact a struct rqelement, which
61  * includes a couple of extras at the end.
62  */
63 void
64 complete_rqe(struct buf *bp)
65 {
66     struct rqelement *rqe;
67     struct request *rq;
68     struct rqgroup *rqg;
69     struct buf *ubp;					    /* user buffer */
70     struct drive *drive;
71     struct sd *sd;
72     char *gravity;					    /* for error messages */
73 
74     rqe = (struct rqelement *) bp;			    /* point to the element that completed */
75     rqg = rqe->rqg;					    /* and the request group */
76     rq = rqg->rq;					    /* and the complete request */
77     ubp = rq->bp;					    /* user buffer */
78 
79 #ifdef VINUMDEBUG
80     if (debug & DEBUG_LASTREQS)
81 	logrq(loginfo_iodone, (union rqinfou) rqe, ubp);
82 #endif
83     drive = &DRIVE[rqe->driveno];
84     drive->active--;					    /* one less outstanding I/O on this drive */
85     vinum_conf.active--;				    /* one less outstanding I/O globally */
86     if ((drive->active == (DRIVE_MAXACTIVE - 1))	    /* we were at the drive limit */
87     ||(vinum_conf.active == VINUM_MAXACTIVE))		    /* or the global limit */
88 	wakeup(&launch_requests);			    /* let another one at it */
89     if ((bp->b_flags & B_ERROR) != 0) {			    /* transfer in error */
90 	gravity = "";
91 	sd = &SD[rqe->sdno];
92 
93 	if (bp->b_error != 0)				    /* did it return a number? */
94 	    rq->error = bp->b_error;			    /* yes, put it in. */
95 	else if (rq->error == 0)			    /* no: do we have one already? */
96 	    rq->error = EIO;				    /* no: catchall "I/O error" */
97 	sd->lasterror = rq->error;
98 	if (bp->b_flags & B_READ) {
99 	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
100 		gravity = " fatal";
101 		set_sd_state(rqe->sdno, sd_crashed, setstate_force); /* subdisk is crashed */
102 	    }
103 	    log(LOG_ERR,
104 		"%s:%s read error, block %d for %ld bytes\n",
105 		gravity,
106 		sd->name,
107 		bp->b_blkno,
108 		bp->b_bcount);
109 	} else {					    /* write operation */
110 	    if ((rq->error == ENXIO) || (sd->flags & VF_RETRYERRORS) == 0) {
111 		gravity = "fatal ";
112 		set_sd_state(rqe->sdno, sd_stale, setstate_force); /* subdisk is stale */
113 	    }
114 	    log(LOG_ERR,
115 		"%s:%s write error, block %d for %ld bytes\n",
116 		gravity,
117 		sd->name,
118 		bp->b_blkno,
119 		bp->b_bcount);
120 	}
121 	log(LOG_ERR,
122 	    "%s: user buffer block %d for %ld bytes\n",
123 	    sd->name,
124 	    ubp->b_blkno,
125 	    ubp->b_bcount);
126 	if (rq->error == ENXIO) {			    /* the drive's down too */
127 	    log(LOG_ERR,
128 		"%s: fatal drive I/O error, block %d for %ld bytes\n",
129 		DRIVE[rqe->driveno].label.name,
130 		bp->b_blkno,
131 		bp->b_bcount);
132 	    DRIVE[rqe->driveno].lasterror = rq->error;
133 	    set_drive_state(rqe->driveno,		    /* take the drive down */
134 		drive_down,
135 		setstate_force);
136 	}
137     }
138     /* Now update the statistics */
139     if (bp->b_flags & B_READ) {				    /* read operation */
140 	DRIVE[rqe->driveno].reads++;
141 	DRIVE[rqe->driveno].bytes_read += bp->b_bcount;
142 	SD[rqe->sdno].reads++;
143 	SD[rqe->sdno].bytes_read += bp->b_bcount;
144 	PLEX[rqe->rqg->plexno].reads++;
145 	PLEX[rqe->rqg->plexno].bytes_read += bp->b_bcount;
146 	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
147 	    VOL[PLEX[rqe->rqg->plexno].volno].reads++;
148 	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_read += bp->b_bcount;
149 	}
150     } else {						    /* write operation */
151 	DRIVE[rqe->driveno].writes++;
152 	DRIVE[rqe->driveno].bytes_written += bp->b_bcount;
153 	SD[rqe->sdno].writes++;
154 	SD[rqe->sdno].bytes_written += bp->b_bcount;
155 	PLEX[rqe->rqg->plexno].writes++;
156 	PLEX[rqe->rqg->plexno].bytes_written += bp->b_bcount;
157 	if (PLEX[rqe->rqg->plexno].volno >= 0) {	    /* volume I/O, not plex */
158 	    VOL[PLEX[rqe->rqg->plexno].volno].writes++;
159 	    VOL[PLEX[rqe->rqg->plexno].volno].bytes_written += bp->b_bcount;
160 	}
161     }
162     if (rqg->flags & XFR_RECOVERY_READ) {		    /* recovery read, */
163 	int *sdata;					    /* source */
164 	int *data;					    /* and group data */
165 	int length;					    /* and count involved */
166 	int count;					    /* loop counter */
167 	struct rqelement *urqe = &rqg->rqe[rqg->badsdno];   /* rqe of the bad subdisk */
168 
169 	/* XOR destination is the user data */
170 	sdata = (int *) &rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT];	/* old data contents */
171 	data = (int *) &urqe->b.b_data[urqe->groupoffset << DEV_BSHIFT]; /* destination */
172 	length = urqe->grouplen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
173 
174 	for (count = 0; count < length; count++)
175 	    data[count] ^= sdata[count];
176 
177 	/*
178 	 * In a normal read, we will normally read directly
179 	 * into the user buffer.  This doesn't work if
180 	 * we're also doing a recovery, so we have to
181 	 * copy it
182 	 */
183 	if (rqe->flags & XFR_NORMAL_READ) {		    /* normal read as well, */
184 	    char *src = &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* read data is here */
185 	    char *dst;
186 
187 	    dst = (char *) ubp->b_data + (rqe->useroffset << DEV_BSHIFT); /* where to put it in user buffer */
188 	    length = rqe->datalen << DEV_BSHIFT;	    /* and count involved */
189 	    bcopy(src, dst, length);			    /* move it */
190 	}
191     } else if ((rqg->flags & (XFR_NORMAL_WRITE | XFR_DEGRADED_WRITE)) /* RAID 4/5 group write operation  */
192     &&(rqg->active == 1))				    /* and this is the last active request */
193 	complete_raid5_write(rqe);
194     /*
195      * This is the earliest place where we can be
196      * sure that the request has really finished,
197      * since complete_raid5_write can issue new
198      * requests.
199      */
200     rqg->active--;					    /* this request now finished */
201     if (rqg->active == 0) {				    /* request group finished, */
202 	rq->active--;					    /* one less */
203 	if (rqg->lock) {				    /* got a lock? */
204 	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
205 	    rqg->lock = 0;
206 	}
207     }
208     if (rq->active == 0) {				    /* request finished, */
209 #ifdef VINUMDEBUG
210 	if (debug & DEBUG_RESID) {
211 	    if (ubp->b_resid != 0)			    /* still something to transfer? */
212 		Debugger("resid");
213 	}
214 #endif
215 
216 	if (rq->error) {				    /* did we have an error? */
217 	    if (rq->isplex) {				    /* plex operation, */
218 		ubp->b_flags |= B_ERROR;		    /* yes, propagate to user */
219 		ubp->b_error = rq->error;
220 	    } else					    /* try to recover */
221 		queue_daemon_request(daemonrq_ioerror, (union daemoninfo) rq); /* let the daemon complete */
222 	} else {
223 	    ubp->b_resid = 0;				    /* completed our transfer */
224 	    if (rq->isplex == 0)			    /* volume request, */
225 		VOL[rq->volplex.volno].active--;	    /* another request finished */
226 	    biodone(ubp);				    /* top level buffer completed */
227 	    freerq(rq);					    /* return the request storage */
228 	}
229     }
230 }
231 
232 /* Free a request block and anything hanging off it */
233 void
234 freerq(struct request *rq)
235 {
236     struct rqgroup *rqg;
237     struct rqgroup *nrqg;				    /* next in chain */
238     int rqno;
239 
240     for (rqg = rq->rqg; rqg != NULL; rqg = nrqg) {	    /* through the whole request chain */
241 	if (rqg->lock)					    /* got a lock? */
242 	    unlockrange(rqg->plexno, rqg->lock);	    /* yes, free it */
243 	for (rqno = 0; rqno < rqg->count; rqno++) {
244 	    if ((rqg->rqe[rqno].flags & XFR_MALLOCED)	    /* data buffer was malloced, */
245 	    &&rqg->rqe[rqno].b.b_data)			    /* and the allocation succeeded */
246 		Free(rqg->rqe[rqno].b.b_data);		    /* free it */
247 	    if (rqg->rqe[rqno].flags & XFR_BUFLOCKED) {	    /* locked this buffer, */
248 		BUF_UNLOCK(&rqg->rqe[rqno].b);		    /* unlock it again */
249 		BUF_LOCKFREE(&rqg->rqe[rqno].b);
250 	    }
251 	}
252 	nrqg = rqg->next;				    /* note the next one */
253 	Free(rqg);					    /* and free this one */
254     }
255     Free(rq);						    /* free the request itself */
256 }
257 
258 /* I/O on subdisk completed */
259 void
260 sdio_done(struct buf *bp)
261 {
262     struct sdbuf *sbp;
263 
264     sbp = (struct sdbuf *) bp;
265     if (sbp->b.b_flags & B_ERROR) {			    /* had an error */
266 	sbp->bp->b_flags |= B_ERROR;			    /* propagate upwards */
267 	sbp->bp->b_error = sbp->b.b_error;
268     }
269 #ifdef VINUMDEBUG
270     if (debug & DEBUG_LASTREQS)
271 	logrq(loginfo_sdiodone, (union rqinfou) bp, bp);
272 #endif
273     sbp->bp->b_resid = sbp->b.b_resid;			    /* copy the resid field */
274     /* Now update the statistics */
275     if (bp->b_flags & B_READ) {				    /* read operation */
276 	DRIVE[sbp->driveno].reads++;
277 	DRIVE[sbp->driveno].bytes_read += sbp->b.b_bcount;
278 	SD[sbp->sdno].reads++;
279 	SD[sbp->sdno].bytes_read += sbp->b.b_bcount;
280     } else {						    /* write operation */
281 	DRIVE[sbp->driveno].writes++;
282 	DRIVE[sbp->driveno].bytes_written += sbp->b.b_bcount;
283 	SD[sbp->sdno].writes++;
284 	SD[sbp->sdno].bytes_written += sbp->b.b_bcount;
285     }
286     biodone(sbp->bp);					    /* complete the caller's I/O */
287     BUF_UNLOCK(&sbp->b);
288     BUF_LOCKFREE(&sbp->b);
289     Free(sbp);
290 }
291 
292 /* Start the second phase of a RAID-4 or RAID-5 group write operation. */
293 void
294 complete_raid5_write(struct rqelement *rqe)
295 {
296     int *sdata;						    /* source */
297     int *pdata;						    /* and parity block data */
298     int length;						    /* and count involved */
299     int count;						    /* loop counter */
300     int rqno;						    /* request index */
301     int rqoffset;					    /* offset of request data from parity data */
302     struct buf *ubp;					    /* user buffer header */
303     struct request *rq;					    /* pointer to our request */
304     struct rqgroup *rqg;				    /* and to the request group */
305     struct rqelement *prqe;				    /* point to the parity block */
306     struct drive *drive;				    /* drive to access */
307 
308     rqg = rqe->rqg;					    /* and to our request group */
309     rq = rqg->rq;					    /* point to our request */
310     ubp = rq->bp;					    /* user's buffer header */
311     prqe = &rqg->rqe[0];				    /* point to the parity block */
312 
313     /*
314      * If we get to this function, we have normal or
315      * degraded writes, or a combination of both.  We do
316      * the same thing in each case: we perform an
317      * exclusive or to the parity block.  The only
318      * difference is the origin of the data and the
319      * address range.
320      */
321     if (rqe->flags & XFR_DEGRADED_WRITE) {		    /* do the degraded write stuff */
322 	pdata = (int *) (&prqe->b.b_data[(prqe->groupoffset) << DEV_BSHIFT]); /* parity data pointer */
323 	bzero(pdata, prqe->grouplen << DEV_BSHIFT);	    /* start with nothing in the parity block */
324 
325 	/* Now get what data we need from each block */
326 	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
327 	    rqe = &rqg->rqe[rqno];			    /* this request */
328 	    sdata = (int *) (&rqe->b.b_data[rqe->groupoffset << DEV_BSHIFT]); /* old data */
329 	    length = rqe->grouplen << (DEV_BSHIFT - 2);	    /* and count involved */
330 
331 	    /*
332 	     * Add the data block to the parity block.  Before
333 	     * we started the request, we zeroed the parity
334 	     * block, so the result of adding all the other
335 	     * blocks and the block we want to write will be
336 	     * the correct parity block.
337 	     */
338 	    for (count = 0; count < length; count++)
339 		pdata[count] ^= sdata[count];
340 	    if ((rqe->flags & XFR_MALLOCED)		    /* the buffer was malloced, */
341 	    &&((rqg->flags & XFR_NORMAL_WRITE) == 0)) {	    /* and we have no normal write, */
342 		Free(rqe->b.b_data);			    /* free it now */
343 		rqe->flags &= ~XFR_MALLOCED;
344 	    }
345 	}
346     }
347     if (rqg->flags & XFR_NORMAL_WRITE) {		    /* do normal write stuff */
348 	/* Get what data we need from each block */
349 	for (rqno = 1; rqno < rqg->count; rqno++) {	    /* for all the data blocks */
350 	    rqe = &rqg->rqe[rqno];			    /* this request */
351 	    if ((rqe->flags & (XFR_DATA_BLOCK | XFR_BAD_SUBDISK | XFR_NORMAL_WRITE))
352 		== (XFR_DATA_BLOCK | XFR_NORMAL_WRITE)) {   /* good data block to write */
353 		sdata = (int *) &rqe->b.b_data[rqe->dataoffset << DEV_BSHIFT]; /* old data contents */
354 		rqoffset = rqe->dataoffset + rqe->sdoffset - prqe->sdoffset; /* corresponding parity block offset */
355 		pdata = (int *) (&prqe->b.b_data[rqoffset << DEV_BSHIFT]); /* parity data pointer */
356 		length = rqe->datalen * (DEV_BSIZE / sizeof(int)); /* and number of ints */
357 
358 		/*
359 		 * "remove" the old data block
360 		 * from the parity block
361 		 */
362 		if ((pdata < ((int *) prqe->b.b_data))
363 		    || (&pdata[length] > ((int *) (prqe->b.b_data + prqe->b.b_bcount)))
364 		    || (sdata < ((int *) rqe->b.b_data))
365 		    || (&sdata[length] > ((int *) (rqe->b.b_data + rqe->b.b_bcount))))
366 		    panic("complete_raid5_write: bounds overflow");
367 		for (count = 0; count < length; count++)
368 		    pdata[count] ^= sdata[count];
369 
370 		/* "add" the new data block */
371 		sdata = (int *) (&ubp->b_data[rqe->useroffset << DEV_BSHIFT]); /* new data */
372 		if ((sdata < ((int *) ubp->b_data))
373 		    || (&sdata[length] > ((int *) (ubp->b_data + ubp->b_bcount))))
374 		    panic("complete_raid5_write: bounds overflow");
375 		for (count = 0; count < length; count++)
376 		    pdata[count] ^= sdata[count];
377 
378 		/* Free the malloced buffer */
379 		if (rqe->flags & XFR_MALLOCED) {	    /* the buffer was malloced, */
380 		    Free(rqe->b.b_data);		    /* free it */
381 		    rqe->flags &= ~XFR_MALLOCED;
382 		} else
383 		    panic("complete_raid5_write: malloc conflict");
384 
385 		if ((rqe->b.b_flags & B_READ)		    /* this was a read */
386 		&&((rqe->flags & XFR_BAD_SUBDISK) == 0)) {  /* and we can write this block */
387 		    rqe->b.b_flags &= ~(B_READ | B_DONE);   /* we're writing now */
388 		    rqe->b.b_flags |= B_CALL;		    /* call us when you're done */
389 		    rqe->b.b_iodone = complete_rqe;	    /* by calling us here */
390 		    rqe->flags &= ~XFR_PARITYOP;	    /* reset flags that brought us here */
391 		    rqe->b.b_data = &ubp->b_data[rqe->useroffset << DEV_BSHIFT]; /* point to the user data */
392 		    rqe->b.b_bcount = rqe->datalen << DEV_BSHIFT; /* length to write */
393 		    rqe->b.b_bufsize = rqe->b.b_bcount;	    /* don't claim more */
394 		    rqe->b.b_resid = rqe->b.b_bcount;	    /* nothing transferred */
395 		    rqe->b.b_blkno += rqe->dataoffset;	    /* point to the correct block */
396 		    rqg->active++;			    /* another active request */
397 		    drive = &DRIVE[rqe->driveno];	    /* drive to access */
398 
399 							    /* We can't sleep here, so we just increment the counters. */
400 		    drive->active++;
401 		    if (drive->active >= drive->maxactive)
402 			drive->maxactive = drive->active;
403 		    vinum_conf.active++;
404 		    if (vinum_conf.active >= vinum_conf.maxactive)
405 			vinum_conf.maxactive = vinum_conf.active;
406 #if VINUMDEBUG
407 		    if (debug & DEBUG_ADDRESSES)
408 			log(LOG_DEBUG,
409 			    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
410 			    rqe->b.b_flags & B_READ ? "Read" : "Write",
411 			    major(rqe->b.b_dev),
412 			    minor(rqe->b.b_dev),
413 			    rqe->sdno,
414 			    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
415 			    rqe->b.b_blkno,
416 			    rqe->b.b_bcount);
417 		    if (debug & DEBUG_LASTREQS)
418 			logrq(loginfo_raid5_data, (union rqinfou) rqe, ubp);
419 #endif
420 		    BUF_STRATEGY(&rqe->b, 0);
421 		}
422 	    }
423 	}
424     }
425     /* Finally, write the parity block */
426     rqe = &rqg->rqe[0];
427     rqe->b.b_flags &= ~(B_READ | B_DONE);		    /* we're writing now */
428     rqe->b.b_flags |= B_CALL;				    /* tell us when you're done */
429     rqe->b.b_iodone = complete_rqe;			    /* by calling us here */
430     rqg->flags &= ~XFR_PARITYOP;			    /* reset flags that brought us here */
431     rqe->b.b_bcount = rqe->buflen << DEV_BSHIFT;	    /* length to write */
432     rqe->b.b_bufsize = rqe->b.b_bcount;			    /* don't claim we have more */
433     rqe->b.b_resid = rqe->b.b_bcount;			    /* nothing transferred */
434     rqg->active++;					    /* another active request */
435     drive = &DRIVE[rqe->driveno];			    /* drive to access */
436 
437     /* We can't sleep here, so we just increment the counters. */
438     drive->active++;
439     if (drive->active >= drive->maxactive)
440 	drive->maxactive = drive->active;
441     vinum_conf.active++;
442     if (vinum_conf.active >= vinum_conf.maxactive)
443 	vinum_conf.maxactive = vinum_conf.active;
444 
445 #if VINUMDEBUG
446     if (debug & DEBUG_ADDRESSES)
447 	log(LOG_DEBUG,
448 	    "  %s dev %d.%d, sd %d, offset 0x%x, devoffset 0x%x, length %ld\n",
449 	    rqe->b.b_flags & B_READ ? "Read" : "Write",
450 	    major(rqe->b.b_dev),
451 	    minor(rqe->b.b_dev),
452 	    rqe->sdno,
453 	    (u_int) (rqe->b.b_blkno - SD[rqe->sdno].driveoffset),
454 	    rqe->b.b_blkno,
455 	    rqe->b.b_bcount);
456     if (debug & DEBUG_LASTREQS)
457 	logrq(loginfo_raid5_parity, (union rqinfou) rqe, ubp);
458 #endif
459     BUF_STRATEGY(&rqe->b, 0);
460 }
461 
462 /* Local Variables: */
463 /* fill-column: 50 */
464 /* End: */
465